Diffstat (limited to 'src')
-rw-r--r--src/.gitignore3
-rw-r--r--src/Makefile64
-rw-r--r--src/Makefile.global.in100
-rw-r--r--src/Makefile.shlib15
-rw-r--r--src/backend/.gitignore2
-rw-r--r--src/backend/Makefile37
-rw-r--r--src/backend/access/Makefile2
-rw-r--r--src/backend/access/common/Makefile2
-rw-r--r--src/backend/access/common/heaptuple.c6
-rw-r--r--src/backend/access/common/indextuple.c6
-rw-r--r--src/backend/access/common/printtup.c4
-rw-r--r--src/backend/access/common/reloptions.c11
-rw-r--r--src/backend/access/common/scankey.c18
-rw-r--r--src/backend/access/common/tupconvert.c4
-rw-r--r--src/backend/access/common/tupdesc.c50
-rw-r--r--src/backend/access/gin/Makefile2
-rw-r--r--src/backend/access/gin/README204
-rw-r--r--src/backend/access/gin/ginarrayproc.c189
-rw-r--r--src/backend/access/gin/ginbtree.c59
-rw-r--r--src/backend/access/gin/ginbulk.c220
-rw-r--r--src/backend/access/gin/gindatapage.c94
-rw-r--r--src/backend/access/gin/ginentrypage.c363
-rw-r--r--src/backend/access/gin/ginfast.c193
-rw-r--r--src/backend/access/gin/ginget.c1186
-rw-r--r--src/backend/access/gin/gininsert.c356
-rw-r--r--src/backend/access/gin/ginscan.c479
-rw-r--r--src/backend/access/gin/ginutil.c412
-rw-r--r--src/backend/access/gin/ginvacuum.c57
-rw-r--r--src/backend/access/gin/ginxlog.c337
-rw-r--r--src/backend/access/gist/Makefile2
-rw-r--r--src/backend/access/gist/README299
-rw-r--r--src/backend/access/gist/gist.c1063
-rw-r--r--src/backend/access/gist/gistget.c838
-rw-r--r--src/backend/access/gist/gistproc.c113
-rw-r--r--src/backend/access/gist/gistscan.c222
-rw-r--r--src/backend/access/gist/gistsplit.c85
-rw-r--r--src/backend/access/gist/gistutil.c108
-rw-r--r--src/backend/access/gist/gistvacuum.c70
-rw-r--r--src/backend/access/gist/gistxlog.c804
-rw-r--r--src/backend/access/hash/Makefile2
-rw-r--r--src/backend/access/hash/README2
-rw-r--r--src/backend/access/hash/hash.c64
-rw-r--r--src/backend/access/hash/hashfunc.c4
-rw-r--r--src/backend/access/hash/hashinsert.c4
-rw-r--r--src/backend/access/hash/hashovfl.c13
-rw-r--r--src/backend/access/hash/hashpage.c26
-rw-r--r--src/backend/access/hash/hashscan.c4
-rw-r--r--src/backend/access/hash/hashsearch.c4
-rw-r--r--src/backend/access/hash/hashsort.c4
-rw-r--r--src/backend/access/hash/hashutil.c15
-rw-r--r--src/backend/access/heap/Makefile2
-rw-r--r--src/backend/access/heap/README.HOT8
-rw-r--r--src/backend/access/heap/heapam.c172
-rw-r--r--src/backend/access/heap/hio.c6
-rw-r--r--src/backend/access/heap/pruneheap.c7
-rw-r--r--src/backend/access/heap/rewriteheap.c25
-rw-r--r--src/backend/access/heap/syncscan.c4
-rw-r--r--src/backend/access/heap/tuptoaster.c4
-rw-r--r--src/backend/access/heap/visibilitymap.c9
-rw-r--r--src/backend/access/index/Makefile2
-rw-r--r--src/backend/access/index/genam.c37
-rw-r--r--src/backend/access/index/indexam.c80
-rw-r--r--src/backend/access/nbtree/Makefile2
-rw-r--r--src/backend/access/nbtree/README4
-rw-r--r--src/backend/access/nbtree/nbtcompare.c4
-rw-r--r--src/backend/access/nbtree/nbtinsert.c158
-rw-r--r--src/backend/access/nbtree/nbtpage.c76
-rw-r--r--src/backend/access/nbtree/nbtree.c75
-rw-r--r--src/backend/access/nbtree/nbtsearch.c26
-rw-r--r--src/backend/access/nbtree/nbtsort.c30
-rw-r--r--src/backend/access/nbtree/nbtutils.c36
-rw-r--r--src/backend/access/nbtree/nbtxlog.c23
-rw-r--r--src/backend/access/transam/Makefile2
-rw-r--r--src/backend/access/transam/README81
-rw-r--r--src/backend/access/transam/clog.c12
-rw-r--r--src/backend/access/transam/multixact.c12
-rw-r--r--src/backend/access/transam/recovery.conf.sample57
-rw-r--r--src/backend/access/transam/rmgr.c4
-rw-r--r--src/backend/access/transam/slru.c28
-rw-r--r--src/backend/access/transam/subtrans.c6
-rw-r--r--src/backend/access/transam/transam.c4
-rw-r--r--src/backend/access/transam/twophase.c64
-rw-r--r--src/backend/access/transam/twophase_rmgr.c9
-rw-r--r--src/backend/access/transam/varsup.c22
-rw-r--r--src/backend/access/transam/xact.c222
-rw-r--r--src/backend/access/transam/xlog.c1589
-rw-r--r--src/backend/access/transam/xlogutils.c20
-rw-r--r--src/backend/bootstrap/.cvsignore2
-rw-r--r--src/backend/bootstrap/.gitignore2
-rw-r--r--src/backend/bootstrap/Makefile4
-rw-r--r--src/backend/bootstrap/bootparse.y7
-rw-r--r--src/backend/bootstrap/bootscanner.l4
-rw-r--r--src/backend/bootstrap/bootstrap.c59
-rw-r--r--src/backend/catalog/.cvsignore4
-rw-r--r--src/backend/catalog/.gitignore4
-rw-r--r--src/backend/catalog/Catalog.pm4
-rw-r--r--src/backend/catalog/Makefile14
-rw-r--r--src/backend/catalog/README2
-rw-r--r--src/backend/catalog/aclchk.c286
-rw-r--r--src/backend/catalog/catalog.c152
-rw-r--r--src/backend/catalog/dependency.c283
-rw-r--r--src/backend/catalog/genbki.pl7
-rw-r--r--src/backend/catalog/heap.c235
-rw-r--r--src/backend/catalog/index.c808
-rw-r--r--src/backend/catalog/indexing.c4
-rw-r--r--src/backend/catalog/information_schema.sql200
-rw-r--r--src/backend/catalog/namespace.c432
-rw-r--r--src/backend/catalog/objectaddress.c887
-rw-r--r--src/backend/catalog/pg_aggregate.c4
-rw-r--r--src/backend/catalog/pg_collation.c180
-rw-r--r--src/backend/catalog/pg_constraint.c128
-rw-r--r--src/backend/catalog/pg_conversion.c12
-rw-r--r--src/backend/catalog/pg_db_role_setting.c4
-rw-r--r--src/backend/catalog/pg_depend.c164
-rw-r--r--src/backend/catalog/pg_enum.c418
-rw-r--r--src/backend/catalog/pg_inherits.c4
-rw-r--r--src/backend/catalog/pg_largeobject.c4
-rw-r--r--src/backend/catalog/pg_namespace.c19
-rw-r--r--src/backend/catalog/pg_operator.c18
-rw-r--r--src/backend/catalog/pg_proc.c53
-rw-r--r--src/backend/catalog/pg_shdepend.c14
-rw-r--r--src/backend/catalog/pg_type.c47
-rw-r--r--src/backend/catalog/sql_features.txt26
-rw-r--r--src/backend/catalog/storage.c126
-rw-r--r--src/backend/catalog/system_views.sql582
-rw-r--r--src/backend/catalog/toasting.c63
-rw-r--r--src/backend/commands/Makefile9
-rw-r--r--src/backend/commands/aggregatecmds.c10
-rw-r--r--src/backend/commands/alter.c296
-rw-r--r--src/backend/commands/analyze.c200
-rw-r--r--src/backend/commands/async.c22
-rw-r--r--src/backend/commands/cluster.c339
-rw-r--r--src/backend/commands/collationcmds.c443
-rw-r--r--src/backend/commands/comment.c1247
-rw-r--r--src/backend/commands/constraint.c4
-rw-r--r--src/backend/commands/conversioncmds.c88
-rw-r--r--src/backend/commands/copy.c1598
-rw-r--r--src/backend/commands/dbcommands.c163
-rw-r--r--src/backend/commands/define.c4
-rw-r--r--src/backend/commands/discard.c7
-rw-r--r--src/backend/commands/explain.c477
-rw-r--r--src/backend/commands/extension.c2763
-rw-r--r--src/backend/commands/foreigncmds.c330
-rw-r--r--src/backend/commands/functioncmds.c147
-rw-r--r--src/backend/commands/indexcmds.c315
-rw-r--r--src/backend/commands/lockcmds.c4
-rw-r--r--src/backend/commands/opclasscmds.c592
-rw-r--r--src/backend/commands/operatorcmds.c63
-rw-r--r--src/backend/commands/portalcmds.c15
-rw-r--r--src/backend/commands/prepare.c12
-rw-r--r--src/backend/commands/proclang.c61
-rw-r--r--src/backend/commands/schemacmds.c29
-rw-r--r--src/backend/commands/seclabel.c336
-rw-r--r--src/backend/commands/sequence.c302
-rw-r--r--src/backend/commands/tablecmds.c2403
-rw-r--r--src/backend/commands/tablespace.c176
-rw-r--r--src/backend/commands/trigger.c851
-rw-r--r--src/backend/commands/tsearchcmds.c266
-rw-r--r--src/backend/commands/typecmds.c225
-rw-r--r--src/backend/commands/user.c110
-rw-r--r--src/backend/commands/vacuum.c169
-rw-r--r--src/backend/commands/vacuumlazy.c205
-rw-r--r--src/backend/commands/variable.c935
-rw-r--r--src/backend/commands/view.c63
-rw-r--r--src/backend/common.mk8
-rw-r--r--src/backend/executor/Makefile6
-rw-r--r--src/backend/executor/README7
-rw-r--r--src/backend/executor/execAmi.c84
-rw-r--r--src/backend/executor/execCurrent.c27
-rw-r--r--src/backend/executor/execGrouping.c16
-rw-r--r--src/backend/executor/execJunk.c36
-rw-r--r--src/backend/executor/execMain.c554
-rw-r--r--src/backend/executor/execProcnode.c36
-rw-r--r--src/backend/executor/execQual.c367
-rw-r--r--src/backend/executor/execScan.c9
-rw-r--r--src/backend/executor/execTuples.c16
-rw-r--r--src/backend/executor/execUtils.c49
-rw-r--r--src/backend/executor/functions.c559
-rw-r--r--src/backend/executor/instrument.c4
-rw-r--r--src/backend/executor/nodeAgg.c43
-rw-r--r--src/backend/executor/nodeAppend.c14
-rw-r--r--src/backend/executor/nodeBitmapAnd.c13
-rw-r--r--src/backend/executor/nodeBitmapHeapscan.c32
-rw-r--r--src/backend/executor/nodeBitmapIndexscan.c67
-rw-r--r--src/backend/executor/nodeBitmapOr.c13
-rw-r--r--src/backend/executor/nodeCtescan.c8
-rw-r--r--src/backend/executor/nodeForeignscan.c209
-rw-r--r--src/backend/executor/nodeFunctionscan.c17
-rw-r--r--src/backend/executor/nodeGroup.c16
-rw-r--r--src/backend/executor/nodeHash.c159
-rw-r--r--src/backend/executor/nodeHashjoin.c621
-rw-r--r--src/backend/executor/nodeIndexscan.c234
-rw-r--r--src/backend/executor/nodeLimit.c80
-rw-r--r--src/backend/executor/nodeLockRows.c70
-rw-r--r--src/backend/executor/nodeMaterial.c24
-rw-r--r--src/backend/executor/nodeMergeAppend.c398
-rw-r--r--src/backend/executor/nodeMergejoin.c55
-rw-r--r--src/backend/executor/nodeModifyTable.c720
-rw-r--r--src/backend/executor/nodeNestloop.c66
-rw-r--r--src/backend/executor/nodeRecursiveunion.c19
-rw-r--r--src/backend/executor/nodeResult.c14
-rw-r--r--src/backend/executor/nodeSeqscan.c15
-rw-r--r--src/backend/executor/nodeSetOp.c14
-rw-r--r--src/backend/executor/nodeSort.c17
-rw-r--r--src/backend/executor/nodeSubplan.c48
-rw-r--r--src/backend/executor/nodeSubqueryscan.c12
-rw-r--r--src/backend/executor/nodeTidscan.c15
-rw-r--r--src/backend/executor/nodeUnique.c10
-rw-r--r--src/backend/executor/nodeValuesscan.c10
-rw-r--r--src/backend/executor/nodeWindowAgg.c31
-rw-r--r--src/backend/executor/nodeWorktablescan.c8
-rw-r--r--src/backend/executor/spi.c115
-rw-r--r--src/backend/executor/tstoreReceiver.c4
-rw-r--r--src/backend/foreign/Makefile2
-rw-r--r--src/backend/foreign/foreign.c243
-rw-r--r--src/backend/lib/Makefile2
-rw-r--r--src/backend/lib/dllist.c4
-rw-r--r--src/backend/lib/stringinfo.c4
-rw-r--r--src/backend/libpq/Makefile2
-rw-r--r--src/backend/libpq/README.SSL4
-rw-r--r--src/backend/libpq/auth.c525
-rw-r--r--src/backend/libpq/be-fsstubs.c11
-rw-r--r--src/backend/libpq/be-secure.c6
-rw-r--r--src/backend/libpq/crypt.c4
-rw-r--r--src/backend/libpq/hba.c245
-rw-r--r--src/backend/libpq/ip.c12
-rw-r--r--src/backend/libpq/md5.c4
-rw-r--r--src/backend/libpq/pg_hba.conf.sample39
-rw-r--r--src/backend/libpq/pqcomm.c276
-rw-r--r--src/backend/libpq/pqformat.c4
-rw-r--r--src/backend/libpq/pqsignal.c6
-rw-r--r--src/backend/main/Makefile2
-rw-r--r--src/backend/main/main.c62
-rw-r--r--src/backend/nls.mk9
-rw-r--r--src/backend/nodes/Makefile2
-rw-r--r--src/backend/nodes/README6
-rw-r--r--src/backend/nodes/bitmapset.c4
-rw-r--r--src/backend/nodes/copyfuncs.c374
-rw-r--r--src/backend/nodes/equalfuncs.c235
-rw-r--r--src/backend/nodes/list.c4
-rw-r--r--src/backend/nodes/makefuncs.c135
-rw-r--r--src/backend/nodes/nodeFuncs.c582
-rw-r--r--src/backend/nodes/nodes.c4
-rw-r--r--src/backend/nodes/outfuncs.c299
-rw-r--r--src/backend/nodes/params.c6
-rw-r--r--src/backend/nodes/print.c12
-rw-r--r--src/backend/nodes/read.c4
-rw-r--r--src/backend/nodes/readfuncs.c129
-rw-r--r--src/backend/nodes/tidbitmap.c4
-rw-r--r--src/backend/nodes/value.c4
-rw-r--r--src/backend/optimizer/Makefile2
-rw-r--r--src/backend/optimizer/README45
-rw-r--r--src/backend/optimizer/geqo/Makefile2
-rw-r--r--src/backend/optimizer/geqo/geqo_copy.c4
-rw-r--r--src/backend/optimizer/geqo/geqo_cx.c2
-rw-r--r--src/backend/optimizer/geqo/geqo_erx.c2
-rw-r--r--src/backend/optimizer/geqo/geqo_eval.c4
-rw-r--r--src/backend/optimizer/geqo/geqo_main.c16
-rw-r--r--src/backend/optimizer/geqo/geqo_misc.c4
-rw-r--r--src/backend/optimizer/geqo/geqo_mutation.c2
-rw-r--r--src/backend/optimizer/geqo/geqo_ox1.c2
-rw-r--r--src/backend/optimizer/geqo/geqo_ox2.c2
-rw-r--r--src/backend/optimizer/geqo/geqo_pmx.c2
-rw-r--r--src/backend/optimizer/geqo/geqo_pool.c4
-rw-r--r--src/backend/optimizer/geqo/geqo_px.c2
-rw-r--r--src/backend/optimizer/geqo/geqo_random.c4
-rw-r--r--src/backend/optimizer/geqo/geqo_recombination.c2
-rw-r--r--src/backend/optimizer/geqo/geqo_selection.c4
-rw-r--r--src/backend/optimizer/path/Makefile2
-rw-r--r--src/backend/optimizer/path/allpaths.c264
-rw-r--r--src/backend/optimizer/path/clausesel.c4
-rw-r--r--src/backend/optimizer/path/costsize.c448
-rw-r--r--src/backend/optimizer/path/equivclass.c215
-rw-r--r--src/backend/optimizer/path/indxpath.c506
-rw-r--r--src/backend/optimizer/path/joinpath.c142
-rw-r--r--src/backend/optimizer/path/joinrels.c80
-rw-r--r--src/backend/optimizer/path/orindxpath.c4
-rw-r--r--src/backend/optimizer/path/pathkeys.c427
-rw-r--r--src/backend/optimizer/path/tidpath.c4
-rw-r--r--src/backend/optimizer/plan/Makefile2
-rw-r--r--src/backend/optimizer/plan/README44
-rw-r--r--src/backend/optimizer/plan/analyzejoins.c110
-rw-r--r--src/backend/optimizer/plan/createplan.c715
-rw-r--r--src/backend/optimizer/plan/initsplan.c93
-rw-r--r--src/backend/optimizer/plan/planagg.c818
-rw-r--r--src/backend/optimizer/plan/planmain.c92
-rw-r--r--src/backend/optimizer/plan/planner.c277
-rw-r--r--src/backend/optimizer/plan/setrefs.c308
-rw-r--r--src/backend/optimizer/plan/subselect.c323
-rw-r--r--src/backend/optimizer/prep/Makefile2
-rw-r--r--src/backend/optimizer/prep/prepjointree.c187
-rw-r--r--src/backend/optimizer/prep/prepqual.c383
-rw-r--r--src/backend/optimizer/prep/preptlist.c92
-rw-r--r--src/backend/optimizer/prep/prepunion.c170
-rw-r--r--src/backend/optimizer/util/Makefile2
-rw-r--r--src/backend/optimizer/util/clauses.c520
-rw-r--r--src/backend/optimizer/util/joininfo.c38
-rw-r--r--src/backend/optimizer/util/pathnode.c178
-rw-r--r--src/backend/optimizer/util/placeholder.c242
-rw-r--r--src/backend/optimizer/util/plancat.c238
-rw-r--r--src/backend/optimizer/util/predtest.c29
-rw-r--r--src/backend/optimizer/util/relnode.c4
-rw-r--r--src/backend/optimizer/util/restrictinfo.c4
-rw-r--r--src/backend/optimizer/util/tlist.c46
-rw-r--r--src/backend/optimizer/util/var.c7
-rw-r--r--src/backend/parser/.cvsignore3
-rw-r--r--src/backend/parser/.gitignore3
-rw-r--r--src/backend/parser/Makefile8
-rw-r--r--src/backend/parser/README3
-rw-r--r--src/backend/parser/analyze.c492
-rw-r--r--src/backend/parser/gram.y1633
-rw-r--r--src/backend/parser/keywords.c4
-rw-r--r--src/backend/parser/kwlookup.c4
-rw-r--r--src/backend/parser/parse_agg.c74
-rw-r--r--src/backend/parser/parse_clause.c63
-rw-r--r--src/backend/parser/parse_coerce.c176
-rw-r--r--src/backend/parser/parse_collate.c773
-rw-r--r--src/backend/parser/parse_cte.c112
-rw-r--r--src/backend/parser/parse_expr.c298
-rw-r--r--src/backend/parser/parse_func.c51
-rw-r--r--src/backend/parser/parse_node.c64
-rw-r--r--src/backend/parser/parse_oper.c81
-rw-r--r--src/backend/parser/parse_param.c14
-rw-r--r--src/backend/parser/parse_relation.c102
-rw-r--r--src/backend/parser/parse_target.c197
-rw-r--r--src/backend/parser/parse_type.c115
-rw-r--r--src/backend/parser/parse_utilcmd.c542
-rw-r--r--src/backend/parser/parser.c4
-rw-r--r--src/backend/parser/scan.l12
-rw-r--r--src/backend/parser/scansup.c4
-rw-r--r--src/backend/pgxc/plan/planner.c7
-rw-r--r--src/backend/pgxc/pool/poolutils.c7
-rw-r--r--src/backend/pgxc/pool/postgresql_fdw.c4
-rw-r--r--src/backend/po/de.po20152
-rw-r--r--src/backend/po/es.po18769
-rw-r--r--src/backend/po/fr.po17210
-rw-r--r--src/backend/po/ja.po21406
-rw-r--r--src/backend/po/pt_BR.po24388
-rw-r--r--src/backend/po/zh_CN.po17232
-rw-r--r--src/backend/po/zh_TW.po22318
-rw-r--r--src/backend/port/.gitignore5
-rw-r--r--src/backend/port/Makefile14
-rwxr-xr-xsrc/backend/port/aix/mkldexport.sh6
-rw-r--r--src/backend/port/darwin/Makefile2
-rw-r--r--src/backend/port/darwin/README4
-rw-r--r--src/backend/port/darwin/system.c2
-rw-r--r--src/backend/port/dynloader/aix.c2
-rw-r--r--src/backend/port/dynloader/aix.h4
-rw-r--r--src/backend/port/dynloader/bsdi.c4
-rw-r--r--src/backend/port/dynloader/bsdi.h4
-rw-r--r--src/backend/port/dynloader/cygwin.c2
-rw-r--r--src/backend/port/dynloader/cygwin.h4
-rw-r--r--src/backend/port/dynloader/darwin.c4
-rw-r--r--src/backend/port/dynloader/darwin.h2
-rw-r--r--src/backend/port/dynloader/dgux.c2
-rw-r--r--src/backend/port/dynloader/dgux.h4
-rw-r--r--src/backend/port/dynloader/freebsd.c15
-rw-r--r--src/backend/port/dynloader/freebsd.h4
-rw-r--r--src/backend/port/dynloader/hpux.c4
-rw-r--r--src/backend/port/dynloader/hpux.h4
-rw-r--r--src/backend/port/dynloader/irix.c2
-rw-r--r--src/backend/port/dynloader/irix.h4
-rw-r--r--src/backend/port/dynloader/linux.c4
-rw-r--r--src/backend/port/dynloader/linux.h4
-rw-r--r--src/backend/port/dynloader/netbsd.c9
-rw-r--r--src/backend/port/dynloader/netbsd.h4
-rw-r--r--src/backend/port/dynloader/nextstep.c2
-rw-r--r--src/backend/port/dynloader/nextstep.h2
-rw-r--r--src/backend/port/dynloader/openbsd.c16
-rw-r--r--src/backend/port/dynloader/openbsd.h4
-rw-r--r--src/backend/port/dynloader/osf.c2
-rw-r--r--src/backend/port/dynloader/osf.h4
-rw-r--r--src/backend/port/dynloader/sco.c2
-rw-r--r--src/backend/port/dynloader/sco.h4
-rw-r--r--src/backend/port/dynloader/solaris.c2
-rw-r--r--src/backend/port/dynloader/solaris.h4
-rw-r--r--src/backend/port/dynloader/sunos4.c2
-rw-r--r--src/backend/port/dynloader/sunos4.h4
-rw-r--r--src/backend/port/dynloader/svr4.c2
-rw-r--r--src/backend/port/dynloader/svr4.h4
-rw-r--r--src/backend/port/dynloader/ultrix4.c4
-rw-r--r--src/backend/port/dynloader/ultrix4.h4
-rw-r--r--src/backend/port/dynloader/univel.c2
-rw-r--r--src/backend/port/dynloader/univel.h4
-rw-r--r--src/backend/port/dynloader/unixware.c2
-rw-r--r--src/backend/port/dynloader/unixware.h4
-rw-r--r--src/backend/port/dynloader/win32.c2
-rw-r--r--src/backend/port/dynloader/win32.h2
-rw-r--r--src/backend/port/ipc_test.c8
-rw-r--r--src/backend/port/nextstep/Makefile2
-rw-r--r--src/backend/port/nextstep/port.c2
-rw-r--r--src/backend/port/pipe.c (renamed from src/port/pipe.c)12
-rw-r--r--src/backend/port/posix_sema.c4
-rw-r--r--src/backend/port/sysv_sema.c12
-rw-r--r--src/backend/port/sysv_shmem.c50
-rw-r--r--src/backend/port/tas/sunstudio_sparc.s8
-rw-r--r--src/backend/port/tas/sunstudio_x86.s4
-rw-r--r--src/backend/port/unix_latch.c432
-rw-r--r--src/backend/port/win32/Makefile5
-rw-r--r--src/backend/port/win32/crashdump.c170
-rw-r--r--src/backend/port/win32/mingwcompat.c4
-rw-r--r--src/backend/port/win32/security.c10
-rw-r--r--src/backend/port/win32/signal.c4
-rw-r--r--src/backend/port/win32/socket.c31
-rw-r--r--src/backend/port/win32/timer.c6
-rw-r--r--src/backend/port/win32_latch.c216
-rw-r--r--src/backend/port/win32_sema.c4
-rw-r--r--src/backend/port/win32_shmem.c4
-rw-r--r--src/backend/postmaster/Makefile2
-rw-r--r--src/backend/postmaster/autovacuum.c118
-rw-r--r--src/backend/postmaster/bgwriter.c166
-rw-r--r--src/backend/postmaster/fork_process.c4
-rw-r--r--src/backend/postmaster/pgarch.c4
-rw-r--r--src/backend/postmaster/pgstat.c222
-rw-r--r--src/backend/postmaster/postmaster.c239
-rw-r--r--src/backend/postmaster/syslogger.c111
-rw-r--r--src/backend/postmaster/walwriter.c4
-rw-r--r--src/backend/regex/Makefile5
-rw-r--r--src/backend/regex/regc_color.c2
-rw-r--r--src/backend/regex/regc_cvec.c2
-rw-r--r--src/backend/regex/regc_lex.c4
-rw-r--r--src/backend/regex/regc_locale.c167
-rw-r--r--src/backend/regex/regc_nfa.c2
-rw-r--r--src/backend/regex/regc_pg_locale.c658
-rw-r--r--src/backend/regex/regcomp.c20
-rw-r--r--src/backend/regex/rege_dfa.c2
-rw-r--r--src/backend/regex/regerror.c2
-rw-r--r--src/backend/regex/regexec.c7
-rw-r--r--src/backend/regex/regfree.c2
-rw-r--r--src/backend/replication/.gitignore3
-rw-r--r--src/backend/replication/Makefile27
-rw-r--r--src/backend/replication/README33
-rw-r--r--src/backend/replication/basebackup.c866
-rw-r--r--src/backend/replication/libpqwalreceiver/Makefile5
-rw-r--r--src/backend/replication/libpqwalreceiver/libpqwalreceiver.c69
-rw-r--r--src/backend/replication/repl_gram.y169
-rw-r--r--src/backend/replication/repl_scanner.l171
-rw-r--r--src/backend/replication/syncrep.c675
-rw-r--r--src/backend/replication/walreceiver.c191
-rw-r--r--src/backend/replication/walreceiverfuncs.c17
-rw-r--r--src/backend/replication/walsender.c986
-rw-r--r--src/backend/rewrite/Makefile2
-rw-r--r--src/backend/rewrite/rewriteDefine.c47
-rw-r--r--src/backend/rewrite/rewriteHandler.c412
-rw-r--r--src/backend/rewrite/rewriteManip.c12
-rw-r--r--src/backend/rewrite/rewriteRemove.c4
-rw-r--r--src/backend/rewrite/rewriteSupport.c88
-rw-r--r--src/backend/snowball/.gitignore1
-rw-r--r--src/backend/snowball/Makefile4
-rw-r--r--src/backend/snowball/README2
-rw-r--r--src/backend/snowball/dict_snowball.c4
-rw-r--r--src/backend/snowball/snowball.sql.in3
-rw-r--r--src/backend/snowball/snowball_func.sql.in3
-rw-r--r--src/backend/snowball/stopwords/english.stop1
-rw-r--r--src/backend/storage/Makefile2
-rw-r--r--src/backend/storage/buffer/Makefile2
-rw-r--r--src/backend/storage/buffer/README4
-rw-r--r--src/backend/storage/buffer/buf_init.c4
-rw-r--r--src/backend/storage/buffer/buf_table.c4
-rw-r--r--src/backend/storage/buffer/bufmgr.c211
-rw-r--r--src/backend/storage/buffer/freelist.c6
-rw-r--r--src/backend/storage/buffer/localbuf.c58
-rw-r--r--src/backend/storage/file/Makefile4
-rw-r--r--src/backend/storage/file/buffile.c4
-rw-r--r--src/backend/storage/file/copydir.c10
-rw-r--r--src/backend/storage/file/fd.c287
-rw-r--r--src/backend/storage/file/reinit.c400
-rw-r--r--src/backend/storage/freespace/Makefile2
-rw-r--r--src/backend/storage/freespace/README5
-rw-r--r--src/backend/storage/freespace/freespace.c8
-rw-r--r--src/backend/storage/freespace/fsmpage.c4
-rw-r--r--src/backend/storage/freespace/indexfsm.c4
-rw-r--r--src/backend/storage/ipc/Makefile2
-rw-r--r--src/backend/storage/ipc/README4
-rw-r--r--src/backend/storage/ipc/ipc.c8
-rw-r--r--src/backend/storage/ipc/ipci.c11
-rw-r--r--src/backend/storage/ipc/pmsignal.c55
-rw-r--r--src/backend/storage/ipc/procarray.c285
-rw-r--r--src/backend/storage/ipc/procsignal.c9
-rw-r--r--src/backend/storage/ipc/shmem.c6
-rw-r--r--src/backend/storage/ipc/shmqueue.c33
-rw-r--r--src/backend/storage/ipc/sinval.c4
-rw-r--r--src/backend/storage/ipc/sinvaladt.c4
-rw-r--r--src/backend/storage/ipc/standby.c115
-rw-r--r--src/backend/storage/large_object/Makefile2
-rw-r--r--src/backend/storage/large_object/inv_api.c30
-rw-r--r--src/backend/storage/lmgr/Makefile6
-rw-r--r--src/backend/storage/lmgr/README21
-rw-r--r--src/backend/storage/lmgr/README-SSI585
-rw-r--r--src/backend/storage/lmgr/deadlock.c4
-rw-r--r--src/backend/storage/lmgr/lmgr.c7
-rw-r--r--src/backend/storage/lmgr/lock.c137
-rw-r--r--src/backend/storage/lmgr/lwlock.c8
-rw-r--r--src/backend/storage/lmgr/predicate.c4768
-rw-r--r--src/backend/storage/lmgr/proc.c29
-rw-r--r--src/backend/storage/lmgr/s_lock.c4
-rw-r--r--src/backend/storage/lmgr/spin.c4
-rw-r--r--src/backend/storage/page/Makefile2
-rw-r--r--src/backend/storage/page/bufpage.c4
-rw-r--r--src/backend/storage/page/itemptr.c4
-rw-r--r--src/backend/storage/smgr/Makefile2
-rw-r--r--src/backend/storage/smgr/README2
-rw-r--r--src/backend/storage/smgr/md.c154
-rw-r--r--src/backend/storage/smgr/smgr.c86
-rw-r--r--src/backend/storage/smgr/smgrtype.c4
-rw-r--r--src/backend/tcop/Makefile2
-rw-r--r--src/backend/tcop/dest.c4
-rw-r--r--src/backend/tcop/fastpath.c21
-rw-r--r--src/backend/tcop/postgres.c193
-rw-r--r--src/backend/tcop/pquery.c114
-rw-r--r--src/backend/tcop/utility.c430
-rw-r--r--src/backend/tsearch/Makefile4
-rw-r--r--src/backend/tsearch/dict.c4
-rw-r--r--src/backend/tsearch/dict_ispell.c9
-rw-r--r--src/backend/tsearch/dict_simple.c4
-rw-r--r--src/backend/tsearch/dict_synonym.c4
-rw-r--r--src/backend/tsearch/dict_thesaurus.c6
-rw-r--r--src/backend/tsearch/regis.c8
-rw-r--r--src/backend/tsearch/spell.c173
-rw-r--r--src/backend/tsearch/thesaurus_sample.ths1
-rw-r--r--src/backend/tsearch/to_tsany.c4
-rw-r--r--src/backend/tsearch/ts_locale.c40
-rw-r--r--src/backend/tsearch/ts_parse.c4
-rw-r--r--src/backend/tsearch/ts_selfuncs.c193
-rw-r--r--src/backend/tsearch/ts_typanalyze.c4
-rw-r--r--src/backend/tsearch/ts_utils.c4
-rw-r--r--src/backend/tsearch/wparser.c10
-rw-r--r--src/backend/tsearch/wparser_def.c763
-rw-r--r--src/backend/utils/.cvsignore2
-rw-r--r--src/backend/utils/.gitignore4
-rw-r--r--src/backend/utils/Gen_dummy_probes.sed4
-rw-r--r--src/backend/utils/Gen_fmgrtab.pl10
-rw-r--r--src/backend/utils/Makefile11
-rw-r--r--src/backend/utils/adt/Makefile4
-rw-r--r--src/backend/utils/adt/acl.c141
-rw-r--r--src/backend/utils/adt/array_userfuncs.c11
-rw-r--r--src/backend/utils/adt/arrayfuncs.c482
-rw-r--r--src/backend/utils/adt/arrayutils.c6
-rw-r--r--src/backend/utils/adt/ascii.c4
-rw-r--r--src/backend/utils/adt/bool.c4
-rw-r--r--src/backend/utils/adt/cash.c161
-rw-r--r--src/backend/utils/adt/char.c4
-rw-r--r--src/backend/utils/adt/date.c39
-rw-r--r--src/backend/utils/adt/datetime.c162
-rw-r--r--src/backend/utils/adt/datum.c4
-rw-r--r--src/backend/utils/adt/dbsize.c101
-rw-r--r--src/backend/utils/adt/domains.c4
-rw-r--r--src/backend/utils/adt/encode.c4
-rw-r--r--src/backend/utils/adt/enum.c263
-rw-r--r--src/backend/utils/adt/float.c4
-rw-r--r--src/backend/utils/adt/format_type.c30
-rw-r--r--src/backend/utils/adt/formatting.c389
-rw-r--r--src/backend/utils/adt/genfile.c209
-rw-r--r--src/backend/utils/adt/geo_ops.c89
-rw-r--r--src/backend/utils/adt/geo_selfuncs.c4
-rw-r--r--src/backend/utils/adt/inet_cidr_ntop.c (renamed from src/backend/utils/adt/inet_net_ntop.c)237
-rw-r--r--src/backend/utils/adt/inet_net_pton.c2
-rw-r--r--src/backend/utils/adt/int.c28
-rw-r--r--src/backend/utils/adt/int8.c29
-rw-r--r--src/backend/utils/adt/like.c102
-rw-r--r--src/backend/utils/adt/like_match.c16
-rw-r--r--src/backend/utils/adt/lockfuncs.c240
-rw-r--r--src/backend/utils/adt/mac.c2
-rw-r--r--src/backend/utils/adt/misc.c4
-rw-r--r--src/backend/utils/adt/nabstime.c33
-rw-r--r--src/backend/utils/adt/name.c4
-rw-r--r--src/backend/utils/adt/network.c62
-rw-r--r--src/backend/utils/adt/numeric.c281
-rw-r--r--src/backend/utils/adt/numutils.c124
-rw-r--r--src/backend/utils/adt/oid.c7
-rw-r--r--src/backend/utils/adt/oracle_compat.c13
-rw-r--r--src/backend/utils/adt/pg_locale.c660
-rw-r--r--src/backend/utils/adt/pg_lzcompress.c4
-rw-r--r--src/backend/utils/adt/pgstatfuncs.c457
-rw-r--r--src/backend/utils/adt/pseudotypes.c114
-rw-r--r--src/backend/utils/adt/quote.c79
-rw-r--r--src/backend/utils/adt/regexp.c46
-rw-r--r--src/backend/utils/adt/regproc.c8
-rw-r--r--src/backend/utils/adt/ri_triggers.c126
-rw-r--r--src/backend/utils/adt/rowtypes.c26
-rw-r--r--src/backend/utils/adt/ruleutils.c841
-rw-r--r--src/backend/utils/adt/selfuncs.c863
-rw-r--r--src/backend/utils/adt/tid.c4
-rw-r--r--src/backend/utils/adt/timestamp.c8
-rw-r--r--src/backend/utils/adt/trigfuncs.c4
-rw-r--r--src/backend/utils/adt/tsginidx.c118
-rw-r--r--src/backend/utils/adt/tsgistidx.c4
-rw-r--r--src/backend/utils/adt/tsquery.c8
-rw-r--r--src/backend/utils/adt/tsquery_cleanup.c4
-rw-r--r--src/backend/utils/adt/tsquery_gist.c4
-rw-r--r--src/backend/utils/adt/tsquery_op.c20
-rw-r--r--src/backend/utils/adt/tsquery_rewrite.c4
-rw-r--r--src/backend/utils/adt/tsquery_util.c17
-rw-r--r--src/backend/utils/adt/tsrank.c10
-rw-r--r--src/backend/utils/adt/tsvector.c4
-rw-r--r--src/backend/utils/adt/tsvector_op.c88
-rw-r--r--src/backend/utils/adt/tsvector_parser.c4
-rw-r--r--src/backend/utils/adt/txid.c4
-rw-r--r--src/backend/utils/adt/uuid.c4
-rw-r--r--src/backend/utils/adt/varbit.c65
-rw-r--r--src/backend/utils/adt/varchar.c31
-rw-r--r--src/backend/utils/adt/varlena.c863
-rw-r--r--src/backend/utils/adt/version.c4
-rw-r--r--src/backend/utils/adt/windowfuncs.c4
-rw-r--r--src/backend/utils/adt/xid.c4
-rw-r--r--src/backend/utils/adt/xml.c275
-rw-r--r--src/backend/utils/cache/Makefile2
-rw-r--r--src/backend/utils/cache/attoptcache.c4
-rw-r--r--src/backend/utils/cache/catcache.c15
-rw-r--r--src/backend/utils/cache/inval.c23
-rw-r--r--src/backend/utils/cache/lsyscache.c296
-rw-r--r--src/backend/utils/cache/plancache.c9
-rw-r--r--src/backend/utils/cache/relcache.c288
-rw-r--r--src/backend/utils/cache/relmapper.c4
-rw-r--r--src/backend/utils/cache/spccache.c4
-rw-r--r--src/backend/utils/cache/syscache.c45
-rw-r--r--src/backend/utils/cache/ts_cache.c40
-rw-r--r--src/backend/utils/cache/typcache.c784
-rw-r--r--src/backend/utils/errcodes.txt455
-rw-r--r--src/backend/utils/error/Makefile2
-rw-r--r--src/backend/utils/error/assert.c4
-rw-r--r--src/backend/utils/error/elog.c147
-rw-r--r--src/backend/utils/fmgr/Makefile2
-rw-r--r--src/backend/utils/fmgr/README20
-rw-r--r--src/backend/utils/fmgr/dfmgr.c8
-rw-r--r--src/backend/utils/fmgr/fmgr.c251
-rw-r--r--src/backend/utils/fmgr/funcapi.c25
-rw-r--r--src/backend/utils/generate-errcodes.pl41
-rw-r--r--src/backend/utils/hash/Makefile2
-rw-r--r--src/backend/utils/hash/dynahash.c10
-rw-r--r--src/backend/utils/hash/hashfn.c4
-rw-r--r--src/backend/utils/hash/pg_crc.c4
-rw-r--r--src/backend/utils/init/Makefile2
-rw-r--r--src/backend/utils/init/globals.c12
-rw-r--r--src/backend/utils/init/miscinit.c148
-rw-r--r--src/backend/utils/init/postinit.c196
-rw-r--r--src/backend/utils/mb/Makefile2
-rw-r--r--src/backend/utils/mb/README2
-rw-r--r--src/backend/utils/mb/Unicode/ISO10646-GB18030.TXT1
-rw-r--r--src/backend/utils/mb/Unicode/Makefile4
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_BIG5.pl5
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl6
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl14
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl6
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_EUC_KR.pl6
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_EUC_TW.pl6
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_GB18030.pl4
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl12
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_SJIS.pl6
-rw-r--r--src/backend/utils/mb/Unicode/UCS_to_most.pl4
-rw-r--r--src/backend/utils/mb/Unicode/euc_cn_to_utf8.map2
-rw-r--r--src/backend/utils/mb/Unicode/euc_jp_to_utf8.map2
-rw-r--r--src/backend/utils/mb/Unicode/euc_tw_to_utf8.map2
-rw-r--r--src/backend/utils/mb/Unicode/gb18030_to_utf8.map2
-rw-r--r--src/backend/utils/mb/Unicode/gbk_to_utf8.map2
-rw-r--r--src/backend/utils/mb/Unicode/iso8859_10_to_utf8.map2
-rw-r--r--src/backend/utils/mb/Unicode/iso8859_13_to_utf8.map2
-rw-r--r--src/backend/utils/mb/Unicode/iso8859_14_to_utf8.map2
-rw-r--r--src/backend/utils/mb/Unicode/iso8859_15_to_utf8.map2
-rw-r--r--src/backend/utils/mb/Unicode/iso8859_16_to_utf8.map2
-rw-r--r--src/backend/utils/mb/Unicode/iso8859_2_to_utf8.map2
-rw-r--r--src/backend/utils/mb/Unicode/iso8859_3_to_utf8.map2
-rw-r--r--src/backend/utils/mb/Unicode/iso8859_4_to_utf8.map2
-rw-r--r--src/backend/utils/mb/Unicode/iso8859_5_to_utf8.map2
-rw-r--r--src/backend/utils/mb/Unicode/iso8859_6_to_utf8.map2
-rw-r--r--src/backend/utils/mb/Unicode/iso8859_7_to_utf8.map2
-rw-r--r--src/backend/utils/mb/Unicode/iso8859_8_to_utf8.map2
-rw-r--r--src/backend/utils/mb/Unicode/iso8859_9_to_utf8.map2
-rw-r--r--src/backend/utils/mb/Unicode/koi8r_to_utf8.map2
-rw-r--r--src/backend/utils/mb/Unicode/ucs2utf.pl8
-rw-r--r--src/backend/utils/mb/conv.c4
-rw-r--r--src/backend/utils/mb/conversion_procs/.gitignore1
-rw-r--r--src/backend/utils/mb/conversion_procs/Makefile9
-rw-r--r--src/backend/utils/mb/conversion_procs/ascii_and_mic/Makefile2
-rw-r--r--src/backend/utils/mb/conversion_procs/ascii_and_mic/ascii_and_mic.c4
-rw-r--r--src/backend/utils/mb/conversion_procs/cyrillic_and_mic/Makefile2
-rw-r--r--src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c4
-rw-r--r--src/backend/utils/mb/conversion_procs/euc2004_sjis2004/Makefile2
-rw-r--r--src/backend/utils/mb/conversion_procs/euc2004_sjis2004/euc2004_sjis2004.c4
-rw-r--r--src/backend/utils/mb/conversion_procs/euc_cn_and_mic/Makefile2
-rw-r--r--src/backend/utils/mb/conversion_procs/euc_cn_and_mic/euc_cn_and_mic.c4
-rw-r--r--src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/Makefile2
-rw-r--r--src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c4
-rw-r--r--src/backend/utils/mb/conversion_procs/euc_kr_and_mic/Makefile2
-rw-r--r--src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c4
-rw-r--r--src/backend/utils/mb/conversion_procs/euc_tw_and_big5/Makefile2
-rw-r--r--src/backend/utils/mb/conversion_procs/euc_tw_and_big5/big5.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c4
-rw-r--r--src/backend/utils/mb/conversion_procs/latin2_and_win1250/Makefile2
-rw-r--r--src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c4
-rw-r--r--src/backend/utils/mb/conversion_procs/latin_and_mic/Makefile2
-rw-r--r--src/backend/utils/mb/conversion_procs/latin_and_mic/latin_and_mic.c4
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_ascii/Makefile2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_ascii/utf8_and_ascii.c4
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_big5/Makefile2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c4
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/Makefile2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c4
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_euc2004/Makefile2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_euc2004/utf8_and_euc2004.c4
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/Makefile2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c4
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/Makefile2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c4
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/Makefile2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c4
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/Makefile2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c4
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_gb18030/Makefile2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c4
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_gbk/Makefile2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c4
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_iso8859/Makefile2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c4
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/Makefile2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c4
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_johab/Makefile2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c4
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_sjis/Makefile2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c4
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_sjis2004/Makefile2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_sjis2004/utf8_and_sjis2004.c4
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_uhc/Makefile2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c4
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_win/Makefile2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_win/utf8_and_win.c4
-rw-r--r--src/backend/utils/mb/encnames.c2
-rw-r--r--src/backend/utils/mb/iso.c2
-rw-r--r--src/backend/utils/mb/mbutils.c357
-rw-r--r--src/backend/utils/mb/wchar.c20
-rw-r--r--src/backend/utils/mb/win1251.c2
-rw-r--r--src/backend/utils/mb/win866.c2
-rw-r--r--src/backend/utils/mb/wstrcmp.c2
-rw-r--r--src/backend/utils/mb/wstrncmp.c2
-rw-r--r--src/backend/utils/misc/.cvsignore1
-rw-r--r--src/backend/utils/misc/.gitignore1
-rw-r--r--src/backend/utils/misc/Makefile4
-rw-r--r--src/backend/utils/misc/README205
-rwxr-xr-xsrc/backend/utils/misc/check_guc24
-rw-r--r--src/backend/utils/misc/guc-file.l219
-rw-r--r--src/backend/utils/misc/guc.c2640
-rw-r--r--src/backend/utils/misc/help_config.c4
-rw-r--r--src/backend/utils/misc/pg_rusage.c4
-rw-r--r--src/backend/utils/misc/postgresql.conf.sample63
-rw-r--r--src/backend/utils/misc/ps_status.c6
-rw-r--r--src/backend/utils/misc/rbtree.c464
-rw-r--r--src/backend/utils/misc/superuser.c4
-rw-r--r--src/backend/utils/misc/tzparser.c156
-rw-r--r--src/backend/utils/mmgr/Makefile2
-rw-r--r--src/backend/utils/mmgr/README4
-rw-r--r--src/backend/utils/mmgr/aset.c66
-rw-r--r--src/backend/utils/mmgr/mcxt.c23
-rw-r--r--src/backend/utils/mmgr/portalmem.c222
-rw-r--r--src/backend/utils/probes.d16
-rw-r--r--src/backend/utils/resowner/Makefile2
-rw-r--r--src/backend/utils/resowner/README2
-rw-r--r--src/backend/utils/resowner/resowner.c10
-rw-r--r--src/backend/utils/sort/Makefile2
-rw-r--r--src/backend/utils/sort/logtape.c4
-rw-r--r--src/backend/utils/sort/tuplesort.c419
-rw-r--r--src/backend/utils/sort/tuplestore.c56
-rw-r--r--src/backend/utils/time/Makefile2
-rw-r--r--src/backend/utils/time/combocid.c4
-rw-r--r--src/backend/utils/time/snapmgr.c86
-rw-r--r--src/backend/utils/time/tqual.c24
-rw-r--r--src/bcc32.mak10
-rw-r--r--src/bin/Makefile9
-rw-r--r--src/bin/initdb/.gitignore4
-rw-r--r--src/bin/initdb/Makefile8
-rw-r--r--src/bin/initdb/initdb.c548
-rw-r--r--src/bin/initdb/nls.mk4
-rw-r--r--src/bin/initdb/po/cs.po4
-rw-r--r--src/bin/initdb/po/de.po319
-rw-r--r--src/bin/initdb/po/es.po294
-rw-r--r--src/bin/initdb/po/fr.po6
-rw-r--r--src/bin/initdb/po/it.po22
-rw-r--r--src/bin/initdb/po/ja.po301
-rw-r--r--src/bin/initdb/po/ko.po809
-rw-r--r--src/bin/initdb/po/pt_BR.po4
-rw-r--r--src/bin/initdb/po/ro.po845
-rw-r--r--src/bin/initdb/po/ru.po748
-rw-r--r--src/bin/initdb/po/sv.po295
-rw-r--r--src/bin/initdb/po/tr.po346
-rw-r--r--src/bin/initdb/po/zh_CN.po636
-rw-r--r--src/bin/initdb/po/zh_TW.po875
-rw-r--r--src/bin/pg_basebackup/.gitignore1
-rw-r--r--src/bin/pg_basebackup/Makefile38
-rw-r--r--src/bin/pg_basebackup/nls.mk5
-rw-r--r--src/bin/pg_basebackup/pg_basebackup.c1124
-rw-r--r--src/bin/pg_basebackup/po/de.po365
-rw-r--r--src/bin/pg_basebackup/po/es.po353
-rw-r--r--src/bin/pg_config/.gitignore1
-rw-r--r--src/bin/pg_config/Makefile8
-rw-r--r--src/bin/pg_config/nls.mk4
-rw-r--r--src/bin/pg_config/pg_config.c4
-rw-r--r--src/bin/pg_config/po/es.po111
-rw-r--r--src/bin/pg_config/po/fr.po108
-rw-r--r--src/bin/pg_config/po/ja.po112
-rw-r--r--src/bin/pg_config/po/ko.po126
-rw-r--r--src/bin/pg_config/po/pt_BR.po77
-rw-r--r--src/bin/pg_config/po/ro.po219
-rw-r--r--src/bin/pg_config/po/ru.po274
-rw-r--r--src/bin/pg_config/po/sv.po102
-rw-r--r--src/bin/pg_config/po/ta.po2
-rw-r--r--src/bin/pg_config/po/tr.po105
-rw-r--r--src/bin/pg_config/po/zh_CN.po246
-rw-r--r--src/bin/pg_config/po/zh_TW.po304
-rw-r--r--src/bin/pg_controldata/.gitignore3
-rw-r--r--src/bin/pg_controldata/Makefile8
-rw-r--r--src/bin/pg_controldata/nls.mk4
-rw-r--r--src/bin/pg_controldata/pg_controldata.c2
-rw-r--r--src/bin/pg_controldata/po/es.po110
-rw-r--r--src/bin/pg_controldata/po/fr.po2
-rw-r--r--src/bin/pg_controldata/po/ja.po151
-rw-r--r--src/bin/pg_controldata/po/ko.po139
-rw-r--r--src/bin/pg_controldata/po/pt_BR.po9
-rw-r--r--src/bin/pg_controldata/po/ro.po341
-rw-r--r--src/bin/pg_controldata/po/ru.po365
-rw-r--r--src/bin/pg_controldata/po/tr.po155
-rw-r--r--src/bin/pg_controldata/po/zh_CN.po283
-rw-r--r--src/bin/pg_controldata/po/zh_TW.po338
-rw-r--r--src/bin/pg_ctl/.gitignore1
-rw-r--r--src/bin/pg_ctl/Makefile8
-rw-r--r--src/bin/pg_ctl/nls.mk4
-rw-r--r--src/bin/pg_ctl/pg_ctl.c568
-rw-r--r--src/bin/pg_ctl/po/de.po399
-rw-r--r--src/bin/pg_ctl/po/fr.po2
-rw-r--r--src/bin/pg_ctl/po/ja.po267
-rw-r--r--src/bin/pg_ctl/po/ru.po628
-rw-r--r--src/bin/pg_ctl/po/tr.po299
-rw-r--r--src/bin/pg_ctl/po/zh_CN.po533
-rw-r--r--src/bin/pg_ctl/po/zh_TW.po771
-rw-r--r--src/bin/pg_dump/.gitignore5
-rw-r--r--src/bin/pg_dump/Makefile14
-rw-r--r--src/bin/pg_dump/README8
-rw-r--r--src/bin/pg_dump/common.c92
-rw-r--r--src/bin/pg_dump/compress_io.c684
-rw-r--r--src/bin/pg_dump/compress_io.h70
-rw-r--r--src/bin/pg_dump/dumputils.c13
-rw-r--r--src/bin/pg_dump/dumputils.h6
-rw-r--r--src/bin/pg_dump/keywords.c4
-rw-r--r--src/bin/pg_dump/nls.mk4
-rw-r--r--src/bin/pg_dump/pg_backup.h10
-rw-r--r--src/bin/pg_dump/pg_backup_archiver.c352
-rw-r--r--src/bin/pg_dump/pg_backup_archiver.h27
-rw-r--r--src/bin/pg_dump/pg_backup_custom.c438
-rw-r--r--src/bin/pg_dump/pg_backup_db.c2
-rw-r--r--src/bin/pg_dump/pg_backup_db.h2
-rw-r--r--src/bin/pg_dump/pg_backup_directory.c683
-rw-r--r--src/bin/pg_dump/pg_backup_files.c2
-rw-r--r--src/bin/pg_dump/pg_backup_null.c2
-rw-r--r--src/bin/pg_dump/pg_backup_tar.c10
-rw-r--r--src/bin/pg_dump/pg_backup_tar.h2
-rw-r--r--src/bin/pg_dump/pg_dump.c2670
-rw-r--r--src/bin/pg_dump/pg_dump.h35
-rw-r--r--src/bin/pg_dump/pg_dump_sort.c80
-rw-r--r--src/bin/pg_dump/pg_dumpall.c144
-rw-r--r--src/bin/pg_dump/pg_restore.c59
-rw-r--r--src/bin/pg_dump/po/de.po985
-rw-r--r--src/bin/pg_dump/po/es.po883
-rw-r--r--src/bin/pg_dump/po/fr.po38
-rw-r--r--src/bin/pg_dump/po/it.po6
-rw-r--r--src/bin/pg_dump/po/ja.po953
-rw-r--r--src/bin/pg_dump/po/ko.po2147
-rw-r--r--src/bin/pg_dump/po/pt_BR.po744
-rw-r--r--src/bin/pg_dump/po/tr.po1155
-rw-r--r--src/bin/pg_dump/po/zh_CN.po2141
-rw-r--r--src/bin/pg_dump/po/zh_TW.po2213
-rw-r--r--src/bin/pg_resetxlog/.gitignore3
-rw-r--r--src/bin/pg_resetxlog/Makefile8
-rw-r--r--src/bin/pg_resetxlog/nls.mk4
-rw-r--r--src/bin/pg_resetxlog/pg_resetxlog.c4
-rw-r--r--src/bin/pg_resetxlog/po/es.po165
-rw-r--r--src/bin/pg_resetxlog/po/fr.po2
-rw-r--r--src/bin/pg_resetxlog/po/ja.po176
-rw-r--r--src/bin/pg_resetxlog/po/ko.po227
-rw-r--r--src/bin/pg_resetxlog/po/pt_BR.po7
-rw-r--r--src/bin/pg_resetxlog/po/ro.po420
-rw-r--r--src/bin/pg_resetxlog/po/ru.po2
-rw-r--r--src/bin/pg_resetxlog/po/tr.po215
-rw-r--r--src/bin/pg_resetxlog/po/zh_CN.po354
-rw-r--r--src/bin/pg_resetxlog/po/zh_TW.po475
-rw-r--r--src/bin/pgevent/Makefile4
-rw-r--r--src/bin/pgevent/README2
-rw-r--r--src/bin/pgevent/pgevent.c2
-rw-r--r--src/bin/pgevent/pgmsgevent.h2
-rw-r--r--src/bin/psql/.cvsignore3
-rw-r--r--src/bin/psql/.gitignore8
-rw-r--r--src/bin/psql/Makefile8
-rw-r--r--src/bin/psql/command.c415
-rw-r--r--src/bin/psql/command.h4
-rw-r--r--src/bin/psql/common.c23
-rw-r--r--src/bin/psql/common.h6
-rw-r--r--src/bin/psql/copy.c4
-rw-r--r--src/bin/psql/copy.h4
-rw-r--r--src/bin/psql/create_help.pl4
-rw-r--r--src/bin/psql/describe.c801
-rw-r--r--src/bin/psql/describe.h20
-rw-r--r--src/bin/psql/help.c92
-rw-r--r--src/bin/psql/help.h4
-rw-r--r--src/bin/psql/input.c4
-rw-r--r--src/bin/psql/input.h4
-rw-r--r--src/bin/psql/large_obj.c6
-rw-r--r--src/bin/psql/large_obj.h4
-rw-r--r--src/bin/psql/mainloop.c4
-rw-r--r--src/bin/psql/mainloop.h4
-rw-r--r--src/bin/psql/mbprint.c28
-rw-r--r--src/bin/psql/mbprint.h2
-rw-r--r--src/bin/psql/nls.mk4
-rw-r--r--src/bin/psql/po/cs.po2
-rw-r--r--src/bin/psql/po/de.po2291
-rw-r--r--src/bin/psql/po/es.po1625
-rw-r--r--src/bin/psql/po/fr.po1198
-rw-r--r--src/bin/psql/po/ja.po1649
-rw-r--r--src/bin/psql/po/pt_BR.po4399
-rw-r--r--src/bin/psql/po/zh_CN.po4343
-rw-r--r--src/bin/psql/po/zh_TW.po5626
-rw-r--r--src/bin/psql/print.c8
-rw-r--r--src/bin/psql/print.h4
-rw-r--r--src/bin/psql/prompt.c4
-rw-r--r--src/bin/psql/prompt.h4
-rw-r--r--src/bin/psql/psqlscan.h4
-rw-r--r--src/bin/psql/psqlscan.l56
-rw-r--r--src/bin/psql/settings.h4
-rw-r--r--src/bin/psql/startup.c18
-rw-r--r--src/bin/psql/stringutils.c4
-rw-r--r--src/bin/psql/stringutils.h4
-rw-r--r--src/bin/psql/tab-complete.c712
-rw-r--r--src/bin/psql/tab-complete.h4
-rw-r--r--src/bin/psql/variables.c4
-rw-r--r--src/bin/psql/variables.h4
-rw-r--r--src/bin/scripts/.gitignore15
-rw-r--r--src/bin/scripts/Makefile24
-rw-r--r--src/bin/scripts/clusterdb.c4
-rw-r--r--src/bin/scripts/common.c4
-rw-r--r--src/bin/scripts/common.h4
-rw-r--r--src/bin/scripts/createdb.c15
-rw-r--r--src/bin/scripts/createlang.c14
-rw-r--r--src/bin/scripts/createuser.c4
-rw-r--r--src/bin/scripts/dropdb.c8
-rw-r--r--src/bin/scripts/droplang.c166
-rw-r--r--src/bin/scripts/dropuser.c4
-rw-r--r--src/bin/scripts/nls.mk4
-rw-r--r--src/bin/scripts/po/cs.po2
-rw-r--r--src/bin/scripts/po/fr.po2
-rw-r--r--src/bin/scripts/po/ja.po448
-rw-r--r--src/bin/scripts/po/ko.po565
-rw-r--r--src/bin/scripts/po/ro.po905
-rw-r--r--src/bin/scripts/po/tr.po160
-rw-r--r--src/bin/scripts/po/zh_CN.po672
-rw-r--r--src/bin/scripts/po/zh_TW.po954
-rw-r--r--src/bin/scripts/reindexdb.c4
-rw-r--r--src/bin/scripts/vacuumdb.c4
-rw-r--r--src/include/.gitignore5
-rw-r--r--src/include/Makefile8
-rw-r--r--src/include/access/attnum.h4
-rw-r--r--src/include/access/clog.h4
-rw-r--r--src/include/access/genam.h14
-rw-r--r--src/include/access/gin.h594
-rw-r--r--src/include/access/gin_private.h722
-rw-r--r--src/include/access/gist.h25
-rw-r--r--src/include/access/gist_private.h211
-rw-r--r--src/include/access/gistscan.h4
-rw-r--r--src/include/access/hash.h13
-rw-r--r--src/include/access/heapam.h8
-rw-r--r--src/include/access/hio.h8
-rw-r--r--src/include/access/htup.h27
-rw-r--r--src/include/access/itup.h6
-rw-r--r--src/include/access/multixact.h4
-rw-r--r--src/include/access/nbtree.h5
-rw-r--r--src/include/access/printtup.h4
-rw-r--r--src/include/access/reloptions.h4
-rw-r--r--src/include/access/relscan.h17
-rw-r--r--src/include/access/rewriteheap.h6
-rw-r--r--src/include/access/rmgr.h2
-rw-r--r--src/include/access/sdir.h4
-rw-r--r--src/include/access/skey.h30
-rw-r--r--src/include/access/slru.h9
-rw-r--r--src/include/access/subtrans.h4
-rw-r--r--src/include/access/sysattr.h4
-rw-r--r--src/include/access/transam.h4
-rw-r--r--src/include/access/tupconvert.h4
-rw-r--r--src/include/access/tupdesc.h10
-rw-r--r--src/include/access/tupmacs.h4
-rw-r--r--src/include/access/tuptoaster.h4
-rw-r--r--src/include/access/twophase.h4
-rw-r--r--src/include/access/twophase_rmgr.h9
-rw-r--r--src/include/access/valid.h9
-rw-r--r--src/include/access/visibilitymap.h4
-rw-r--r--src/include/access/xact.h35
-rw-r--r--src/include/access/xlog.h34
-rw-r--r--src/include/access/xlog_internal.h23
-rw-r--r--src/include/access/xlogdefs.h23
-rw-r--r--src/include/access/xlogutils.h4
-rw-r--r--src/include/bootstrap/bootstrap.h4
-rw-r--r--src/include/c.h8
-rw-r--r--src/include/catalog/.gitignore1
-rw-r--r--src/include/catalog/catalog.h19
-rw-r--r--src/include/catalog/catversion.h8
-rw-r--r--src/include/catalog/dependency.h36
-rwxr-xr-xsrc/include/catalog/duplicate_oids9
-rw-r--r--src/include/catalog/genbki.h7
-rw-r--r--src/include/catalog/heap.h10
-rw-r--r--src/include/catalog/index.h33
-rw-r--r--src/include/catalog/indexing.h26
-rw-r--r--src/include/catalog/namespace.h22
-rw-r--r--src/include/catalog/objectaccess.h46
-rw-r--r--src/include/catalog/objectaddress.h37
-rw-r--r--src/include/catalog/pg_aggregate.h18
-rw-r--r--src/include/catalog/pg_am.h66
-rw-r--r--src/include/catalog/pg_amop.h751
-rw-r--r--src/include/catalog/pg_amproc.h6
-rw-r--r--src/include/catalog/pg_attrdef.h12
-rw-r--r--src/include/catalog/pg_attribute.h18
-rw-r--r--src/include/catalog/pg_auth_members.h4
-rw-r--r--src/include/catalog/pg_authid.h16
-rw-r--r--src/include/catalog/pg_cast.h11
-rw-r--r--src/include/catalog/pg_class.h46
-rw-r--r--src/include/catalog/pg_collation.h76
-rw-r--r--src/include/catalog/pg_collation_fn.h23
-rw-r--r--src/include/catalog/pg_constraint.h53
-rw-r--r--src/include/catalog/pg_control.h9
-rw-r--r--src/include/catalog/pg_conversion.h4
-rw-r--r--src/include/catalog/pg_conversion_fn.h4
-rw-r--r--src/include/catalog/pg_database.h6
-rw-r--r--src/include/catalog/pg_db_role_setting.h4
-rw-r--r--src/include/catalog/pg_default_acl.h4
-rw-r--r--src/include/catalog/pg_depend.h4
-rw-r--r--src/include/catalog/pg_description.h4
-rw-r--r--src/include/catalog/pg_enum.h15
-rw-r--r--src/include/catalog/pg_extension.h74
-rw-r--r--src/include/catalog/pg_foreign_data_wrapper.h16
-rw-r--r--src/include/catalog/pg_foreign_server.h4
-rw-r--r--src/include/catalog/pg_foreign_table.h53
-rw-r--r--src/include/catalog/pg_index.h34
-rw-r--r--src/include/catalog/pg_inherits.h4
-rw-r--r--src/include/catalog/pg_inherits_fn.h4
-rw-r--r--src/include/catalog/pg_language.h4
-rw-r--r--src/include/catalog/pg_largeobject.h4
-rw-r--r--[-rwxr-xr-x]src/include/catalog/pg_largeobject_metadata.h4
-rw-r--r--src/include/catalog/pg_namespace.h4
-rw-r--r--src/include/catalog/pg_opclass.h5
-rw-r--r--src/include/catalog/pg_operator.h726
-rw-r--r--src/include/catalog/pg_opfamily.h5
-rw-r--r--src/include/catalog/pg_pltemplate.h12
-rw-r--r--src/include/catalog/pg_proc.h1282
-rw-r--r--src/include/catalog/pg_proc_fn.h4
-rw-r--r--src/include/catalog/pg_rewrite.h8
-rw-r--r--src/include/catalog/pg_seclabel.h43
-rw-r--r--src/include/catalog/pg_shdepend.h4
-rw-r--r--src/include/catalog/pg_shdescription.h4
-rw-r--r--src/include/catalog/pg_statistic.h6
-rw-r--r--src/include/catalog/pg_tablespace.h4
-rw-r--r--src/include/catalog/pg_trigger.h37
-rw-r--r--src/include/catalog/pg_ts_config.h4
-rw-r--r--src/include/catalog/pg_ts_config_map.h4
-rw-r--r--src/include/catalog/pg_ts_dict.h4
-rw-r--r--src/include/catalog/pg_ts_parser.h4
-rw-r--r--src/include/catalog/pg_ts_template.h4
-rw-r--r--src/include/catalog/pg_type.h296
-rw-r--r--src/include/catalog/pg_type_fn.h8
-rw-r--r--src/include/catalog/pg_user_mapping.h4
-rw-r--r--src/include/catalog/storage.h11
-rw-r--r--src/include/catalog/toasting.h5
-rwxr-xr-xsrc/include/catalog/unused_oids2
-rw-r--r--src/include/commands/alter.h11
-rw-r--r--src/include/commands/async.h4
-rw-r--r--src/include/commands/cluster.h8
-rw-r--r--src/include/commands/collationcmds.h28
-rw-r--r--src/include/commands/comment.h4
-rw-r--r--src/include/commands/conversioncmds.h6
-rw-r--r--src/include/commands/copy.h18
-rw-r--r--src/include/commands/dbcommands.h12
-rw-r--r--src/include/commands/defrem.h28
-rw-r--r--src/include/commands/discard.h4
-rw-r--r--src/include/commands/explain.h15
-rw-r--r--src/include/commands/extension.h49
-rw-r--r--src/include/commands/lockcmds.h4
-rw-r--r--src/include/commands/portalcmds.h4
-rw-r--r--src/include/commands/prepare.h4
-rw-r--r--src/include/commands/proclang.h3
-rw-r--r--src/include/commands/schemacmds.h4
-rw-r--r--src/include/commands/seclabel.h35
-rw-r--r--src/include/commands/sequence.h8
-rw-r--r--src/include/commands/tablecmds.h23
-rw-r--r--src/include/commands/tablespace.h8
-rw-r--r--src/include/commands/trigger.h71
-rw-r--r--src/include/commands/typecmds.h8
-rw-r--r--src/include/commands/user.h2
-rw-r--r--src/include/commands/vacuum.h33
-rw-r--r--src/include/commands/variable.h39
-rw-r--r--src/include/commands/view.h4
-rw-r--r--src/include/executor/execdebug.h4
-rw-r--r--src/include/executor/execdefs.h33
-rw-r--r--src/include/executor/execdesc.h4
-rw-r--r--src/include/executor/executor.h38
-rw-r--r--src/include/executor/functions.h13
-rw-r--r--src/include/executor/hashjoin.h11
-rw-r--r--src/include/executor/instrument.h4
-rw-r--r--src/include/executor/nodeAgg.h6
-rw-r--r--src/include/executor/nodeAppend.h6
-rw-r--r--src/include/executor/nodeBitmapAnd.h6
-rw-r--r--src/include/executor/nodeBitmapHeapscan.h6
-rw-r--r--src/include/executor/nodeBitmapIndexscan.h6
-rw-r--r--src/include/executor/nodeBitmapOr.h6
-rw-r--r--src/include/executor/nodeCtescan.h6
-rw-r--r--src/include/executor/nodeForeignscan.h24
-rw-r--r--src/include/executor/nodeFunctionscan.h6
-rw-r--r--src/include/executor/nodeGroup.h6
-rw-r--r--src/include/executor/nodeHash.h16
-rw-r--r--src/include/executor/nodeHashjoin.h6
-rw-r--r--src/include/executor/nodeIndexscan.h10
-rw-r--r--src/include/executor/nodeLimit.h6
-rw-r--r--src/include/executor/nodeLockRows.h6
-rw-r--r--src/include/executor/nodeMaterial.h6
-rw-r--r--src/include/executor/nodeMergeAppend.h24
-rw-r--r--src/include/executor/nodeMergejoin.h6
-rw-r--r--src/include/executor/nodeModifyTable.h6
-rw-r--r--src/include/executor/nodeNestloop.h6
-rw-r--r--src/include/executor/nodeRecursiveunion.h6
-rw-r--r--src/include/executor/nodeResult.h6
-rw-r--r--src/include/executor/nodeSeqscan.h6
-rw-r--r--src/include/executor/nodeSetOp.h6
-rw-r--r--src/include/executor/nodeSort.h6
-rw-r--r--src/include/executor/nodeSubplan.h4
-rw-r--r--src/include/executor/nodeSubqueryscan.h6
-rw-r--r--src/include/executor/nodeTidscan.h6
-rw-r--r--src/include/executor/nodeUnique.h6
-rw-r--r--src/include/executor/nodeValuesscan.h6
-rw-r--r--src/include/executor/nodeWindowAgg.h6
-rw-r--r--src/include/executor/nodeWorktablescan.h6
-rw-r--r--src/include/executor/spi.h4
-rw-r--r--src/include/executor/spi_priv.h4
-rw-r--r--src/include/executor/tstoreReceiver.h4
-rw-r--r--src/include/executor/tuptable.h4
-rw-r--r--src/include/fmgr.h264
-rw-r--r--src/include/foreign/fdwapi.h98
-rw-r--r--src/include/foreign/foreign.h20
-rw-r--r--src/include/funcapi.h4
-rw-r--r--src/include/getaddrinfo.h4
-rw-r--r--src/include/getopt_long.h5
-rw-r--r--src/include/lib/dllist.h4
-rw-r--r--src/include/lib/stringinfo.h6
-rw-r--r--src/include/libpq/auth.h8
-rw-r--r--src/include/libpq/be-fsstubs.h4
-rw-r--r--src/include/libpq/crypt.h4
-rw-r--r--src/include/libpq/hba.h9
-rw-r--r--src/include/libpq/ip.h4
-rw-r--r--src/include/libpq/libpq-be.h12
-rw-r--r--src/include/libpq/libpq-fs.h4
-rw-r--r--src/include/libpq/libpq.h7
-rw-r--r--src/include/libpq/md5.h4
-rw-r--r--src/include/libpq/pqcomm.h4
-rw-r--r--src/include/libpq/pqformat.h4
-rw-r--r--src/include/libpq/pqsignal.h4
-rw-r--r--src/include/mb/pg_wchar.h17
-rw-r--r--src/include/miscadmin.h34
-rw-r--r--src/include/nodes/bitmapset.h4
-rw-r--r--src/include/nodes/execnodes.h163
-rw-r--r--src/include/nodes/makefuncs.h21
-rw-r--r--src/include/nodes/memnodes.h5
-rw-r--r--src/include/nodes/nodeFuncs.h14
-rw-r--r--src/include/nodes/nodes.h26
-rw-r--r--src/include/nodes/params.h6
-rw-r--r--src/include/nodes/parsenodes.h194
-rw-r--r--src/include/nodes/pg_list.h13
-rw-r--r--src/include/nodes/plannodes.h110
-rw-r--r--src/include/nodes/primnodes.h90
-rw-r--r--src/include/nodes/print.h4
-rw-r--r--src/include/nodes/readfuncs.h4
-rw-r--r--src/include/nodes/relation.h178
-rw-r--r--src/include/nodes/tidbitmap.h4
-rw-r--r--src/include/nodes/value.h4
-rw-r--r--src/include/optimizer/clauses.h17
-rw-r--r--src/include/optimizer/cost.h20
-rw-r--r--src/include/optimizer/geqo.h4
-rw-r--r--src/include/optimizer/geqo_copy.h4
-rw-r--r--src/include/optimizer/geqo_gene.h4
-rw-r--r--src/include/optimizer/geqo_misc.h4
-rw-r--r--src/include/optimizer/geqo_mutation.h4
-rw-r--r--src/include/optimizer/geqo_pool.h4
-rw-r--r--src/include/optimizer/geqo_random.h4
-rw-r--r--src/include/optimizer/geqo_recombination.h4
-rw-r--r--src/include/optimizer/geqo_selection.h4
-rw-r--r--src/include/optimizer/joininfo.h7
-rw-r--r--src/include/optimizer/pathnode.h10
-rw-r--r--src/include/optimizer/paths.h35
-rw-r--r--src/include/optimizer/placeholder.h9
-rw-r--r--src/include/optimizer/plancat.h6
-rw-r--r--src/include/optimizer/planmain.h20
-rw-r--r--src/include/optimizer/planner.h6
-rw-r--r--src/include/optimizer/predtest.h4
-rw-r--r--src/include/optimizer/prep.h6
-rw-r--r--src/include/optimizer/restrictinfo.h4
-rw-r--r--src/include/optimizer/subselect.h7
-rw-r--r--src/include/optimizer/tlist.h5
-rw-r--r--src/include/optimizer/var.h4
-rw-r--r--src/include/parser/.gitignore1
-rw-r--r--src/include/parser/analyze.h4
-rw-r--r--src/include/parser/gramparse.h4
-rw-r--r--src/include/parser/keywords.h4
-rw-r--r--src/include/parser/kwlist.h28
-rw-r--r--src/include/parser/parse_agg.h5
-rw-r--r--src/include/parser/parse_clause.h4
-rw-r--r--src/include/parser/parse_coerce.h4
-rw-r--r--src/include/parser/parse_collate.h27
-rw-r--r--src/include/parser/parse_cte.h4
-rw-r--r--src/include/parser/parse_expr.h4
-rw-r--r--src/include/parser/parse_func.h6
-rw-r--r--src/include/parser/parse_node.h9
-rw-r--r--src/include/parser/parse_oper.h7
-rw-r--r--src/include/parser/parse_param.h4
-rw-r--r--src/include/parser/parse_relation.h8
-rw-r--r--src/include/parser/parse_target.h4
-rw-r--r--src/include/parser/parse_type.h15
-rw-r--r--src/include/parser/parse_utilcmd.h4
-rw-r--r--src/include/parser/parser.h6
-rw-r--r--src/include/parser/parsetree.h6
-rw-r--r--src/include/parser/scanner.h4
-rw-r--r--src/include/parser/scansup.h4
-rw-r--r--src/include/pg_config.h.in24
-rw-r--r--src/include/pg_config.h.win3235
-rw-r--r--src/include/pg_config_manual.h21
-rw-r--r--src/include/pg_trace.h4
-rw-r--r--src/include/pgstat.h63
-rw-r--r--src/include/pgtime.h4
-rw-r--r--src/include/port.h63
-rw-r--r--src/include/port/aix.h2
-rw-r--r--src/include/port/bsdi.h2
-rw-r--r--src/include/port/cygwin.h4
-rw-r--r--src/include/port/darwin.h2
-rw-r--r--src/include/port/dgux.h2
-rw-r--r--src/include/port/freebsd.h2
-rw-r--r--src/include/port/hpux.h2
-rw-r--r--src/include/port/irix.h2
-rw-r--r--src/include/port/linux.h10
-rw-r--r--src/include/port/netbsd.h2
-rw-r--r--src/include/port/nextstep.h2
-rw-r--r--src/include/port/openbsd.h2
-rw-r--r--src/include/port/osf.h2
-rw-r--r--src/include/port/sco.h2
-rw-r--r--src/include/port/solaris.h2
-rw-r--r--src/include/port/sunos4.h2
-rw-r--r--src/include/port/svr4.h2
-rw-r--r--src/include/port/ultrix4.h2
-rw-r--r--src/include/port/univel.h2
-rw-r--r--src/include/port/unixware.h2
-rw-r--r--src/include/port/win32.h106
-rw-r--r--src/include/port/win32/arpa/inet.h2
-rw-r--r--src/include/port/win32/dlfcn.h2
-rw-r--r--src/include/port/win32/grp.h2
-rw-r--r--src/include/port/win32/netdb.h2
-rw-r--r--src/include/port/win32/netinet/in.h2
-rw-r--r--src/include/port/win32/pwd.h2
-rw-r--r--src/include/port/win32/sys/socket.h3
-rw-r--r--src/include/port/win32/sys/wait.h2
-rw-r--r--src/include/port/win32_msvc/dirent.h2
-rw-r--r--src/include/port/win32_msvc/sys/file.h2
-rw-r--r--src/include/port/win32_msvc/sys/param.h2
-rw-r--r--src/include/port/win32_msvc/sys/time.h2
-rw-r--r--src/include/port/win32_msvc/unistd.h2
-rw-r--r--src/include/port/win32_msvc/utime.h2
-rw-r--r--src/include/portability/instr_time.h4
-rw-r--r--src/include/postgres.h4
-rw-r--r--src/include/postgres_ext.h2
-rw-r--r--src/include/postgres_fe.h4
-rw-r--r--src/include/postmaster/autovacuum.h4
-rw-r--r--src/include/postmaster/bgwriter.h6
-rw-r--r--src/include/postmaster/fork_process.h4
-rw-r--r--src/include/postmaster/pgarch.h4
-rw-r--r--src/include/postmaster/postmaster.h5
-rw-r--r--src/include/postmaster/syslogger.h5
-rw-r--r--src/include/postmaster/walwriter.h4
-rw-r--r--src/include/regex/regcustom.h2
-rw-r--r--src/include/regex/regerrs.h2
-rw-r--r--src/include/regex/regex.h6
-rw-r--r--src/include/regex/regguts.h2
-rw-r--r--src/include/replication/basebackup.h19
-rw-r--r--src/include/replication/replnodes.h62
-rw-r--r--src/include/replication/syncrep.h50
-rw-r--r--src/include/replication/walprotocol.h45
-rw-r--r--src/include/replication/walreceiver.h27
-rw-r--r--src/include/replication/walsender.h82
-rw-r--r--src/include/rewrite/prs2lock.h4
-rw-r--r--src/include/rewrite/rewriteDefine.h4
-rw-r--r--src/include/rewrite/rewriteHandler.h4
-rw-r--r--src/include/rewrite/rewriteManip.h4
-rw-r--r--src/include/rewrite/rewriteRemove.h4
-rw-r--r--src/include/rewrite/rewriteSupport.h7
-rw-r--r--src/include/rusagestub.h4
-rw-r--r--src/include/snowball/header.h4
-rw-r--r--src/include/storage/backendid.h6
-rw-r--r--src/include/storage/block.h4
-rw-r--r--src/include/storage/buf.h4
-rw-r--r--src/include/storage/buf_internals.h6
-rw-r--r--src/include/storage/buffile.h4
-rw-r--r--src/include/storage/bufmgr.h16
-rw-r--r--src/include/storage/bufpage.h4
-rw-r--r--src/include/storage/copydir.h19
-rw-r--r--src/include/storage/fd.h5
-rw-r--r--src/include/storage/freespace.h4
-rw-r--r--src/include/storage/fsm_internals.h4
-rw-r--r--src/include/storage/indexfsm.h4
-rw-r--r--src/include/storage/ipc.h4
-rw-r--r--src/include/storage/item.h4
-rw-r--r--src/include/storage/itemid.h4
-rw-r--r--src/include/storage/itemptr.h4
-rw-r--r--src/include/storage/large_object.h4
-rw-r--r--src/include/storage/latch.h59
-rw-r--r--src/include/storage/lmgr.h4
-rw-r--r--src/include/storage/lock.h5
-rw-r--r--src/include/storage/lwlock.h16
-rw-r--r--src/include/storage/off.h4
-rw-r--r--src/include/storage/pg_sema.h4
-rw-r--r--src/include/storage/pg_shmem.h4
-rw-r--r--src/include/storage/pmsignal.h7
-rw-r--r--src/include/storage/pos.h4
-rw-r--r--src/include/storage/predicate.h70
-rw-r--r--src/include/storage/predicate_internals.h473
-rw-r--r--src/include/storage/proc.h18
-rw-r--r--src/include/storage/procarray.h7
-rw-r--r--src/include/storage/procsignal.h4
-rw-r--r--src/include/storage/reinit.h23
-rw-r--r--src/include/storage/relfilenode.h38
-rw-r--r--src/include/storage/s_lock.h6
-rw-r--r--src/include/storage/shmem.h16
-rw-r--r--src/include/storage/sinval.h22
-rw-r--r--src/include/storage/sinvaladt.h4
-rw-r--r--src/include/storage/smgr.h36
-rw-r--r--src/include/storage/spin.h4
-rw-r--r--src/include/storage/standby.h5
-rw-r--r--src/include/tcop/dest.h4
-rw-r--r--src/include/tcop/fastpath.h4
-rw-r--r--src/include/tcop/pquery.h4
-rw-r--r--src/include/tcop/tcopdebug.h4
-rw-r--r--src/include/tcop/tcopprot.h9
-rw-r--r--src/include/tcop/utility.h4
-rw-r--r--src/include/tsearch/dicts/regis.h4
-rw-r--r--src/include/tsearch/dicts/spell.h30
-rw-r--r--src/include/tsearch/ts_cache.h7
-rw-r--r--src/include/tsearch/ts_locale.h4
-rw-r--r--src/include/tsearch/ts_public.h4
-rw-r--r--src/include/tsearch/ts_type.h4
-rw-r--r--src/include/tsearch/ts_utils.h9
-rw-r--r--src/include/utils/.gitignore3
-rw-r--r--src/include/utils/acl.h11
-rw-r--r--src/include/utils/array.h14
-rw-r--r--src/include/utils/ascii.h4
-rw-r--r--src/include/utils/attoptcache.h4
-rw-r--r--src/include/utils/builtins.h71
-rw-r--r--src/include/utils/bytea.h6
-rw-r--r--src/include/utils/cash.h9
-rw-r--r--src/include/utils/catcache.h4
-rw-r--r--src/include/utils/combocid.h4
-rw-r--r--src/include/utils/date.h6
-rw-r--r--src/include/utils/datetime.h20
-rw-r--r--src/include/utils/datum.h4
-rw-r--r--src/include/utils/dynahash.h4
-rw-r--r--src/include/utils/dynamic_loader.h4
-rw-r--r--src/include/utils/elog.h40
-rw-r--r--src/include/utils/errcodes.h356
-rw-r--r--src/include/utils/fmgrtab.h4
-rw-r--r--src/include/utils/formatting.h11
-rw-r--r--src/include/utils/geo_decls.h8
-rw-r--r--src/include/utils/guc.h102
-rw-r--r--src/include/utils/guc_tables.h39
-rw-r--r--src/include/utils/help_config.h4
-rw-r--r--src/include/utils/hsearch.h5
-rw-r--r--src/include/utils/inet.h4
-rw-r--r--src/include/utils/int8.h4
-rw-r--r--src/include/utils/inval.h6
-rw-r--r--src/include/utils/logtape.h4
-rw-r--r--src/include/utils/lsyscache.h23
-rw-r--r--src/include/utils/memutils.h8
-rw-r--r--src/include/utils/nabstime.h4
-rw-r--r--src/include/utils/numeric.h44
-rw-r--r--src/include/utils/palloc.h4
-rw-r--r--src/include/utils/pg_crc.h4
-rw-r--r--src/include/utils/pg_locale.h51
-rw-r--r--src/include/utils/pg_lzcompress.h2
-rw-r--r--src/include/utils/pg_rusage.h4
-rw-r--r--src/include/utils/plancache.h4
-rw-r--r--src/include/utils/portal.h17
-rw-r--r--src/include/utils/ps_status.h2
-rw-r--r--src/include/utils/rbtree.h68
-rw-r--r--src/include/utils/rel.h87
-rw-r--r--src/include/utils/relcache.h7
-rw-r--r--src/include/utils/relmapper.h4
-rw-r--r--src/include/utils/resowner.h4
-rw-r--r--src/include/utils/selfuncs.h8
-rw-r--r--src/include/utils/snapmgr.h7
-rw-r--r--src/include/utils/snapshot.h4
-rw-r--r--src/include/utils/spccache.h4
-rw-r--r--src/include/utils/syscache.h7
-rw-r--r--src/include/utils/timestamp.h4
-rw-r--r--src/include/utils/tqual.h4
-rw-r--r--src/include/utils/tuplesort.h54
-rw-r--r--src/include/utils/tuplestore.h4
-rw-r--r--src/include/utils/typcache.h45
-rw-r--r--src/include/utils/tzparser.h8
-rw-r--r--src/include/utils/uuid.h4
-rw-r--r--src/include/utils/varbit.h10
-rw-r--r--src/include/utils/xml.h13
-rw-r--r--src/include/windowapi.h4
-rw-r--r--src/interfaces/Makefile8
-rw-r--r--src/interfaces/ecpg/Makefile19
-rw-r--r--src/interfaces/ecpg/README.dynSQL5
-rw-r--r--src/interfaces/ecpg/compatlib/.cvsignore3
-rw-r--r--src/interfaces/ecpg/compatlib/.gitignore5
-rw-r--r--src/interfaces/ecpg/compatlib/Makefile15
-rw-r--r--src/interfaces/ecpg/compatlib/exports.txt2
-rw-r--r--src/interfaces/ecpg/compatlib/informix.c15
-rw-r--r--src/interfaces/ecpg/ecpglib/.cvsignore3
-rw-r--r--src/interfaces/ecpg/ecpglib/.gitignore10
-rw-r--r--src/interfaces/ecpg/ecpglib/Makefile15
-rw-r--r--src/interfaces/ecpg/ecpglib/connect.c15
-rw-r--r--src/interfaces/ecpg/ecpglib/data.c2
-rw-r--r--src/interfaces/ecpg/ecpglib/descriptor.c25
-rw-r--r--src/interfaces/ecpg/ecpglib/error.c2
-rw-r--r--src/interfaces/ecpg/ecpglib/execute.c5
-rw-r--r--src/interfaces/ecpg/ecpglib/exports.txt2
-rw-r--r--src/interfaces/ecpg/ecpglib/extern.h5
-rw-r--r--src/interfaces/ecpg/ecpglib/memory.c3
-rw-r--r--src/interfaces/ecpg/ecpglib/misc.c9
-rw-r--r--src/interfaces/ecpg/ecpglib/nls.mk2
-rw-r--r--src/interfaces/ecpg/ecpglib/pg_type.h4
-rw-r--r--src/interfaces/ecpg/ecpglib/po/fr.po2
-rw-r--r--src/interfaces/ecpg/ecpglib/po/ja.po128
-rw-r--r--src/interfaces/ecpg/ecpglib/po/tr.po120
-rw-r--r--src/interfaces/ecpg/ecpglib/po/zh_CN.po22
-rw-r--r--src/interfaces/ecpg/ecpglib/prepare.c12
-rw-r--r--src/interfaces/ecpg/ecpglib/sqlda.c7
-rw-r--r--src/interfaces/ecpg/ecpglib/typename.c2
-rw-r--r--src/interfaces/ecpg/include/.gitignore2
-rw-r--r--src/interfaces/ecpg/include/datetime.h2
-rw-r--r--src/interfaces/ecpg/include/decimal.h2
-rw-r--r--src/interfaces/ecpg/include/ecpg-pthread-win32.h2
-rw-r--r--src/interfaces/ecpg/include/ecpg_config.h.in1
-rw-r--r--src/interfaces/ecpg/include/ecpg_informix.h2
-rw-r--r--src/interfaces/ecpg/include/ecpgerrno.h2
-rw-r--r--src/interfaces/ecpg/include/ecpglib.h4
-rw-r--r--src/interfaces/ecpg/include/ecpgtype.h2
-rw-r--r--src/interfaces/ecpg/include/pgtypes_date.h2
-rw-r--r--src/interfaces/ecpg/include/pgtypes_error.h2
-rw-r--r--src/interfaces/ecpg/include/pgtypes_interval.h2
-rw-r--r--src/interfaces/ecpg/include/pgtypes_timestamp.h2
-rw-r--r--src/interfaces/ecpg/include/sqlda-native.h2
-rw-r--r--src/interfaces/ecpg/pgtypeslib/.cvsignore3
-rw-r--r--src/interfaces/ecpg/pgtypeslib/.gitignore7
-rw-r--r--src/interfaces/ecpg/pgtypeslib/Makefile6
-rw-r--r--src/interfaces/ecpg/pgtypeslib/common.c2
-rw-r--r--src/interfaces/ecpg/pgtypeslib/datetime.c2
-rw-r--r--src/interfaces/ecpg/pgtypeslib/dt.h2
-rw-r--r--src/interfaces/ecpg/pgtypeslib/dt_common.c33
-rw-r--r--src/interfaces/ecpg/pgtypeslib/exports.txt2
-rw-r--r--src/interfaces/ecpg/pgtypeslib/extern.h2
-rw-r--r--src/interfaces/ecpg/pgtypeslib/interval.c4
-rw-r--r--src/interfaces/ecpg/pgtypeslib/numeric.c9
-rw-r--r--src/interfaces/ecpg/pgtypeslib/timestamp.c44
-rw-r--r--src/interfaces/ecpg/preproc/.cvsignore4
-rw-r--r--src/interfaces/ecpg/preproc/.gitignore7
-rw-r--r--src/interfaces/ecpg/preproc/Makefile17
-rw-r--r--src/interfaces/ecpg/preproc/README.parser5
-rw-r--r--src/interfaces/ecpg/preproc/c_keywords.c2
-rw-r--r--[-rwxr-xr-x]src/interfaces/ecpg/preproc/check_rules.pl205
-rw-r--r--src/interfaces/ecpg/preproc/descriptor.c6
-rw-r--r--src/interfaces/ecpg/preproc/ecpg.addons108
-rw-r--r--src/interfaces/ecpg/preproc/ecpg.c12
-rw-r--r--src/interfaces/ecpg/preproc/ecpg.header46
-rw-r--r--src/interfaces/ecpg/preproc/ecpg.tokens7
-rw-r--r--src/interfaces/ecpg/preproc/ecpg.trailer597
-rw-r--r--src/interfaces/ecpg/preproc/ecpg.type5
-rw-r--r--src/interfaces/ecpg/preproc/ecpg_keywords.c2
-rw-r--r--src/interfaces/ecpg/preproc/extern.h17
-rw-r--r--src/interfaces/ecpg/preproc/keywords.c4
-rw-r--r--src/interfaces/ecpg/preproc/nls.mk4
-rw-r--r--src/interfaces/ecpg/preproc/output.c2
-rw-r--r--src/interfaces/ecpg/preproc/parse.pl24
-rw-r--r--src/interfaces/ecpg/preproc/parse2.pl648
-rw-r--r--src/interfaces/ecpg/preproc/parser.c4
-rw-r--r--src/interfaces/ecpg/preproc/pgc.l50
-rw-r--r--src/interfaces/ecpg/preproc/po/es.po264
-rw-r--r--src/interfaces/ecpg/preproc/po/ja.po262
-rw-r--r--src/interfaces/ecpg/preproc/po/ko.po592
-rw-r--r--src/interfaces/ecpg/preproc/po/pt_BR.po149
-rw-r--r--src/interfaces/ecpg/preproc/po/tr.po337
-rw-r--r--src/interfaces/ecpg/preproc/po/zh_CN.po342
-rw-r--r--src/interfaces/ecpg/preproc/po/zh_TW.po646
-rw-r--r--src/interfaces/ecpg/preproc/type.c22
-rw-r--r--src/interfaces/ecpg/preproc/type.h2
-rw-r--r--src/interfaces/ecpg/preproc/variable.c22
-rw-r--r--src/interfaces/ecpg/test/.gitignore5
-rw-r--r--src/interfaces/ecpg/test/Makefile22
-rw-r--r--src/interfaces/ecpg/test/Makefile.regress7
-rw-r--r--src/interfaces/ecpg/test/compat_informix/.gitignore18
-rw-r--r--src/interfaces/ecpg/test/compat_informix/dec_test.pgc4
-rw-r--r--src/interfaces/ecpg/test/compat_informix/describe.pgc2
-rw-r--r--src/interfaces/ecpg/test/compat_informix/sqlda.pgc2
-rw-r--r--src/interfaces/ecpg/test/compat_informix/test_informix.pgc3
-rw-r--r--src/interfaces/ecpg/test/compat_informix/test_informix2.pgc6
-rw-r--r--src/interfaces/ecpg/test/connect/.gitignore10
-rw-r--r--src/interfaces/ecpg/test/connect/Makefile1
-rw-r--r--src/interfaces/ecpg/test/connect/README2
-rw-r--r--src/interfaces/ecpg/test/ecpg_schedule_tcp1
-rw-r--r--[-rwxr-xr-x]src/interfaces/ecpg/test/expected/compat_informix-dec_test-MinGW32.stdout0
-rw-r--r--src/interfaces/ecpg/test/expected/compat_informix-dec_test.c4
-rw-r--r--src/interfaces/ecpg/test/expected/compat_informix-describe.c2
-rw-r--r--src/interfaces/ecpg/test/expected/compat_informix-sqlda.c2
-rw-r--r--src/interfaces/ecpg/test/expected/compat_informix-test_informix.c3
-rw-r--r--src/interfaces/ecpg/test/expected/compat_informix-test_informix2.c4
-rw-r--r--src/interfaces/ecpg/test/expected/pgtypeslib-dt_test.c22
-rw-r--r--src/interfaces/ecpg/test/expected/pgtypeslib-dt_test.stderr2
-rw-r--r--src/interfaces/ecpg/test/expected/pgtypeslib-dt_test2.c3
-rw-r--r--src/interfaces/ecpg/test/expected/pgtypeslib-nan_test.c10
-rw-r--r--src/interfaces/ecpg/test/expected/pgtypeslib-nan_test.stderr2
-rw-r--r--[-rwxr-xr-x]src/interfaces/ecpg/test/expected/pgtypeslib-num_test-MinGW32.stdout0
-rw-r--r--src/interfaces/ecpg/test/expected/pgtypeslib-num_test.c12
-rw-r--r--src/interfaces/ecpg/test/expected/pgtypeslib-num_test.stderr2
-rw-r--r--[-rwxr-xr-x]src/interfaces/ecpg/test/expected/pgtypeslib-num_test2-MinGW32.stdout0
-rw-r--r--src/interfaces/ecpg/test/expected/pgtypeslib-num_test2.c7
-rw-r--r--src/interfaces/ecpg/test/expected/preproc-array_of_struct.c4
-rw-r--r--src/interfaces/ecpg/test/expected/preproc-comment.c1
-rw-r--r--src/interfaces/ecpg/test/expected/preproc-cursor.c2
-rw-r--r--src/interfaces/ecpg/test/expected/preproc-init.c4
-rw-r--r--src/interfaces/ecpg/test/expected/preproc-outofscope.c27
-rw-r--r--src/interfaces/ecpg/test/expected/preproc-variable.c2
-rw-r--r--src/interfaces/ecpg/test/expected/preproc-whenever.c2
-rw-r--r--src/interfaces/ecpg/test/expected/sql-array.c6
-rw-r--r--src/interfaces/ecpg/test/expected/sql-code100.c10
-rw-r--r--src/interfaces/ecpg/test/expected/sql-describe.c2
-rw-r--r--src/interfaces/ecpg/test/expected/sql-dynalloc.c16
-rw-r--r--src/interfaces/ecpg/test/expected/sql-dynalloc2.c4
-rw-r--r--src/interfaces/ecpg/test/expected/sql-fetch.c2
-rw-r--r--src/interfaces/ecpg/test/expected/sql-fetch.stderr6
-rw-r--r--src/interfaces/ecpg/test/expected/sql-quote.c101
-rw-r--r--src/interfaces/ecpg/test/expected/sql-quote.stderr112
-rw-r--r--src/interfaces/ecpg/test/expected/sql-quote.stdout1
-rw-r--r--src/interfaces/ecpg/test/expected/sql-sqlda.c2
-rw-r--r--src/interfaces/ecpg/test/expected/thread-alloc.c1
-rw-r--r--src/interfaces/ecpg/test/expected/thread-prep.c1
-rw-r--r--src/interfaces/ecpg/test/pg_regress_ecpg.c4
-rw-r--r--src/interfaces/ecpg/test/pgtypeslib/.gitignore10
-rw-r--r--src/interfaces/ecpg/test/pgtypeslib/Makefile1
-rw-r--r--src/interfaces/ecpg/test/pgtypeslib/dt_test.pgc14
-rw-r--r--src/interfaces/ecpg/test/pgtypeslib/dt_test2.pgc3
-rw-r--r--src/interfaces/ecpg/test/pgtypeslib/nan_test.pgc2
-rw-r--r--src/interfaces/ecpg/test/pgtypeslib/num_test.pgc4
-rw-r--r--src/interfaces/ecpg/test/pgtypeslib/num_test2.pgc7
-rw-r--r--src/interfaces/ecpg/test/preproc/.gitignore22
-rw-r--r--src/interfaces/ecpg/test/preproc/array_of_struct.pgc2
-rw-r--r--src/interfaces/ecpg/test/preproc/comment.pgc1
-rw-r--r--src/interfaces/ecpg/test/preproc/cursor.pgc2
-rw-r--r--src/interfaces/ecpg/test/preproc/init.pgc4
-rw-r--r--src/interfaces/ecpg/test/preproc/outofscope.pgc2
-rw-r--r--src/interfaces/ecpg/test/preproc/struct.h1
-rw-r--r--src/interfaces/ecpg/test/preproc/variable.pgc2
-rw-r--r--src/interfaces/ecpg/test/preproc/whenever.pgc2
-rw-r--r--src/interfaces/ecpg/test/resultmap3
-rw-r--r--src/interfaces/ecpg/test/sql/.gitignore40
-rw-r--r--src/interfaces/ecpg/test/sql/Makefile3
-rw-r--r--src/interfaces/ecpg/test/sql/array.pgc6
-rw-r--r--src/interfaces/ecpg/test/sql/code100.pgc12
-rw-r--r--src/interfaces/ecpg/test/sql/describe.pgc2
-rw-r--r--src/interfaces/ecpg/test/sql/dynalloc.pgc16
-rw-r--r--src/interfaces/ecpg/test/sql/dynalloc2.pgc4
-rw-r--r--src/interfaces/ecpg/test/sql/fetch.pgc2
-rw-r--r--src/interfaces/ecpg/test/sql/quote.pgc5
-rw-r--r--src/interfaces/ecpg/test/sql/sqlda.pgc2
-rw-r--r--src/interfaces/ecpg/test/thread/.gitignore10
-rw-r--r--src/interfaces/ecpg/test/thread/Makefile1
-rw-r--r--src/interfaces/ecpg/test/thread/alloc.pgc1
-rw-r--r--src/interfaces/ecpg/test/thread/prep.pgc1
-rw-r--r--src/interfaces/libpq/.cvsignore4
-rw-r--r--src/interfaces/libpq/.gitignore22
-rw-r--r--src/interfaces/libpq/Makefile47
-rw-r--r--src/interfaces/libpq/README2
-rw-r--r--src/interfaces/libpq/bcc32.mak8
-rw-r--r--src/interfaces/libpq/exports.txt5
-rw-r--r--src/interfaces/libpq/fe-auth.c89
-rw-r--r--src/interfaces/libpq/fe-auth.h4
-rw-r--r--src/interfaces/libpq/fe-connect.c333
-rw-r--r--src/interfaces/libpq/fe-exec.c43
-rw-r--r--src/interfaces/libpq/fe-lobj.c4
-rw-r--r--src/interfaces/libpq/fe-misc.c12
-rw-r--r--src/interfaces/libpq/fe-print.c4
-rw-r--r--src/interfaces/libpq/fe-protocol2.c69
-rw-r--r--src/interfaces/libpq/fe-protocol3.c35
-rw-r--r--src/interfaces/libpq/fe-secure.c72
-rw-r--r--src/interfaces/libpq/libpq-events.c4
-rw-r--r--src/interfaces/libpq/libpq-events.h4
-rw-r--r--src/interfaces/libpq/libpq-fe.h21
-rw-r--r--src/interfaces/libpq/libpq-int.h23
-rw-r--r--src/interfaces/libpq/libpq.rc.in10
-rw-r--r--src/interfaces/libpq/nls.mk4
-rw-r--r--src/interfaces/libpq/pg_service.conf.sample4
-rw-r--r--src/interfaces/libpq/po/cs.po2
-rw-r--r--src/interfaces/libpq/po/de.po360
-rw-r--r--src/interfaces/libpq/po/es.po350
-rw-r--r--src/interfaces/libpq/po/fr.po284
-rw-r--r--src/interfaces/libpq/po/ja.po324
-rw-r--r--src/interfaces/libpq/po/ko.po437
-rw-r--r--src/interfaces/libpq/po/pt_BR.po222
-rw-r--r--src/interfaces/libpq/po/ru.po947
-rw-r--r--src/interfaces/libpq/po/sv.po218
-rw-r--r--src/interfaces/libpq/po/tr.po416
-rw-r--r--src/interfaces/libpq/po/zh_CN.po726
-rw-r--r--src/interfaces/libpq/po/zh_TW.po1075
-rw-r--r--src/interfaces/libpq/pqexpbuffer.c4
-rw-r--r--src/interfaces/libpq/pqexpbuffer.h8
-rw-r--r--src/interfaces/libpq/pqsignal.c9
-rw-r--r--src/interfaces/libpq/pqsignal.h6
-rw-r--r--src/interfaces/libpq/pthread-win32.c4
-rw-r--r--src/interfaces/libpq/win32.c4
-rw-r--r--src/interfaces/libpq/win32.h2
-rw-r--r--src/interfaces/libpq/win32.mak10
-rw-r--r--src/makefiles/Makefile2
-rw-r--r--src/makefiles/Makefile.cygwin2
-rw-r--r--src/makefiles/Makefile.darwin2
-rw-r--r--src/makefiles/Makefile.irix2
-rw-r--r--src/makefiles/Makefile.solaris2
-rw-r--r--src/makefiles/Makefile.win322
-rw-r--r--src/makefiles/pgxs.mk86
-rw-r--r--src/nls-global.mk14
-rw-r--r--src/pl/Makefile12
-rw-r--r--src/pl/plperl/.gitignore9
-rw-r--r--src/pl/plperl/GNUmakefile36
-rw-r--r--src/pl/plperl/README2
-rw-r--r--src/pl/plperl/SPI.xs72
-rw-r--r--src/pl/plperl/Util.xs113
-rw-r--r--src/pl/plperl/expected/plperl.out124
-rw-r--r--src/pl/plperl/expected/plperl_array.out222
-rw-r--r--src/pl/plperl/expected/plperl_plperlu.out2
-rw-r--r--src/pl/plperl/expected/plperl_shared.out19
-rw-r--r--src/pl/plperl/expected/plperl_trigger.out249
-rw-r--r--src/pl/plperl/expected/plperl_util.out18
-rw-r--r--src/pl/plperl/nls.mk4
-rw-r--r--src/pl/plperl/plc_perlboot.pl92
-rw-r--r--src/pl/plperl/plc_trusted.pl6
-rw-r--r--src/pl/plperl/plperl--1.0.sql9
-rw-r--r--src/pl/plperl/plperl--unpackaged--1.0.sql7
-rw-r--r--src/pl/plperl/plperl.c1528
-rw-r--r--src/pl/plperl/plperl.control7
-rw-r--r--src/pl/plperl/plperl.h53
-rw-r--r--src/pl/plperl/plperl_helpers.h71
-rw-r--r--src/pl/plperl/plperlu--1.0.sql9
-rw-r--r--src/pl/plperl/plperlu--unpackaged--1.0.sql7
-rw-r--r--src/pl/plperl/plperlu.control7
-rw-r--r--src/pl/plperl/po/de.po91
-rw-r--r--src/pl/plperl/po/es.po84
-rw-r--r--src/pl/plperl/po/fr.po2
-rw-r--r--src/pl/plperl/po/ja.po130
-rw-r--r--src/pl/plperl/po/pt_BR.po126
-rw-r--r--src/pl/plperl/po/ro.po162
-rw-r--r--src/pl/plperl/po/tr.po129
-rw-r--r--src/pl/plperl/po/zh_CN.po157
-rw-r--r--src/pl/plperl/po/zh_TW.po176
-rw-r--r--src/pl/plperl/sql/plperl.sql54
-rw-r--r--src/pl/plperl/sql/plperl_array.sql164
-rw-r--r--src/pl/plperl/sql/plperl_plperlu.sql4
-rw-r--r--src/pl/plperl/sql/plperl_shared.sql11
-rw-r--r--src/pl/plperl/sql/plperl_trigger.sql144
-rw-r--r--src/pl/plperl/sql/plperl_util.sql12
-rw-r--r--src/pl/plperl/text2macro.pl5
-rw-r--r--src/pl/plpgsql/Makefile2
-rw-r--r--src/pl/plpgsql/src/.cvsignore2
-rw-r--r--src/pl/plpgsql/src/.gitignore3
-rw-r--r--src/pl/plpgsql/src/Makefile31
-rw-r--r--src/pl/plpgsql/src/generate-plerrcodes.pl40
-rw-r--r--src/pl/plpgsql/src/gram.y194
-rw-r--r--src/pl/plpgsql/src/nls.mk4
-rw-r--r--src/pl/plpgsql/src/pl_comp.c193
-rw-r--r--src/pl/plpgsql/src/pl_exec.c534
-rw-r--r--src/pl/plpgsql/src/pl_funcs.c425
-rw-r--r--src/pl/plpgsql/src/pl_handler.c16
-rw-r--r--src/pl/plpgsql/src/pl_scanner.c21
-rw-r--r--src/pl/plpgsql/src/plerrcodes.h784
-rw-r--r--src/pl/plpgsql/src/plpgsql--1.0.sql9
-rw-r--r--src/pl/plpgsql/src/plpgsql--unpackaged--1.0.sql7
-rw-r--r--src/pl/plpgsql/src/plpgsql.control7
-rw-r--r--src/pl/plpgsql/src/plpgsql.h49
-rw-r--r--src/pl/plpgsql/src/po/de.po375
-rw-r--r--src/pl/plpgsql/src/po/es.po327
-rw-r--r--src/pl/plpgsql/src/po/fr.po2
-rw-r--r--src/pl/plpgsql/src/po/ja.po485
-rw-r--r--src/pl/plpgsql/src/po/ko.po723
-rw-r--r--src/pl/plpgsql/src/po/pt_BR.po669
-rw-r--r--src/pl/plpgsql/src/po/ro.po703
-rw-r--r--src/pl/plpgsql/src/po/zh_CN.po743
-rw-r--r--src/pl/plpgsql/src/po/zh_TW.po789
-rw-r--r--src/pl/plpython/.gitignore5
-rw-r--r--src/pl/plpython/Makefile66
-rw-r--r--src/pl/plpython/expected/README16
-rw-r--r--src/pl/plpython/expected/plpython_composite.out361
-rw-r--r--src/pl/plpython/expected/plpython_do.out7
-rw-r--r--src/pl/plpython/expected/plpython_drop.out3
-rw-r--r--src/pl/plpython/expected/plpython_error.out300
-rw-r--r--src/pl/plpython/expected/plpython_error_0.out381
-rw-r--r--src/pl/plpython/expected/plpython_newline.out2
-rw-r--r--src/pl/plpython/expected/plpython_quote.out56
-rw-r--r--src/pl/plpython/expected/plpython_record.out10
-rw-r--r--src/pl/plpython/expected/plpython_schema.out2
-rw-r--r--src/pl/plpython/expected/plpython_setof.out17
-rw-r--r--src/pl/plpython/expected/plpython_spi.out1
-rw-r--r--src/pl/plpython/expected/plpython_subtransaction.out411
-rw-r--r--src/pl/plpython/expected/plpython_subtransaction_0.out384
-rw-r--r--src/pl/plpython/expected/plpython_subtransaction_5.out384
-rw-r--r--src/pl/plpython/expected/plpython_test.out22
-rw-r--r--src/pl/plpython/expected/plpython_trigger.out124
-rw-r--r--src/pl/plpython/expected/plpython_types.out63
-rw-r--r--src/pl/plpython/expected/plpython_types_3.out63
-rw-r--r--src/pl/plpython/expected/plpython_unicode.out1
-rw-r--r--src/pl/plpython/expected/plpython_unicode_0.out50
-rw-r--r--src/pl/plpython/expected/plpython_unicode_2.out52
-rw-r--r--src/pl/plpython/expected/plpython_unicode_3.out14
-rw-r--r--src/pl/plpython/generate-spiexceptions.pl44
-rw-r--r--src/pl/plpython/nls.mk4
-rw-r--r--src/pl/plpython/plpython.c1907
-rw-r--r--src/pl/plpython/plpython2u--1.0.sql9
-rw-r--r--src/pl/plpython/plpython2u--unpackaged--1.0.sql7
-rw-r--r--src/pl/plpython/plpython2u.control7
-rw-r--r--src/pl/plpython/plpython3u--1.0.sql9
-rw-r--r--src/pl/plpython/plpython3u--unpackaged--1.0.sql7
-rw-r--r--src/pl/plpython/plpython3u.control7
-rw-r--r--src/pl/plpython/plpythonu--1.0.sql9
-rw-r--r--src/pl/plpython/plpythonu--unpackaged--1.0.sql7
-rw-r--r--src/pl/plpython/plpythonu.control7
-rw-r--r--src/pl/plpython/po/de.po228
-rw-r--r--src/pl/plpython/po/es.po170
-rw-r--r--src/pl/plpython/po/fr.po166
-rw-r--r--src/pl/plpython/po/ja.po224
-rw-r--r--src/pl/plpython/po/pt_BR.po265
-rw-r--r--src/pl/plpython/po/ro.po312
-rw-r--r--src/pl/plpython/po/tr.po245
-rw-r--r--src/pl/plpython/po/zh_CN.po337
-rw-r--r--src/pl/plpython/po/zh_TW.po314
-rw-r--r--src/pl/plpython/sql/plpython_composite.sql168
-rw-r--r--src/pl/plpython/sql/plpython_drop.sql4
-rw-r--r--src/pl/plpython/sql/plpython_error.sql171
-rw-r--r--src/pl/plpython/sql/plpython_newline.sql2
-rw-r--r--src/pl/plpython/sql/plpython_populate.sql3
-rw-r--r--src/pl/plpython/sql/plpython_quote.sql33
-rw-r--r--src/pl/plpython/sql/plpython_record.sql4
-rw-r--r--src/pl/plpython/sql/plpython_schema.sql3
-rw-r--r--src/pl/plpython/sql/plpython_setof.sql11
-rw-r--r--src/pl/plpython/sql/plpython_spi.sql2
-rw-r--r--src/pl/plpython/sql/plpython_subtransaction.sql244
-rw-r--r--src/pl/plpython/sql/plpython_test.sql13
-rw-r--r--src/pl/plpython/sql/plpython_trigger.sql62
-rw-r--r--src/pl/plpython/sql/plpython_types.sql43
-rw-r--r--src/pl/plpython/sql/plpython_unicode.sql2
-rw-r--r--src/pl/tcl/.gitignore4
-rw-r--r--src/pl/tcl/Makefile33
-rw-r--r--src/pl/tcl/expected/pltcl_queries.out37
-rw-r--r--src/pl/tcl/expected/pltcl_queries_1.out37
-rw-r--r--src/pl/tcl/expected/pltcl_setup.out14
-rw-r--r--src/pl/tcl/modules/.gitignore3
-rw-r--r--src/pl/tcl/modules/Makefile2
-rw-r--r--src/pl/tcl/modules/README3
-rw-r--r--src/pl/tcl/modules/pltcl_delmod.in2
-rw-r--r--src/pl/tcl/modules/pltcl_listmod.in2
-rw-r--r--src/pl/tcl/modules/unknown.pltcl2
-rw-r--r--src/pl/tcl/nls.mk4
-rw-r--r--src/pl/tcl/pltcl--1.0.sql9
-rw-r--r--src/pl/tcl/pltcl--unpackaged--1.0.sql5
-rw-r--r--src/pl/tcl/pltcl.c353
-rw-r--r--src/pl/tcl/pltcl.control7
-rw-r--r--src/pl/tcl/pltclu--1.0.sql9
-rw-r--r--src/pl/tcl/pltclu--unpackaged--1.0.sql5
-rw-r--r--src/pl/tcl/pltclu.control7
-rw-r--r--src/pl/tcl/po/fr.po2
-rw-r--r--src/pl/tcl/po/ja.po21
-rw-r--r--src/pl/tcl/po/ro.po55
-rw-r--r--src/pl/tcl/po/zh_CN.po53
-rw-r--r--src/pl/tcl/po/zh_TW.po56
-rw-r--r--src/pl/tcl/sql/pltcl_queries.sql9
-rw-r--r--src/pl/tcl/sql/pltcl_setup.sql15
-rw-r--r--src/port/.gitignore3
-rw-r--r--src/port/Makefile21
-rw-r--r--src/port/README3
-rw-r--r--src/port/chklocale.c27
-rw-r--r--src/port/crypt.c4
-rw-r--r--src/port/dirent.c9
-rw-r--r--src/port/dirmod.c126
-rw-r--r--src/port/erand48.c2
-rw-r--r--src/port/exec.c7
-rw-r--r--src/port/fseeko.c4
-rw-r--r--src/port/getaddrinfo.c19
-rw-r--r--src/port/gethostname.c4
-rw-r--r--src/port/getopt.c29
-rw-r--r--src/port/getopt_long.c22
-rw-r--r--src/port/getpeereid.c80
-rw-r--r--src/port/getrusage.c4
-rw-r--r--src/port/gettimeofday.c2
-rw-r--r--src/port/inet_aton.c2
-rw-r--r--src/port/inet_net_ntop.c298
-rw-r--r--src/port/isinf.c4
-rw-r--r--src/port/kill.c4
-rw-r--r--src/port/memcmp.c4
-rw-r--r--src/port/noblock.c8
-rw-r--r--src/port/open.c4
-rw-r--r--src/port/path.c54
-rw-r--r--src/port/pgcheckdir.c73
-rw-r--r--src/port/pgmkdirp.c148
-rw-r--r--src/port/pgsleep.c4
-rw-r--r--src/port/pgstrcasecmp.c30
-rw-r--r--src/port/pthread-win32.h2
-rw-r--r--src/port/qsort.c2
-rw-r--r--src/port/qsort_arg.c2
-rw-r--r--src/port/random.c4
-rw-r--r--src/port/rint.c2
-rw-r--r--src/port/snprintf.c52
-rw-r--r--src/port/sprompt.c4
-rw-r--r--src/port/srandom.c4
-rw-r--r--src/port/strdup.c4
-rw-r--r--src/port/strerror.c2
-rw-r--r--src/port/strlcat.c2
-rw-r--r--src/port/strlcpy.c4
-rw-r--r--src/port/strtol.c4
-rw-r--r--src/port/strtoul.c2
-rw-r--r--src/port/thread.c4
-rw-r--r--src/port/unsetenv.c7
-rw-r--r--src/port/win32env.c9
-rw-r--r--src/port/win32error.c8
-rw-r--r--src/port/win32ver.rc6
-rw-r--r--src/template/cygwin2
-rw-r--r--src/template/darwin6
-rw-r--r--src/template/dgux4
-rw-r--r--src/template/freebsd2
-rw-r--r--src/template/hpux2
-rw-r--r--src/template/linux2
-rw-r--r--src/template/netbsd3
-rw-r--r--src/template/nextstep2
-rw-r--r--src/template/osf2
-rw-r--r--src/template/sco1
-rw-r--r--src/test/Makefile2
-rw-r--r--src/test/examples/Makefile2
-rw-r--r--src/test/examples/testlibpq.c2
-rw-r--r--src/test/examples/testlibpq2.c2
-rw-r--r--src/test/examples/testlibpq3.c2
-rw-r--r--src/test/examples/testlibpq4.c8
-rw-r--r--src/test/examples/testlo.c4
-rw-r--r--src/test/isolation/.gitignore12
-rw-r--r--src/test/isolation/Makefile84
-rw-r--r--src/test/isolation/README65
-rw-r--r--src/test/isolation/expected/classroom-scheduling.out299
-rw-r--r--src/test/isolation/expected/multiple-row-versions.out24
-rw-r--r--src/test/isolation/expected/partial-index.out641
-rw-r--r--src/test/isolation/expected/project-manager.out299
-rw-r--r--src/test/isolation/expected/receipt-report.out3379
-rw-r--r--src/test/isolation/expected/referential-integrity.out629
-rw-r--r--src/test/isolation/expected/ri-trigger.out111
-rw-r--r--src/test/isolation/expected/simple-write-skew.out41
-rw-r--r--src/test/isolation/expected/temporal-range-integrity.out299
-rw-r--r--src/test/isolation/expected/total-cash.out281
-rw-r--r--src/test/isolation/expected/two-ids.out1007
-rw-r--r--src/test/isolation/isolation_main.c89
-rw-r--r--src/test/isolation/isolation_schedule11
-rw-r--r--src/test/isolation/isolationtester.c374
-rw-r--r--src/test/isolation/isolationtester.h59
-rw-r--r--src/test/isolation/specparse.y185
-rw-r--r--src/test/isolation/specs/classroom-scheduling.spec29
-rw-r--r--src/test/isolation/specs/multiple-row-versions.spec47
-rw-r--r--src/test/isolation/specs/partial-index.spec32
-rw-r--r--src/test/isolation/specs/project-manager.spec30
-rw-r--r--src/test/isolation/specs/receipt-report.spec47
-rw-r--r--src/test/isolation/specs/referential-integrity.spec32
-rw-r--r--src/test/isolation/specs/ri-trigger.spec53
-rw-r--r--src/test/isolation/specs/simple-write-skew.spec30
-rw-r--r--src/test/isolation/specs/temporal-range-integrity.spec38
-rw-r--r--src/test/isolation/specs/total-cash.spec28
-rw-r--r--src/test/isolation/specs/two-ids.spec40
-rw-r--r--src/test/isolation/specscanner.l111
-rw-r--r--src/test/locale/Makefile4
-rw-r--r--src/test/locale/README4
-rw-r--r--src/test/locale/de_DE.ISO8859-1/Makefile6
-rw-r--r--src/test/locale/de_DE.ISO8859-1/README2
-rw-r--r--src/test/locale/de_DE.ISO8859-1/test-de-sort.in2
-rw-r--r--src/test/locale/gr_GR.ISO8859-7/Makefile6
-rw-r--r--src/test/locale/gr_GR.ISO8859-7/README2
-rw-r--r--src/test/locale/koi8-r/Makefile6
-rw-r--r--src/test/locale/koi8-to-win1251/Makefile6
-rw-r--r--src/test/locale/koi8-to-win1251/README2
-rw-r--r--src/test/locale/test-ctype.c2
-rw-r--r--src/test/mb/README2
-rw-r--r--src/test/mb/mbregress.sh4
-rwxr-xr-xsrc/test/performance/runtests.pl17
-rw-r--r--src/test/performance/sqls/connection1
-rw-r--r--src/test/performance/sqls/crtsimple1
-rw-r--r--src/test/performance/sqls/crtsimpleidx1
-rw-r--r--src/test/performance/sqls/drpsimple1
-rw-r--r--src/test/performance/sqls/inssimple4
-rw-r--r--src/test/performance/sqls/orbsimple1
-rw-r--r--src/test/performance/sqls/slcsimple1
-rw-r--r--src/test/performance/sqls/vacuum1
-rw-r--r--src/test/regress/.gitignore7
-rw-r--r--src/test/regress/GNUmakefile58
-rw-r--r--src/test/regress/data/array.data3
-rw-r--r--src/test/regress/expected/.gitignore3
-rw-r--r--src/test/regress/expected/abstime.out6
-rw-r--r--src/test/regress/expected/advisory_lock.out275
-rw-r--r--src/test/regress/expected/aggregates.out257
-rw-r--r--src/test/regress/expected/alter_table.out462
-rw-r--r--src/test/regress/expected/arrays.out382
-rw-r--r--src/test/regress/expected/bit.out2
-rw-r--r--src/test/regress/expected/bit_1.out2
-rw-r--r--src/test/regress/expected/bitmapops.out2
-rw-r--r--src/test/regress/expected/boolean.out12
-rw-r--r--src/test/regress/expected/box.out36
-rw-r--r--src/test/regress/expected/box_1.out36
-rw-r--r--src/test/regress/expected/char.out6
-rw-r--r--src/test/regress/expected/char_1.out6
-rw-r--r--src/test/regress/expected/char_2.out6
-rw-r--r--src/test/regress/expected/cluster.out2
-rw-r--r--src/test/regress/expected/cluster_1.out2
-rw-r--r--src/test/regress/expected/collate.linux.utf8.out1050
-rw-r--r--src/test/regress/expected/collate.out598
-rw-r--r--src/test/regress/expected/copy2.out6
-rw-r--r--src/test/regress/expected/copy2_1.out6
-rw-r--r--src/test/regress/expected/copyselect.out2
-rw-r--r--src/test/regress/expected/copyselect_1.out2
-rw-r--r--src/test/regress/expected/create_aggregate.out4
-rw-r--r--src/test/regress/expected/create_index.out616
-rw-r--r--src/test/regress/expected/create_misc.out22
-rw-r--r--src/test/regress/expected/create_misc_1.out (renamed from src/test/regress/output/create_misc_1.source)4
-rw-r--r--src/test/regress/expected/create_operator.out16
-rw-r--r--src/test/regress/expected/create_table.out18
-rw-r--r--src/test/regress/expected/create_table_1.out18
-rw-r--r--src/test/regress/expected/create_type.out14
-rw-r--r--src/test/regress/expected/create_type_1.out14
-rw-r--r--src/test/regress/expected/create_view.out12
-rw-r--r--src/test/regress/expected/create_view_1.out12
-rw-r--r--src/test/regress/expected/delete.out23
-rw-r--r--src/test/regress/expected/delete_1.out22
-rw-r--r--src/test/regress/expected/dependency.out8
-rw-r--r--src/test/regress/expected/dependency_1.out2
-rw-r--r--src/test/regress/expected/domain.out141
-rw-r--r--src/test/regress/expected/domain_1.out159
-rw-r--r--src/test/regress/expected/drop_if_exists.out4
-rw-r--r--src/test/regress/expected/enum.out153
-rw-r--r--src/test/regress/expected/errors.out188
-rw-r--r--src/test/regress/expected/float4-exp-three-digits.out4
-rw-r--r--src/test/regress/expected/float4.out2
-rw-r--r--src/test/regress/expected/float4_1.out2
-rw-r--r--src/test/regress/expected/float8-exp-three-digits-win32.out18
-rw-r--r--src/test/regress/expected/float8-small-is-zero.out18
-rw-r--r--src/test/regress/expected/float8-small-is-zero_1.out18
-rw-r--r--src/test/regress/expected/float8.out14
-rw-r--r--src/test/regress/expected/float8_1.out14
-rw-r--r--src/test/regress/expected/foreign_data.out333
-rw-r--r--src/test/regress/expected/foreign_key.out20
-rw-r--r--src/test/regress/expected/foreign_key_1.out20
-rw-r--r--src/test/regress/expected/functional_deps.out241
-rw-r--r--src/test/regress/expected/functional_deps_1.out240
-rw-r--r--src/test/regress/expected/guc.out5
-rw-r--r--src/test/regress/expected/guc_1.out5
-rw-r--r--src/test/regress/expected/hash_index.out16
-rw-r--r--src/test/regress/expected/hash_index_1.out16
-rw-r--r--src/test/regress/expected/horology.out13
-rw-r--r--src/test/regress/expected/inet.out2
-rw-r--r--src/test/regress/expected/inet_1.out2
-rw-r--r--src/test/regress/expected/inherit.out97
-rw-r--r--src/test/regress/expected/insert.out17
-rw-r--r--src/test/regress/expected/int2.out15
-rw-r--r--src/test/regress/expected/int4.out15
-rw-r--r--src/test/regress/expected/int4_1.out15
-rw-r--r--src/test/regress/expected/int8-exp-three-digits.out35
-rw-r--r--src/test/regress/expected/int8.out13
-rw-r--r--src/test/regress/expected/interval.out12
-rw-r--r--src/test/regress/expected/join.out103
-rw-r--r--src/test/regress/expected/limit.out28
-rw-r--r--src/test/regress/expected/money.out63
-rw-r--r--src/test/regress/expected/money_1.out63
-rw-r--r--src/test/regress/expected/oid.out2
-rw-r--r--src/test/regress/expected/oidjoins.out922
-rw-r--r--src/test/regress/expected/opr_sanity.out186
-rw-r--r--src/test/regress/expected/plancache.out4
-rw-r--r--src/test/regress/expected/plancache_1.out20
-rw-r--r--src/test/regress/expected/plpgsql.out369
-rw-r--r--src/test/regress/expected/point.out6
-rw-r--r--src/test/regress/expected/point_1.out6
-rw-r--r--src/test/regress/expected/polygon.out50
-rw-r--r--src/test/regress/expected/polymorphism.out3
-rw-r--r--src/test/regress/expected/polymorphism_1.out3
-rw-r--r--src/test/regress/expected/portals.out16
-rw-r--r--src/test/regress/expected/portals_p2.out26
-rw-r--r--src/test/regress/expected/prepared_xacts.out12
-rw-r--r--src/test/regress/expected/prepared_xacts_1.out12
-rw-r--r--src/test/regress/expected/rangefuncs.out26
-rw-r--r--src/test/regress/expected/rowtypes.out94
-rw-r--r--src/test/regress/expected/rowtypes_1.out97
-rw-r--r--src/test/regress/expected/rules.out139
-rw-r--r--src/test/regress/expected/sanity_check.out6
-rw-r--r--src/test/regress/expected/select.out13
-rw-r--r--src/test/regress/expected/select_1.out13
-rw-r--r--src/test/regress/expected/select_implicit.out20
-rw-r--r--src/test/regress/expected/select_implicit_1.out20
-rw-r--r--src/test/regress/expected/select_implicit_2.out20
-rw-r--r--src/test/regress/expected/sequence.out10
-rw-r--r--src/test/regress/expected/sequence_1.out11
-rw-r--r--src/test/regress/expected/sequence_2.out10
-rw-r--r--src/test/regress/expected/subselect.out13
-rw-r--r--src/test/regress/expected/text.out190
-rw-r--r--src/test/regress/expected/timestamptz.out7
-rw-r--r--src/test/regress/expected/tinterval.out4
-rw-r--r--src/test/regress/expected/transactions.out79
-rw-r--r--src/test/regress/expected/transactions_1.out78
-rw-r--r--src/test/regress/expected/triggers.out707
-rw-r--r--src/test/regress/expected/triggers_1.out543
-rw-r--r--src/test/regress/expected/truncate.out26
-rw-r--r--src/test/regress/expected/truncate_1.out26
-rw-r--r--src/test/regress/expected/tsdicts.out12
-rw-r--r--src/test/regress/expected/type_sanity.out13
-rw-r--r--src/test/regress/expected/typed_table.out28
-rw-r--r--src/test/regress/expected/typed_table_1.out19
-rw-r--r--src/test/regress/expected/update.out9
-rw-r--r--src/test/regress/expected/update_1.out9
-rw-r--r--src/test/regress/expected/varchar.out6
-rw-r--r--src/test/regress/expected/varchar_1.out6
-rw-r--r--src/test/regress/expected/varchar_2.out6
-rw-r--r--src/test/regress/expected/window.out4
-rw-r--r--src/test/regress/expected/with.out808
-rw-r--r--src/test/regress/expected/with_1.out386
-rw-r--r--src/test/regress/expected/xc_distkey.out2
-rw-r--r--src/test/regress/expected/xml.out184
-rw-r--r--src/test/regress/expected/xml_1.out176
-rw-r--r--src/test/regress/input/constraints.source3
-rw-r--r--src/test/regress/input/copy.source5
-rw-r--r--src/test/regress/input/create_function_2.source5
-rw-r--r--src/test/regress/input/misc.source17
-rw-r--r--src/test/regress/input/security_label.source84
-rw-r--r--src/test/regress/output/constraints.source2
-rw-r--r--src/test/regress/output/constraints_1.source2
-rw-r--r--src/test/regress/output/copy.source4
-rw-r--r--src/test/regress/output/copy_1.source4
-rw-r--r--src/test/regress/output/create_function_2.source4
-rw-r--r--src/test/regress/output/misc.source23
-rw-r--r--src/test/regress/output/security_label.source92
-rw-r--r--src/test/regress/parallel_schedule10
-rw-r--r--src/test/regress/pg_regress.c84
-rw-r--r--src/test/regress/pg_regress.h5
-rw-r--r--src/test/regress/pg_regress_main.c11
-rw-r--r--src/test/regress/regress.c8
-rw-r--r--src/test/regress/resultmap3
-rw-r--r--src/test/regress/serial_schedule8
-rw-r--r--src/test/regress/sql/.gitignore8
-rw-r--r--src/test/regress/sql/abstime.sql6
-rw-r--r--src/test/regress/sql/advisory_lock.sql146
-rw-r--r--src/test/regress/sql/aggregates.sql68
-rw-r--r--src/test/regress/sql/alter_table.sql277
-rw-r--r--src/test/regress/sql/arrays.sql46
-rw-r--r--src/test/regress/sql/bit.sql3
-rw-r--r--src/test/regress/sql/bitmapops.sql2
-rw-r--r--src/test/regress/sql/boolean.sql12
-rw-r--r--src/test/regress/sql/box.sql37
-rw-r--r--src/test/regress/sql/case.sql1
-rw-r--r--src/test/regress/sql/char.sql6
-rw-r--r--src/test/regress/sql/cluster.sql2
-rw-r--r--src/test/regress/sql/collate.linux.utf8.sql387
-rw-r--r--src/test/regress/sql/collate.sql222
-rw-r--r--src/test/regress/sql/copy2.sql6
-rw-r--r--src/test/regress/sql/copyselect.sql2
-rw-r--r--src/test/regress/sql/create_aggregate.sql4
-rw-r--r--src/test/regress/sql/create_index.sql122
-rw-r--r--src/test/regress/sql/create_operator.sql18
-rw-r--r--src/test/regress/sql/create_table.sql17
-rw-r--r--src/test/regress/sql/create_type.sql13
-rw-r--r--src/test/regress/sql/create_view.sql12
-rw-r--r--src/test/regress/sql/delete.sql12
-rw-r--r--src/test/regress/sql/domain.sql80
-rw-r--r--src/test/regress/sql/drop.sql13
-rw-r--r--src/test/regress/sql/drop_if_exists.sql5
-rw-r--r--src/test/regress/sql/enum.sql86
-rw-r--r--src/test/regress/sql/errors.sql192
-rw-r--r--src/test/regress/sql/float4.sql3
-rw-r--r--src/test/regress/sql/float8.sql15
-rw-r--r--src/test/regress/sql/foreign_data.sql79
-rw-r--r--src/test/regress/sql/foreign_key.sql20
-rw-r--r--src/test/regress/sql/functional_deps.sql210
-rw-r--r--src/test/regress/sql/hash_index.sql17
-rw-r--r--src/test/regress/sql/horology.sql3
-rw-r--r--src/test/regress/sql/hs_primary_extremes.sql17
-rw-r--r--src/test/regress/sql/inet.sql2
-rw-r--r--src/test/regress/sql/inherit.sql35
-rw-r--r--src/test/regress/sql/insert.sql7
-rw-r--r--src/test/regress/sql/int2.sql5
-rw-r--r--src/test/regress/sql/int4.sql6
-rw-r--r--src/test/regress/sql/int8.sql4
-rw-r--r--src/test/regress/sql/interval.sql12
-rw-r--r--src/test/regress/sql/join.sql72
-rw-r--r--src/test/regress/sql/limit.sql28
-rw-r--r--src/test/regress/sql/lseg.sql1
-rw-r--r--src/test/regress/sql/money.sql14
-rw-r--r--src/test/regress/sql/numeric_big.sql1
-rw-r--r--src/test/regress/sql/numerology.sql1
-rw-r--r--src/test/regress/sql/oid.sql2
-rw-r--r--src/test/regress/sql/oidjoins.sql818
-rw-r--r--src/test/regress/sql/opr_sanity.sql137
-rw-r--r--src/test/regress/sql/path.sql1
-rw-r--r--src/test/regress/sql/plpgsql.sql227
-rw-r--r--src/test/regress/sql/point.sql6
-rw-r--r--src/test/regress/sql/polygon.sql51
-rw-r--r--src/test/regress/sql/polymorphism.sql3
-rw-r--r--src/test/regress/sql/portals.sql16
-rw-r--r--src/test/regress/sql/portals_p2.sql27
-rw-r--r--src/test/regress/sql/prepare.sql1
-rw-r--r--src/test/regress/sql/prepared_xacts.sql12
-rw-r--r--src/test/regress/sql/random.sql1
-rw-r--r--src/test/regress/sql/rangefuncs.sql25
-rw-r--r--src/test/regress/sql/rowtypes.sql62
-rw-r--r--src/test/regress/sql/rules.sql22
-rw-r--r--src/test/regress/sql/select.sql16
-rw-r--r--src/test/regress/sql/select_implicit.sql21
-rw-r--r--src/test/regress/sql/select_into.sql1
-rw-r--r--src/test/regress/sql/select_views.sql1
-rw-r--r--src/test/regress/sql/sequence.sql11
-rw-r--r--src/test/regress/sql/subselect.sql10
-rw-r--r--src/test/regress/sql/text.sql48
-rw-r--r--src/test/regress/sql/timestamptz.sql2
-rw-r--r--src/test/regress/sql/tinterval.sql4
-rw-r--r--src/test/regress/sql/transactions.sql46
-rw-r--r--src/test/regress/sql/triggers.sql440
-rw-r--r--src/test/regress/sql/truncate.sql14
-rw-r--r--src/test/regress/sql/tsdicts.sql12
-rw-r--r--src/test/regress/sql/type_sanity.sql6
-rw-r--r--src/test/regress/sql/typed_table.sql21
-rw-r--r--src/test/regress/sql/update.sql4
-rw-r--r--src/test/regress/sql/varchar.sql6
-rw-r--r--src/test/regress/sql/window.sql4
-rw-r--r--src/test/regress/sql/with.sql289
-rw-r--r--src/test/regress/sql/xml.sql48
-rw-r--r--src/test/regress/standby_schedule2
-rw-r--r--src/test/thread/Makefile4
-rw-r--r--src/test/thread/README11
-rw-r--r--src/test/thread/thread_test.c24
-rw-r--r--src/timezone/.gitignore1
-rw-r--r--src/timezone/Makefile6
-rw-r--r--src/timezone/README2
-rw-r--r--src/timezone/data/africa63
-rw-r--r--src/timezone/data/asia19
-rw-r--r--src/timezone/data/australasia70
-rw-r--r--src/timezone/data/backward6
-rw-r--r--src/timezone/data/etcetera5
-rw-r--r--src/timezone/data/europe68
-rw-r--r--src/timezone/data/leapseconds14
-rw-r--r--src/timezone/data/northamerica238
-rw-r--r--src/timezone/data/southamerica62
-rw-r--r--src/timezone/data/zone.tab15
-rw-r--r--src/timezone/ialloc.c2
-rw-r--r--src/timezone/localtime.c2
-rw-r--r--src/timezone/pgtz.c63
-rw-r--r--src/timezone/pgtz.h4
-rw-r--r--src/timezone/private.h2
-rw-r--r--src/timezone/scheck.c2
-rw-r--r--src/timezone/strftime.c2
-rw-r--r--src/timezone/tzfile.h2
-rw-r--r--src/timezone/tznames/Africa.txt2
-rw-r--r--src/timezone/tznames/America.txt2
-rw-r--r--src/timezone/tznames/Antarctica.txt2
-rw-r--r--src/timezone/tznames/Asia.txt2
-rw-r--r--src/timezone/tznames/Atlantic.txt2
-rw-r--r--src/timezone/tznames/Australia2
-rw-r--r--src/timezone/tznames/Australia.txt2
-rw-r--r--src/timezone/tznames/Default6
-rw-r--r--src/timezone/tznames/Etc.txt2
-rw-r--r--src/timezone/tznames/Europe.txt2
-rw-r--r--src/timezone/tznames/India2
-rw-r--r--src/timezone/tznames/Indian.txt2
-rw-r--r--src/timezone/tznames/Makefile2
-rw-r--r--src/timezone/tznames/Pacific.txt7
-rw-r--r--src/timezone/tznames/README2
-rw-r--r--src/timezone/zic.c14
-rwxr-xr-xsrc/tools/FAQ2txt2
-rw-r--r--src/tools/RELEASE_CHANGES26
-rwxr-xr-xsrc/tools/add_cvs_markers50
-rw-r--r--src/tools/backend/README5
-rw-r--r--src/tools/backend/backend_dirs.html3
-rw-r--r--src/tools/backend/index.html2
-rwxr-xr-xsrc/tools/ccsym2
-rwxr-xr-xsrc/tools/check_keywords.pl6
-rwxr-xr-xsrc/tools/codelines4
-rwxr-xr-xsrc/tools/copyright3
-rw-r--r--src/tools/editors/emacs.samples9
-rw-r--r--src/tools/editors/vim.samples1
-rw-r--r--src/tools/entab/Makefile9
-rw-r--r--src/tools/entab/entab.c2
-rw-r--r--src/tools/entab/entab.man4
-rw-r--r--src/tools/entab/halt.c2
-rwxr-xr-xsrc/tools/find_badmacros3
-rwxr-xr-xsrc/tools/find_gt_lt2
-rwxr-xr-xsrc/tools/find_static11
-rwxr-xr-xsrc/tools/find_typedef8
-rw-r--r--src/tools/findoidjoins/Makefile8
-rw-r--r--src/tools/findoidjoins/README70
-rw-r--r--src/tools/findoidjoins/findoidjoins.c99
-rwxr-xr-xsrc/tools/findoidjoins/make_oidjoins_check30
-rw-r--r--src/tools/fsync/Makefile25
-rw-r--r--src/tools/fsync/README11
-rw-r--r--src/tools/fsync/test_fsync.c384
-rw-r--r--src/tools/git-external-diff24
-rwxr-xr-xsrc/tools/git_changelog284
-rw-r--r--src/tools/ifaddrs/Makefile4
-rw-r--r--src/tools/ifaddrs/README2
-rw-r--r--src/tools/ifaddrs/test_ifaddrs.c2
-rwxr-xr-xsrc/tools/make_ctags2
-rw-r--r--src/tools/make_diff/README7
-rwxr-xr-xsrc/tools/make_diff/cporig2
-rwxr-xr-xsrc/tools/make_diff/difforig2
-rwxr-xr-xsrc/tools/make_diff/rmorig2
-rwxr-xr-xsrc/tools/make_etags2
-rwxr-xr-xsrc/tools/make_keywords2
-rwxr-xr-xsrc/tools/make_mkid2
-rw-r--r--src/tools/msvc/Install.pm73
-rw-r--r--src/tools/msvc/Mkvcbuild.pm67
-rw-r--r--src/tools/msvc/Project.pm2
-rw-r--r--src/tools/msvc/README10
-rw-r--r--src/tools/msvc/Solution.pm27
-rwxr-xr-xsrc/tools/msvc/build.bat2
-rw-r--r--src/tools/msvc/build.pl3
-rwxr-xr-xsrc/tools/msvc/builddoc.bat68
-rw-r--r--src/tools/msvc/builddoc.pl122
-rwxr-xr-xsrc/tools/msvc/clean.bat3
-rw-r--r--src/tools/msvc/config_default.pl2
-rw-r--r--src/tools/msvc/gendef.pl2
-rw-r--r--src/tools/msvc/install.bat2
-rwxr-xr-xsrc/tools/msvc/install.pl2
-rw-r--r--src/tools/msvc/mkvcbuild.pl2
-rwxr-xr-xsrc/tools/msvc/pgbison.bat4
-rwxr-xr-xsrc/tools/msvc/pgflex.bat19
-rw-r--r--src/tools/msvc/vcregress.bat2
-rw-r--r--src/tools/msvc/vcregress.pl47
-rwxr-xr-xsrc/tools/pgcvslog293
-rw-r--r--src/tools/pginclude/README2
-rw-r--r--src/tools/pginclude/cpluspluscheck47
-rwxr-xr-xsrc/tools/pginclude/pgcheckdefines2
-rwxr-xr-xsrc/tools/pginclude/pgcompinclude2
-rwxr-xr-xsrc/tools/pginclude/pgdefine2
-rwxr-xr-xsrc/tools/pginclude/pgfixinclude2
-rwxr-xr-xsrc/tools/pginclude/pgrminclude8
-rw-r--r--src/tools/pgindent/README17
-rw-r--r--src/tools/pgindent/indent.bsd.patch2
-rwxr-xr-xsrc/tools/pgindent/pgcppindent2
-rwxr-xr-xsrc/tools/pgindent/pgindent23
-rw-r--r--src/tools/pgindent/typedefs.list325
-rwxr-xr-xsrc/tools/pgtest12
-rwxr-xr-xsrc/tools/version_stamp.pl6
-rwxr-xr-xsrc/tools/win32tzlist.pl4
-rw-r--r--src/tutorial/Makefile2
-rw-r--r--src/tutorial/README2
-rw-r--r--src/tutorial/advanced.source4
-rw-r--r--src/tutorial/basics.source18
-rw-r--r--src/tutorial/complex.c2
-rw-r--r--src/tutorial/complex.source16
-rw-r--r--src/tutorial/funcs.c2
-rw-r--r--src/tutorial/funcs.source16
-rw-r--r--src/tutorial/funcs_new.c2
-rw-r--r--src/tutorial/syscat.source29
-rw-r--r--src/win32.mak8
2257 files changed, 256741 insertions, 103580 deletions
diff --git a/src/.gitignore b/src/.gitignore
new file mode 100644
index 0000000000..a66aac007a
--- /dev/null
+++ b/src/.gitignore
@@ -0,0 +1,3 @@
+/Makefile.global
+/Makefile.port
+/Makefile.custom
diff --git a/src/Makefile b/src/Makefile
index b30527e923..65ea50e71d 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -4,7 +4,7 @@
#
# Copyright (c) 1994, Regents of the University of California
#
-# $PostgreSQL: pgsql/src/Makefile,v 1.50 2010/01/20 09:16:23 heikki Exp $
+# src/Makefile
#
#-------------------------------------------------------------------------
@@ -12,22 +12,26 @@ subdir = src
top_builddir = ..
include Makefile.global
+SUBDIRS = \
+ port \
+ timezone \
+ gtm \
+ backend \
+ backend/utils/mb/conversion_procs \
+ backend/snowball \
+ include \
+ interfaces \
+ backend/replication/libpqwalreceiver \
+ bin \
+ pl \
+ makefiles \
+ test/regress
-all install installdirs uninstall distprep:
- $(MAKE) -C port $@
- $(MAKE) -C timezone $@
- # GTM should be built before backend because of dependancy
- $(MAKE) -C gtm $@
- $(MAKE) -C interfaces $@
- $(MAKE) -C backend $@
- $(MAKE) -C backend/utils/mb/conversion_procs $@
- $(MAKE) -C backend/snowball $@
- $(MAKE) -C include $@
- $(MAKE) -C backend/replication/libpqwalreceiver $@
- $(MAKE) -C bin $@
- $(MAKE) -C pl $@
- $(MAKE) -C makefiles $@
- $(MAKE) -C test/regress $@
+# There are too many interdependencies between the subdirectories, so
+# don't attempt parallel make here.
+.NOTPARALLEL:
+
+$(recurse)
install: install-local
@@ -47,41 +51,25 @@ uninstall: uninstall-local
uninstall-local:
rm -f $(addprefix '$(DESTDIR)$(pgxsdir)/$(subdir)'/, Makefile.global Makefile.port Makefile.shlib nls-global.mk)
+distprep:
+ $(MAKE) -C test/isolation $@
+
clean:
- $(MAKE) -C port $@
- $(MAKE) -C timezone $@
- $(MAKE) -C gtm $@
- $(MAKE) -C backend $@
- $(MAKE) -C backend/snowball $@
- $(MAKE) -C include $@
- $(MAKE) -C interfaces $@
- $(MAKE) -C backend/replication/libpqwalreceiver $@
- $(MAKE) -C bin $@
- $(MAKE) -C pl $@
- $(MAKE) -C makefiles $@
$(MAKE) -C test $@
$(MAKE) -C tutorial NO_PGXS=1 $@
+ $(MAKE) -C test/isolation $@
$(MAKE) -C test/thread $@
distclean maintainer-clean:
- $(MAKE) -C port $@
- $(MAKE) -C timezone $@
- $(MAKE) -C gtm $@
- $(MAKE) -C backend $@
- $(MAKE) -C backend/snowball $@
- $(MAKE) -C include $@
- $(MAKE) -C interfaces $@
- $(MAKE) -C backend/replication/libpqwalreceiver $@
- $(MAKE) -C bin $@
- $(MAKE) -C pl $@
- $(MAKE) -C makefiles $@
$(MAKE) -C test $@
$(MAKE) -C tutorial NO_PGXS=1 $@
+ $(MAKE) -C test/isolation $@
$(MAKE) -C test/thread $@
rm -f Makefile.port Makefile.global
coverage:
$(MAKE) -C timezone $@
+ $(MAKE) -C gtm $@
$(MAKE) -C backend $@
$(MAKE) -C backend/utils/mb/conversion_procs $@
$(MAKE) -C backend/snowball $@
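A note on the new recursion scheme used above: the SUBDIRS list replaces the explicit per-directory $(MAKE) calls, and the $(recurse) invocation is expanded by the helper added to src/Makefile.global.in (the next file in this diff). As a rough sketch only, for the "all" target and the "port" subdirectory the generated rules look like the following, per the _create_recursive_target pattern defined below; the concrete target names are illustrative:

    .PHONY: all-port-recurse
    all: all-port-recurse
    all-port-recurse:
            $(MAKE) -C port all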
diff --git a/src/Makefile.global.in b/src/Makefile.global.in
index a82a301ee4..712ac58696 100644
--- a/src/Makefile.global.in
+++ b/src/Makefile.global.in
@@ -1,5 +1,5 @@
# -*-makefile-*-
-# $PostgreSQL: pgsql/src/Makefile.global.in,v 1.264 2010/07/06 22:03:05 tgl Exp $
+# src/Makefile.global.in
#------------------------------------------------------------------------------
# All PostgreSQL makefiles include this file and use the variables it sets,
@@ -18,7 +18,9 @@
#
# Meta configuration
-.PHONY: all install install-strip installdirs uninstall clean distclean maintainer-clean distprep check installcheck maintainer-check coverage html man installcheck-parallel world install-world installcheck-world
+standard_targets = all install installdirs uninstall distprep clean distclean maintainer-clean coverage check installcheck maintainer-check
+
+.PHONY: $(standard_targets) install-strip html man installcheck-parallel
# make `all' the default target
all:
@@ -156,6 +158,7 @@ with_python = @with_python@
with_tcl = @with_tcl@
with_openssl = @with_openssl@
with_ossp_uuid = @with_ossp_uuid@
+with_selinux = @with_selinux@
with_libxml = @with_libxml@
with_libxslt = @with_libxslt@
with_system_tzdata = @with_system_tzdata@
@@ -258,7 +261,7 @@ RANLIB = @RANLIB@
WINDRES = @WINDRES@
X = @EXEEXT@
-# Perl
+# Perl
ifneq (@PERL@,)
# quoted to protect pathname with spaces
@@ -284,9 +287,6 @@ XGETTEXT = @XGETTEXT@
GZIP = gzip
BZIP2 = bzip2
-PL_TESTDB = pl_regression
-CONTRIB_TESTDB = contrib_regression
-
# Installation.
INSTALL = $(SHELL) $(top_srcdir)/config/install-sh -c
@@ -363,6 +363,9 @@ ELF_SYSTEM= @ELF_SYS@
# Backend stack size limit has to be hard-wired on Windows (it's in bytes)
WIN32_STACK_RLIMIT=4194304
+# Set if we have a working win32 crashdump header
+have_win32_dbghelp = @have_win32_dbghelp@
+
# Pull in platform-specific magic
include $(top_builddir)/src/Makefile.port
@@ -391,7 +394,7 @@ endif
# This macro is for use by libraries linking to libpq. (Because libpgport
# isn't created with the same link flags as libpq, it can't be used.)
libpq = -L$(libpq_builddir) -lpq
-
+
# If doing static linking, shared library dependency info isn't available,
# so add in the libraries that libpq depends on.
ifeq ($(enable_shared), no)
@@ -400,9 +403,9 @@ libpq += $(filter -lintl -lssl -lcrypto -lkrb5 -lcrypt, $(LIBS)) \
endif
# This macro is for use by client executables (not libraries) that use libpq.
-# We force clients to pull symbols from the non-shared library libpgport
-# rather than pulling some libpgport symbols from libpq just because
-# libpq uses those functions too. This makes applications less
+# We force clients to pull symbols from the non-shared library libpgport
+# rather than pulling some libpgport symbols from libpq just because
+# libpq uses those functions too. This makes applications less
# dependent on changes in libpq's usage of pgport. To do this we link to
# pgport before libpq. This does cause duplicate -lpgport's to appear
# on client link lines.
@@ -424,6 +427,25 @@ submake-libpgport:
##########################################################################
#
+# Testing support
+
+PL_TESTDB = pl_regression
+CONTRIB_TESTDB = contrib_regression
+
+ifdef NO_LOCALE
+NOLOCALE += --no-locale
+endif
+
+pg_regress_locale_flags = $(if $(ENCODING),--encoding=$(ENCODING)) $(NOLOCALE)
+
+pg_regress_check = $(top_builddir)/src/test/regress/pg_regress --inputdir=$(srcdir) --temp-install=./tmp_check --top-builddir=$(top_builddir) $(pg_regress_locale_flags)
+pg_regress_installcheck = $(top_builddir)/src/test/regress/pg_regress --inputdir=$(srcdir) --psqldir=$(PSQLDIR) $(pg_regress_locale_flags)
+
+pg_regress_clean_files = results/ regression.diffs regression.out tmp_check/ log/
+
+
+##########################################################################
+#
# Customization
#
# This includes your local customizations if Makefile.custom exists
@@ -517,7 +539,7 @@ $(top_builddir)/src/include/pg_config.h: $(top_builddir)/src/include/stamp-h
$(top_builddir)/src/include/stamp-h: $(top_srcdir)/src/include/pg_config.h.in $(top_builddir)/config.status
cd $(top_builddir) && ./config.status src/include/pg_config.h
-# Also remake ecpg_config.h from ecpg_config.h.in if the latter changed, same
+# Also remake ecpg_config.h from ecpg_config.h.in if the latter changed, same
# logic as above.
$(top_builddir)/src/interfaces/ecpg/include/ecpg_config.h: $(top_builddir)/src/interfaces/ecpg/include/stamp-h
@@ -542,6 +564,50 @@ install-strip:
##########################################################################
#
+# Recursive make support
+# ----------------------
+# Instead of recursing through subdirectories with a for loop or
+# repeated $(MAKE) -C whatever calls, this is a little smarter: it
+# allows parallel make across directories and lets make -k and -q work
+# correctly.
+
+# We need the $(eval) function and order-only prerequisites, which are
+# available in GNU make 3.80. That also happens to be the version
+# where the .VARIABLES variable was introduced, so this is a simple check.
+ifndef .VARIABLES
+$(error GNU make 3.80 or newer is required. You are using version $(MAKE_VERSION))
+endif
+
+# This function is only for internal use below. It should be called
+# using $(eval). It will set up a target so that it recurses into
+# a given subdirectory. Note that to avoid a nasty bug in make 3.80,
+# this function has to avoid using any complicated constructs (like
+# multiple targets on a line) and also not contain any lines that expand
+# to more than about 200 bytes. This is why we make it apply to just one
+# subdirectory at a time, rather than to a list of subdirectories.
+# $1: target name, e.g., all
+# $2: subdir name
+# $3: target to run in subdir, usually same as $1
+define _create_recursive_target
+.PHONY: $(1)-$(2)-recurse
+$(1): $(1)-$(2)-recurse
+$(1)-$(2)-recurse:
+ $$(MAKE) -C $(2) $(3)
+endef
+# Note that the use of $$ on the last line above is important; we want
+# $(MAKE) to be evaluated when the rule is run, not when the $(eval) is run
+# to create the rule. This is necessary to get make -q working.
+
+# Call this function in a makefile that needs to recurse into subdirectories.
+# In the normal case all arguments can be defaulted.
+# $1: targets to make recursive (defaults to list of standard targets)
+# $2: list of subdirs (defaults to SUBDIRS variable)
+# $3: target to run in subdir (defaults to current element of $1)
+recurse = $(foreach target,$(if $1,$1,$(standard_targets)),$(foreach subdir,$(if $2,$2,$(SUBDIRS)),$(eval $(call _create_recursive_target,$(target),$(subdir),$(if $3,$3,$(target))))))
+
+
+##########################################################################
+#
# Automatic dependency generation
# -------------------------------
# When we configure with --enable-depend then we override the default
@@ -582,6 +648,17 @@ clean distclean maintainer-clean: clean-deps
clean-deps:
@rm -rf $(DEPDIR)
+# When in automatic dependency mode, never delete any intermediate
+# files automatically. Otherwise, the following could happen: When
+# starting from a clean source tree, the first build would delete the
+# intermediate file, but also create the dependency file, which
+# mentions the intermediate file, thus making it non-intermediate.
+# The second build will then need to rebuild the now non-intermediate
+# missing file. So the second build will do work even though nothing
+# had changed. One place where this happens is the .c -> .o -> .so
+# chain for some contrib modules.
+.SECONDARY:
+
endif # autodepend
@@ -631,7 +708,6 @@ lcov.info: $(gcda_files)
$(GCOV) -b -f -p -o . $(GCOVFLAGS) $*.c >$*.c.gcov.out
coverage: $(gcda_files:.gcda=.c.gcov) lcov.info
- $(if $(SUBDIRS),for dir in $(SUBDIRS); do $(MAKE) -C $$dir coverage || exit; done)
.PHONY: coverage-html
coverage-html: coverage
diff --git a/src/Makefile.shlib b/src/Makefile.shlib
index f3dc2409b9..a5cf6c6c16 100644
--- a/src/Makefile.shlib
+++ b/src/Makefile.shlib
@@ -6,7 +6,7 @@
# Copyright (c) 1998, Regents of the University of California
#
# IDENTIFICATION
-# $PostgreSQL: pgsql/src/Makefile.shlib,v 1.125 2010/07/06 03:55:33 tgl Exp $
+# src/Makefile.shlib
#
#-------------------------------------------------------------------------
@@ -22,6 +22,7 @@
# OBJS List of object files to include in library
# SHLIB_LINK If shared library relies on other libraries,
# additional stuff to put in its link command
+# SHLIB_PREREQS Order-only prerequisites for library build target
# SHLIB_EXPORTS (optional) Name of file containing list of symbols to
# export, in the format "function_name number"
#
@@ -270,7 +271,7 @@ endif
ifeq ($(PORTNAME), sunos4)
LINK.shared = $(LD) -assert pure-text -Bdynamic
endif
-
+
ifeq ($(PORTNAME), osf)
LINK.shared = $(LD) -shared -expect_unresolved '*'
endif
@@ -340,7 +341,7 @@ all-static-lib: $(stlib)
all-shared-lib: $(shlib)
ifndef haslibarule
-$(stlib): $(OBJS)
+$(stlib): $(OBJS) | $(SHLIB_PREREQS)
$(LINK.static) $@ $^
$(RANLIB) $@
endif #haslibarule
@@ -351,7 +352,7 @@ ifeq (,$(filter cygwin win32,$(PORTNAME)))
ifneq ($(PORTNAME), aix)
# Normal case
-$(shlib): $(OBJS)
+$(shlib): $(OBJS) | $(SHLIB_PREREQS)
$(LINK.shared) -o $@ $(OBJS) $(LDFLAGS) $(LDFLAGS_SL) $(SHLIB_LINK)
ifdef shlib_major
# If we're using major and minor versions, then make a symlink to major-version-only.
@@ -382,7 +383,7 @@ endif
else # PORTNAME == aix
# AIX case
-$(shlib) $(stlib): $(OBJS)
+$(shlib) $(stlib): $(OBJS) | $(SHLIB_PREREQS)
$(LINK.static) $(stlib) $^
$(RANLIB) $(stlib)
$(MKLDEXPORT) $(stlib) >$(exports_file)
@@ -408,10 +409,10 @@ else
DLL_DEFFILE = lib$(NAME)dll.def
endif
-$(shlib): $(OBJS) $(DLL_DEFFILE)
+$(shlib): $(OBJS) $(DLL_DEFFILE) | $(SHLIB_PREREQS)
$(DLLWRAP) -o $@ --dllname $(shlib) $(DLLWRAP_FLAGS) --def $(DLL_DEFFILE) $(OBJS) $(LDFLAGS) $(LDFLAGS_SL) $(SHLIB_LINK)
-$(stlib): $(shlib) $(DLL_DEFFILE)
+$(stlib): $(shlib) $(DLL_DEFFILE) | $(SHLIB_PREREQS)
$(DLLTOOL) --dllname $(shlib) $(DLLTOOL_LIBFLAGS) --def $(DLL_DEFFILE) --output-lib $@
endif # PORTNAME == cygwin || PORTNAME == win32
diff --git a/src/backend/.gitignore b/src/backend/.gitignore
new file mode 100644
index 0000000000..4e04ff0575
--- /dev/null
+++ b/src/backend/.gitignore
@@ -0,0 +1,2 @@
+/postgres
+/postgres.def
diff --git a/src/backend/Makefile b/src/backend/Makefile
index a707a94056..905ba06414 100644
--- a/src/backend/Makefile
+++ b/src/backend/Makefile
@@ -2,10 +2,10 @@
#
# Makefile for the postgres backend
#
-# Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+# Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
# Portions Copyright (c) 1994, Regents of the University of California
#
-# $PostgreSQL: pgsql/src/backend/Makefile,v 1.145 2010/07/05 18:54:37 tgl Exp $
+# src/backend/Makefile
#
#-------------------------------------------------------------------------
@@ -128,15 +128,20 @@ endif
endif # aix
# Update the commonly used headers before building the subdirectories
-$(SUBDIRS:%=%-recursive): $(top_builddir)/src/include/parser/gram.h $(top_builddir)/src/include/catalog/schemapg.h $(top_builddir)/src/include/utils/fmgroids.h $(top_builddir)/src/include/utils/probes.h
+$(SUBDIRS:%=%-recursive): $(top_builddir)/src/include/parser/gram.h $(top_builddir)/src/include/catalog/schemapg.h $(top_builddir)/src/include/utils/fmgroids.h $(top_builddir)/src/include/utils/errcodes.h $(top_builddir)/src/include/utils/probes.h
# run this unconditionally to avoid needing to know its dependencies here:
submake-schemapg:
$(MAKE) -C catalog schemapg.h
-.PHONY: submake-schemapg
+# src/port needs a convenient way to force errcodes.h to get built
+submake-errcodes: $(top_builddir)/src/include/utils/errcodes.h
-catalog/schemapg.h: submake-schemapg
+.PHONY: submake-schemapg submake-errcodes
+
+catalog/schemapg.h: | submake-schemapg
+
+$(top_builddir)/src/port/libpgport_srv.a: | submake-libpgport
# The postgres.o target is needed by the rule in Makefile.global that
@@ -155,6 +160,9 @@ parser/gram.h: parser/gram.y
utils/fmgroids.h: utils/Gen_fmgrtab.pl catalog/Catalog.pm $(top_srcdir)/src/include/catalog/pg_proc.h
$(MAKE) -C utils fmgroids.h
+utils/errcodes.h: utils/generate-errcodes.pl utils/errcodes.txt
+ $(MAKE) -C utils errcodes.h
+
utils/probes.h: utils/probes.d
$(MAKE) -C utils probes.h
@@ -179,6 +187,10 @@ $(top_builddir)/src/include/catalog/schemapg.h: catalog/schemapg.h
cd $(dir $@) && rm -f $(notdir $@) && \
$(LN_S) "$$prereqdir/$(notdir $<)" .
+$(top_builddir)/src/include/utils/errcodes.h: utils/errcodes.h
+ cd $(dir $@) && rm -f $(notdir $@) && \
+ $(LN_S) ../../../$(subdir)/utils/errcodes.h .
+
$(top_builddir)/src/include/utils/fmgroids.h: utils/fmgroids.h
prereqdir=`cd $(dir $<) >/dev/null && pwd` && \
cd $(dir $@) && rm -f $(notdir $@) && \
@@ -195,11 +207,13 @@ utils/probes.o: utils/probes.d $(SUBDIROBJS)
##########################################################################
+# Be sure that these files get removed by the maintainer-clean target
distprep:
$(MAKE) -C parser gram.c gram.h scan.c
$(MAKE) -C bootstrap bootparse.c bootscanner.c
$(MAKE) -C catalog schemapg.h postgres.bki postgres.description postgres.shdescription
- $(MAKE) -C utils fmgrtab.c fmgroids.h
+ $(MAKE) -C replication repl_gram.c repl_scanner.c
+ $(MAKE) -C utils fmgrtab.c fmgroids.h errcodes.h
$(MAKE) -C utils/misc guc-file.c
@@ -295,20 +309,25 @@ ifeq ($(PORTNAME), win32)
endif
distclean: clean
- rm -f port/tas.s port/dynloader.c port/pg_sema.c port/pg_shmem.c
+ rm -f port/tas.s port/dynloader.c port/pg_sema.c port/pg_shmem.c \
+ port/pg_latch.c
maintainer-clean: distclean
rm -f bootstrap/bootparse.c \
bootstrap/bootscanner.c \
parser/gram.c \
- parser/scan.c \
parser/gram.h \
+ parser/scan.c \
catalog/schemapg.h \
catalog/postgres.bki \
catalog/postgres.description \
catalog/postgres.shdescription \
+ replication/repl_gram.c \
+ replication/repl_gram.h \
+ replication/repl_scanner.c \
utils/fmgroids.h \
utils/fmgrtab.c \
+ utils/errcodes.h \
utils/misc/guc-file.c
@@ -316,7 +335,7 @@ maintainer-clean: distclean
#
# Support for code development.
#
-# Use target "quick" to build "postgres" when you know all the subsystems
+# Use target "quick" to build "postgres" when you know all the subsystems
# are up to date. It saves the time of doing all the submakes.
.PHONY: quick
quick: $(OBJS)
diff --git a/src/backend/access/Makefile b/src/backend/access/Makefile
index cfbb5a5d35..a4c4ca7da9 100644
--- a/src/backend/access/Makefile
+++ b/src/backend/access/Makefile
@@ -1,7 +1,7 @@
#
# Makefile for the access methods module
#
-# $PostgreSQL: pgsql/src/backend/access/Makefile,v 1.14 2008/02/19 10:30:06 petere Exp $
+# src/backend/access/Makefile
#
subdir = src/backend/access
diff --git a/src/backend/access/common/Makefile b/src/backend/access/common/Makefile
index 011c60fce5..1fa6de0823 100644
--- a/src/backend/access/common/Makefile
+++ b/src/backend/access/common/Makefile
@@ -4,7 +4,7 @@
# Makefile for access/common
#
# IDENTIFICATION
-# $PostgreSQL: pgsql/src/backend/access/common/Makefile,v 1.26 2009/08/06 20:44:31 tgl Exp $
+# src/backend/access/common/Makefile
#
#-------------------------------------------------------------------------
diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c
index d34f00261a..cb1f8f89fd 100644
--- a/src/backend/access/common/heaptuple.c
+++ b/src/backend/access/common/heaptuple.c
@@ -45,12 +45,12 @@
* and we'd like to still refer to them via C struct offsets.
*
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/common/heaptuple.c,v 1.130 2010/01/10 04:26:36 rhaas Exp $
+ * src/backend/access/common/heaptuple.c
*
*-------------------------------------------------------------------------
*/
@@ -353,7 +353,7 @@ nocachegetattr(HeapTuple tuple,
*
* check to see if any preceding bits are null...
*/
- int byte = attnum >> 3;
+ int byte = attnum >> 3;
int finalbit = attnum & 0x07;
/* check for nulls "before" final bit of last byte */
diff --git a/src/backend/access/common/indextuple.c b/src/backend/access/common/indextuple.c
index 3ce377b854..85c43199aa 100644
--- a/src/backend/access/common/indextuple.c
+++ b/src/backend/access/common/indextuple.c
@@ -4,12 +4,12 @@
* This file contains index tuple accessor and mutator routines,
* as well as various tuple utilities.
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/common/indextuple.c,v 1.91 2010/01/10 04:26:36 rhaas Exp $
+ * src/backend/access/common/indextuple.c
*
*-------------------------------------------------------------------------
*/
@@ -237,7 +237,7 @@ nocache_index_getattr(IndexTuple tup,
* Now check to see if any preceding bits are null...
*/
{
- int byte = attnum >> 3;
+ int byte = attnum >> 3;
int finalbit = attnum & 0x07;
/* check for nulls "before" final bit of last byte */
diff --git a/src/backend/access/common/printtup.c b/src/backend/access/common/printtup.c
index 8ff4f6aad5..8b5f4a862f 100644
--- a/src/backend/access/common/printtup.c
+++ b/src/backend/access/common/printtup.c
@@ -5,11 +5,11 @@
* clients and standalone backends are supported here).
*
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/common/printtup.c,v 1.106 2010/01/02 16:57:33 momjian Exp $
+ * src/backend/access/common/printtup.c
*
*-------------------------------------------------------------------------
*/
diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c
index cd4f59005a..465742556f 100644
--- a/src/backend/access/common/reloptions.c
+++ b/src/backend/access/common/reloptions.c
@@ -3,12 +3,12 @@
* reloptions.c
* Core support for relation options (pg_class.reloptions)
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/common/reloptions.c,v 1.35 2010/06/07 02:59:02 itagaki Exp $
+ * src/backend/access/common/reloptions.c
*
*-------------------------------------------------------------------------
*/
@@ -782,6 +782,9 @@ extractRelOptions(HeapTuple tuple, TupleDesc tupdesc, Oid amoptions)
case RELKIND_INDEX:
options = index_reloptions(amoptions, datum, false);
break;
+ case RELKIND_FOREIGN_TABLE:
+ options = NULL;
+ break;
default:
Assert(false); /* can't get here */
options = NULL; /* keep compiler quiet */
@@ -1174,7 +1177,7 @@ heap_reloptions(char relkind, Datum reloptions, bool validate)
case RELKIND_RELATION:
return default_reloptions(reloptions, validate, RELOPT_KIND_HEAP);
default:
- /* sequences, composite types and views are not supported */
+ /* other relkinds are not supported */
return NULL;
}
}
@@ -1203,7 +1206,7 @@ index_reloptions(RegProcedure amoptions, Datum reloptions, bool validate)
/* Can't use OidFunctionCallN because we might get a NULL result */
fmgr_info(amoptions, &flinfo);
- InitFunctionCallInfoData(fcinfo, &flinfo, 2, NULL, NULL);
+ InitFunctionCallInfoData(fcinfo, &flinfo, 2, InvalidOid, NULL, NULL);
fcinfo.arg[0] = reloptions;
fcinfo.arg[1] = BoolGetDatum(validate);
diff --git a/src/backend/access/common/scankey.c b/src/backend/access/common/scankey.c
index 23d6b88d41..c879b8aaa4 100644
--- a/src/backend/access/common/scankey.c
+++ b/src/backend/access/common/scankey.c
@@ -3,18 +3,19 @@
* scankey.c
* scan key support code
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/common/scankey.c,v 1.34 2010/01/02 16:57:33 momjian Exp $
+ * src/backend/access/common/scankey.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/skey.h"
+#include "catalog/pg_collation.h"
/*
@@ -33,6 +34,7 @@ ScanKeyEntryInitialize(ScanKey entry,
AttrNumber attributeNumber,
StrategyNumber strategy,
Oid subtype,
+ Oid collation,
RegProcedure procedure,
Datum argument)
{
@@ -40,9 +42,12 @@ ScanKeyEntryInitialize(ScanKey entry,
entry->sk_attno = attributeNumber;
entry->sk_strategy = strategy;
entry->sk_subtype = subtype;
+ entry->sk_collation = collation;
entry->sk_argument = argument;
if (RegProcedureIsValid(procedure))
+ {
fmgr_info(procedure, &entry->sk_func);
+ }
else
{
Assert(flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL));
@@ -53,12 +58,16 @@ ScanKeyEntryInitialize(ScanKey entry,
/*
* ScanKeyInit
* Shorthand version of ScanKeyEntryInitialize: flags and subtype
- * are assumed to be zero (the usual value).
+ * are assumed to be zero (the usual value), and collation is defaulted.
*
* This is the recommended version for hardwired lookups in system catalogs.
* It cannot handle NULL arguments, unary operators, or nondefault operators,
* but we need none of those features for most hardwired lookups.
*
+ * We set collation to DEFAULT_COLLATION_OID always. This is appropriate
+ * for textual columns in system catalogs, and it will be ignored for
+ * non-textual columns, so it's not worth trying to be more finicky.
+ *
* Note: CurrentMemoryContext at call should be as long-lived as the ScanKey
* itself, because that's what will be used for any subsidiary info attached
* to the ScanKey's FmgrInfo record.
@@ -74,6 +83,7 @@ ScanKeyInit(ScanKey entry,
entry->sk_attno = attributeNumber;
entry->sk_strategy = strategy;
entry->sk_subtype = InvalidOid;
+ entry->sk_collation = DEFAULT_COLLATION_OID;
entry->sk_argument = argument;
fmgr_info(procedure, &entry->sk_func);
}
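
As a concrete illustration of the comment above, here is a minimal sketch (not part of the patch) of a hardwired catalog lookup via ScanKeyInit(); the catalog column and comparison procedure are chosen only as examples. With this change, the collation field is filled in automatically.

    ScanKeyData skey;

    /* find pg_class rows whose relname equals "pg_type" (illustrative) */
    ScanKeyInit(&skey,
                Anum_pg_class_relname,       /* attribute number in pg_class */
                BTEqualStrategyNumber,       /* equality strategy */
                F_NAMEEQ,                    /* name equality procedure */
                CStringGetDatum("pg_type")); /* comparison argument */

    /* after this patch, skey.sk_collation is DEFAULT_COLLATION_OID */
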
@@ -93,6 +103,7 @@ ScanKeyEntryInitializeWithInfo(ScanKey entry,
AttrNumber attributeNumber,
StrategyNumber strategy,
Oid subtype,
+ Oid collation,
FmgrInfo *finfo,
Datum argument)
{
@@ -100,6 +111,7 @@ ScanKeyEntryInitializeWithInfo(ScanKey entry,
entry->sk_attno = attributeNumber;
entry->sk_strategy = strategy;
entry->sk_subtype = subtype;
+ entry->sk_collation = collation;
entry->sk_argument = argument;
fmgr_info_copy(&entry->sk_func, finfo, CurrentMemoryContext);
}
diff --git a/src/backend/access/common/tupconvert.c b/src/backend/access/common/tupconvert.c
index fec3b3ef58..34e5f11440 100644
--- a/src/backend/access/common/tupconvert.c
+++ b/src/backend/access/common/tupconvert.c
@@ -9,12 +9,12 @@
* executor's "junkfilter" routines, but these functions work on bare
* HeapTuples rather than TupleTableSlots.
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/common/tupconvert.c,v 1.4 2010/02/26 02:00:33 momjian Exp $
+ * src/backend/access/common/tupconvert.c
*
*-------------------------------------------------------------------------
*/
diff --git a/src/backend/access/common/tupdesc.c b/src/backend/access/common/tupdesc.c
index 2125fdfb0b..16979c4ea7 100644
--- a/src/backend/access/common/tupdesc.c
+++ b/src/backend/access/common/tupdesc.c
@@ -3,12 +3,12 @@
* tupdesc.c
* POSTGRES tuple descriptor support code
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/common/tupdesc.c,v 1.133 2010/02/14 18:42:12 rhaas Exp $
+ * src/backend/access/common/tupdesc.c
*
* NOTES
* some of the executor utility code such as "ExecTypeFromTL" should be
@@ -360,6 +360,8 @@ equalTupleDescs(TupleDesc tupdesc1, TupleDesc tupdesc2)
return false;
if (attr1->attinhcount != attr2->attinhcount)
return false;
+ if (attr1->attcollation != attr2->attcollation)
+ return false;
/* attacl and attoptions are not even present... */
}
@@ -427,6 +429,10 @@ equalTupleDescs(TupleDesc tupdesc1, TupleDesc tupdesc2)
* TupleDescInitEntry
* This function initializes a single attribute structure in
* a previously allocated tuple descriptor.
+ *
+ * Note that attcollation is set to the default for the specified datatype.
+ * If a nondefault collation is needed, insert it afterwards using
+ * TupleDescInitEntryCollation.
*/
void
TupleDescInitEntry(TupleDesc desc,
@@ -488,10 +494,32 @@ TupleDescInitEntry(TupleDesc desc,
att->attbyval = typeForm->typbyval;
att->attalign = typeForm->typalign;
att->attstorage = typeForm->typstorage;
+ att->attcollation = typeForm->typcollation;
ReleaseSysCache(tuple);
}
+/*
+ * TupleDescInitEntryCollation
+ *
+ * Assign a nondefault collation to a previously initialized tuple descriptor
+ * entry.
+ */
+void
+TupleDescInitEntryCollation(TupleDesc desc,
+ AttrNumber attributeNumber,
+ Oid collationid)
+{
+ /*
+ * sanity checks
+ */
+ AssertArg(PointerIsValid(desc));
+ AssertArg(attributeNumber >= 1);
+ AssertArg(attributeNumber <= desc->natts);
+
+ desc->attrs[attributeNumber - 1]->attcollation = collationid;
+}
+
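A minimal usage sketch (not from the patch) of the new function, assuming a collation OID "mycollation" is already at hand: TupleDescInitEntry fills in the type's default collation, and the new call overrides it afterwards.

    TupleDesc   tupdesc = CreateTemplateTupleDesc(1, false);

    TupleDescInitEntry(tupdesc, (AttrNumber) 1, "name", TEXTOID, -1, 0);
    /* attcollation is now text's default; override with a specific one */
    TupleDescInitEntryCollation(tupdesc, (AttrNumber) 1, mycollation);
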
/*
* BuildDescForRelation
@@ -513,6 +541,7 @@ BuildDescForRelation(List *schema)
char *attname;
Oid atttypid;
int32 atttypmod;
+ Oid attcollation;
int attdim;
/*
@@ -536,7 +565,8 @@ BuildDescForRelation(List *schema)
attnum++;
attname = entry->colname;
- atttypid = typenameTypeId(NULL, entry->typeName, &atttypmod);
+ typenameTypeIdAndMod(NULL, entry->typeName, &atttypid, &atttypmod);
+ attcollation = GetColumnDefCollation(NULL, entry, atttypid);
attdim = list_length(entry->typeName->arrayBounds);
if (entry->typeName->setof)
@@ -549,6 +579,7 @@ BuildDescForRelation(List *schema)
atttypid, atttypmod, attdim);
/* Override TupleDescInitEntry's settings as requested */
+ TupleDescInitEntryCollation(desc, attnum, attcollation);
if (entry->storage)
desc->attrs[attnum - 1]->attstorage = entry->storage;
@@ -582,24 +613,28 @@ BuildDescForRelation(List *schema)
* BuildDescFromLists
*
* Build a TupleDesc given lists of column names (as String nodes),
- * column type OIDs, and column typmods. No constraints are generated.
+ * column type OIDs, typmods, and collation OIDs.
+ *
+ * No constraints are generated.
*
* This is essentially a cut-down version of BuildDescForRelation for use
* with functions returning RECORD.
*/
TupleDesc
-BuildDescFromLists(List *names, List *types, List *typmods)
+BuildDescFromLists(List *names, List *types, List *typmods, List *collations)
{
int natts;
AttrNumber attnum;
ListCell *l1;
ListCell *l2;
ListCell *l3;
+ ListCell *l4;
TupleDesc desc;
natts = list_length(names);
Assert(natts == list_length(types));
Assert(natts == list_length(typmods));
+ Assert(natts == list_length(collations));
/*
* allocate a new tuple descriptor
@@ -610,20 +645,25 @@ BuildDescFromLists(List *names, List *types, List *typmods)
l2 = list_head(types);
l3 = list_head(typmods);
+ l4 = list_head(collations);
foreach(l1, names)
{
char *attname = strVal(lfirst(l1));
Oid atttypid;
int32 atttypmod;
+ Oid attcollation;
atttypid = lfirst_oid(l2);
l2 = lnext(l2);
atttypmod = lfirst_int(l3);
l3 = lnext(l3);
+ attcollation = lfirst_oid(l4);
+ l4 = lnext(l4);
attnum++;
TupleDescInitEntry(desc, attnum, attname, atttypid, atttypmod, 0);
+ TupleDescInitEntryCollation(desc, attnum, attcollation);
}
return desc;
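
A hedged sketch of the new four-list calling convention; the lists below are illustrative only (non-collatable columns get InvalidOid, collatable ones a real collation OID).

    List       *names = list_make2(makeString("id"), makeString("label"));
    List       *types = list_make2_oid(INT4OID, TEXTOID);
    List       *typmods = list_make2_int(-1, -1);
    List       *colls = list_make2_oid(InvalidOid, DEFAULT_COLLATION_OID);
    TupleDesc   desc = BuildDescFromLists(names, types, typmods, colls);
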
diff --git a/src/backend/access/gin/Makefile b/src/backend/access/gin/Makefile
index 23b75fc1d8..889dde6a27 100644
--- a/src/backend/access/gin/Makefile
+++ b/src/backend/access/gin/Makefile
@@ -4,7 +4,7 @@
# Makefile for access/gin
#
# IDENTIFICATION
-# $PostgreSQL: pgsql/src/backend/access/gin/Makefile,v 1.4 2009/03/24 20:17:10 tgl Exp $
+# src/backend/access/gin/Makefile
#
#-------------------------------------------------------------------------
diff --git a/src/backend/access/gin/README b/src/backend/access/gin/README
index cd406935e0..67159d8529 100644
--- a/src/backend/access/gin/README
+++ b/src/backend/access/gin/README
@@ -1,4 +1,4 @@
-$PostgreSQL: pgsql/src/backend/access/gin/README,v 1.7 2010/02/08 04:33:52 tgl Exp $
+src/backend/access/gin/README
Gin for PostgreSQL
==================
@@ -9,29 +9,29 @@ Gin stands for Generalized Inverted Index and should be considered as a genie,
not a drink.
Generalized means that the index does not know which operation it accelerates.
-It instead works with custom strategies, defined for specific data types (read
-"Index Method Strategies" in the PostgreSQL documentation). In that sense, Gin
+It instead works with custom strategies, defined for specific data types (read
+"Index Method Strategies" in the PostgreSQL documentation). In that sense, Gin
is similar to GiST and differs from btree indices, which have predefined,
comparison-based operations.
-An inverted index is an index structure storing a set of (key, posting list)
-pairs, where 'posting list' is a set of documents in which the key occurs.
-(A text document would usually contain many keys.) The primary goal of
+An inverted index is an index structure storing a set of (key, posting list)
+pairs, where 'posting list' is a set of heap rows in which the key occurs.
+(A text document would usually contain many keys.) The primary goal of
Gin indices is support for highly scalable, full-text search in PostgreSQL.
-Gin consists of a B-tree index constructed over entries (ET, entries tree),
-where each entry is an element of the indexed value (element of array, lexeme
-for tsvector) and where each tuple in a leaf page is either a pointer to a
-B-tree over item pointers (PT, posting tree), or a list of item pointers
-(PL, posting list) if the tuple is small enough.
+A Gin index consists of a B-tree index constructed over key values,
+where each key is an element of some indexed items (element of array, lexeme
+for tsvector) and where each tuple in a leaf page contains either a pointer to
+a B-tree over item pointers (posting tree), or a simple list of item pointers
+(posting list) if the list is small enough.
-Note: There is no delete operation for ET. The reason for this is that in
-our experience, the set of distinct words in a large corpus changes very
-rarely. This greatly simplifies the code and concurrency algorithms.
+Note: There is no delete operation in the key (entry) tree. The reason for
+this is that in our experience, the set of distinct words in a large corpus
+changes very slowly. This greatly simplifies the code and concurrency
+algorithms.
-Gin comes with built-in support for one-dimensional arrays (eg. integer[],
-text[]), but no support for NULL elements. The following operations are
-available:
+Core PostgreSQL includes built-in Gin support for one-dimensional arrays
+(eg. integer[], text[]). The following operations are available:
* contains: value_array @> query_array
* overlaps: value_array && query_array
@@ -59,61 +59,177 @@ Gin Fuzzy Limit
There are often situations when a full-text search returns a very large set of
results. Since reading tuples from the disk and sorting them could take a
-lot of time, this is unacceptable for production. (Note that the search
+lot of time, this is unacceptable for production. (Note that the search
itself is very fast.)
-Such queries usually contain very frequent lexemes, so the results are not
-very helpful. To facilitate execution of such queries Gin has a configurable
-soft upper limit on the size of the returned set, determined by the
-'gin_fuzzy_search_limit' GUC variable. This is set to 0 by default (no
+Such queries usually contain very frequent lexemes, so the results are not
+very helpful. To facilitate execution of such queries Gin has a configurable
+soft upper limit on the size of the returned set, determined by the
+'gin_fuzzy_search_limit' GUC variable. This is set to 0 by default (no
limit).
If a non-zero search limit is set, then the returned set is a subset of the
whole result set, chosen at random.
-"Soft" means that the actual number of returned results could slightly differ
-from the specified limit, depending on the query and the quality of the
+"Soft" means that the actual number of returned results could differ
+from the specified limit, depending on the query and the quality of the
system's random number generator.
From experience, a value of 'gin_fuzzy_search_limit' in the thousands
(eg. 5000-20000) works well. This means that 'gin_fuzzy_search_limit' will
-have no effect for queries returning a result set with less tuples than this
+have no effect for queries returning a result set with less tuples than this
number.
-Limitations
------------
-
- * No support for multicolumn indices
- * Gin doesn't uses scan->kill_prior_tuple & scan->ignore_killed_tuples
- * Gin searches entries only by equality matching. This may be improved in
- future.
- * Gin doesn't support full scans of indices.
- * Gin doesn't index NULL values.
+Index structure
+---------------
-Open Items
-----------
+The "items" that a GIN index indexes are composite values that contain
+zero or more "keys". For example, an item might be an integer array, and
+then the keys would be the individual integer values. The index actually
+stores and searches for the key values, not the items per se. In the
+pg_opclass entry for a GIN opclass, the opcintype is the data type of the
+items, and the opckeytype is the data type of the keys. GIN is optimized
+for cases where items contain many keys and the same key values appear
+in many different items.
+
+A GIN index contains a metapage, a btree of key entries, and possibly
+"posting tree" pages, which hold the overflow when a key entry acquires
+too many heap tuple pointers to fit in a btree page. Additionally, if the
+fast-update feature is enabled, there can be "list pages" holding "pending"
+key entries that haven't yet been merged into the main btree. The list
+pages have to be scanned linearly when doing a search, so the pending
+entries should be merged into the main btree before there get to be too
+many of them. The advantage of the pending list is that bulk insertion of
+a few thousand entries can be much faster than retail insertion. (The win
+comes mainly from not having to do multiple searches/insertions when the
+same key appears in multiple new heap tuples.)
+
+Key entries are nominally of the same IndexEntry format as used in other
+index types, but since a leaf key entry typically refers to multiple heap
+tuples, there are significant differences. (See GinFormTuple, which works
+by building a "normal" index tuple and then modifying it.) The points to
+know are:
+
+* In a single-column index, a key tuple just contains the key datum, but
+in a multi-column index, a key tuple contains the pair (column number,
+key datum) where the column number is stored as an int2. This is needed
+to support different key data types in different columns. This much of
+the tuple is built by index_form_tuple according to the usual rules.
+The column number (if present) can never be null, but the key datum can
+be, in which case a null bitmap is present as usual. (As usual for index
+tuples, the size of the null bitmap is fixed at INDEX_MAX_KEYS.)
+
+* If the key datum is null (ie, IndexTupleHasNulls() is true), then
+just after the nominal index data (ie, at offset IndexInfoFindDataOffset
+or IndexInfoFindDataOffset + sizeof(int2)) there is a byte indicating
+the "category" of the null entry. These are the possible categories:
+ 1 = ordinary null key value extracted from an indexable item
+ 2 = placeholder for zero-key indexable item
+ 3 = placeholder for null indexable item
+Placeholder null entries are inserted into the index because otherwise
+there would be no index entry at all for an empty or null indexable item,
+which would mean that full index scans couldn't be done and various corner
+cases would give wrong answers. The different categories of null entries
+are treated as distinct keys by the btree, but heap itempointers for the
+same category of null entry are merged into one index entry just as happens
+with ordinary key entries.
+
+* In a key entry at the btree leaf level, at the next SHORTALIGN boundary,
+there is an array of zero or more ItemPointers, which store the heap tuple
+TIDs for which the indexable items contain this key. This is called the
+"posting list". The TIDs in a posting list must appear in sorted order.
+If the list would be too big for the index tuple to fit on an index page,
+the ItemPointers are pushed out to a separate posting page or pages, and
+none appear in the key entry itself. The separate pages are called a
+"posting tree"; they are organized as a btree of ItemPointer values.
+Note that in either case, the ItemPointers associated with a key can
+easily be read out in sorted order; this is relied on by the scan
+algorithms.
+
+* The index tuple header fields of a leaf key entry are abused as follows:
+
+1) Posting list case:
+
+* ItemPointerGetBlockNumber(&itup->t_tid) contains the offset from index
+ tuple start to the posting list.
+ Access macros: GinGetPostingOffset(itup) / GinSetPostingOffset(itup,n)
+
+* ItemPointerGetOffsetNumber(&itup->t_tid) contains the number of elements
+ in the posting list (number of heap itempointers).
+ Access macros: GinGetNPosting(itup) / GinSetNPosting(itup,n)
+
+* If IndexTupleHasNulls(itup) is true, the null category byte can be
+ accessed/set with GinGetNullCategory(itup,gs) / GinSetNullCategory(itup,gs,c)
+
+* The posting list can be accessed with GinGetPosting(itup)
+
+2) Posting tree case:
+
+* ItemPointerGetBlockNumber(&itup->t_tid) contains the index block number
+ of the root of the posting tree.
+ Access macros: GinGetPostingTree(itup) / GinSetPostingTree(itup, blkno)
+
+* ItemPointerGetOffsetNumber(&itup->t_tid) contains the magic number
+ GIN_TREE_POSTING, which distinguishes this from the posting-list case
+ (it's large enough that that many heap itempointers couldn't possibly
+ fit on an index page). This value is inserted automatically by the
+ GinSetPostingTree macro.
+
+* If IndexTupleHasNulls(itup) is true, the null category byte can be
+ accessed/set with GinGetNullCategory(itup) / GinSetNullCategory(itup,c)
+
+* The posting list is not present and must not be accessed.
+
+Use the macro GinIsPostingTree(itup) to determine which case applies.
+
+In both cases, itup->t_info & INDEX_SIZE_MASK contains actual total size of
+tuple, and the INDEX_VAR_MASK and INDEX_NULL_MASK bits have their normal
+meanings as set by index_form_tuple.
+
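To tie the above together, a short sketch (not part of the patch) of how a reader of a leaf key entry would use the macros just described; itup and gs (the index's GinState) are assumed to be available.

    if (GinIsPostingTree(itup))
    {
        /* TIDs live in a separate posting tree; follow its root block */
        BlockNumber rootblkno = GinGetPostingTree(itup);
        /* ... descend the posting tree starting at rootblkno ... */
    }
    else
    {
        /* TIDs are stored inline in the key entry, already sorted */
        int         nipd = GinGetNPosting(itup);
        ItemPointer ipd = GinGetPosting(itup);
        /* ... scan ipd[0 .. nipd-1] ... */
    }

    if (IndexTupleHasNulls(itup))
    {
        /* the key datum is a null of some category */
        GinNullCategory cat = GinGetNullCategory(itup, gs);
        /* ... */
    }
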
+Index tuples in non-leaf levels of the btree contain the optional column
+number, key datum, and null category byte as above. They do not contain
+a posting list. ItemPointerGetBlockNumber(&itup->t_tid) is the downlink
+to the next lower btree level, and ItemPointerGetOffsetNumber(&itup->t_tid)
+is InvalidOffsetNumber. Use the access macros GinGetDownlink/GinSetDownlink
+to get/set the downlink.
+
+Index entries that appear in "pending list" pages work a tad differently as
+well. The optional column number, key datum, and null category byte are as
+for other GIN index entries. However, there is always exactly one heap
+itempointer associated with a pending entry, and it is stored in the t_tid
+header field just as in non-GIN indexes. There is no posting list.
+Furthermore, the code that searches the pending list assumes that all
+entries for a given heap tuple appear consecutively in the pending list and
+are sorted by the column-number-plus-key-datum. The GIN_LIST_FULLROW page
+flag bit tells whether entries for a given heap tuple are spread across
+multiple pending-list pages. If GIN_LIST_FULLROW is set, the page contains
+all the entries for one or more heap tuples. If GIN_LIST_FULLROW is clear,
+the page contains entries for only one heap tuple, *and* they are not all
+the entries for that tuple. (Thus, a heap tuple whose entries do not all
+fit on one pending-list page must have those pages to itself, even if this
+results in wasting much of the space on the preceding page and the last
+page for the tuple.)
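
A small sketch (not from the patch) of how a pending-list reader would act on the flag bit just described; GinPageGetOpaque follows the naming used elsewhere in this patch, and the flags field name is an assumption.

    if (GinPageGetOpaque(page)->flags & GIN_LIST_FULLROW)
    {
        /* page holds complete entry sets for one or more heap tuples */
    }
    else
    {
        /* entries for the current heap tuple continue on the next list page */
    }
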
-We appreciate any comments, help and suggestions.
+Limitations
+-----------
- * Teach optimizer/executor that GIN is intrinsically clustered. i.e., it
- always returns ItemPointer in ascending order.
- * Tweak gincostestimate.
+ * Gin doesn't use scan->kill_prior_tuple & scan->ignore_killed_tuples
+ * Gin searches entries only by equality matching, or simple range
+ matching using the "partial match" feature.
TODO
----
Nearest future:
- * Opclasses for all types (no programming, just many catalog changes).
+ * Opclasses for more types (no programming, just many catalog changes)
Distant future:
* Replace B-tree of entries to something like GiST
- * Add multicolumn support
- * Optimize insert operations (background index insertion)
Authors
-------
-All work was done by Teodor Sigaev (teodor@sigaev.ru) and Oleg Bartunov
+Original work was done by Teodor Sigaev (teodor@sigaev.ru) and Oleg Bartunov
(oleg@sai.msu.su).
diff --git a/src/backend/access/gin/ginarrayproc.c b/src/backend/access/gin/ginarrayproc.c
index db5c9e3d19..2de58604ee 100644
--- a/src/backend/access/gin/ginarrayproc.c
+++ b/src/backend/access/gin/ginarrayproc.c
@@ -4,17 +4,19 @@
* support functions for GIN's indexing of any array
*
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/gin/ginarrayproc.c,v 1.17 2010/01/02 16:57:33 momjian Exp $
+ * src/backend/access/gin/ginarrayproc.c
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/gin.h"
+#include "access/skey.h"
#include "utils/array.h"
+#include "utils/builtins.h"
#include "utils/lsyscache.h"
@@ -23,34 +25,23 @@
#define GinContainedStrategy 3
#define GinEqualStrategy 4
-#define ARRAYCHECK(x) do { \
- if ( ARR_HASNULL(x) ) \
- ereport(ERROR, \
- (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), \
- errmsg("array must not contain null values"))); \
-} while(0)
-
/*
- * Function used as extractValue and extractQuery both
+ * extractValue support function
*/
Datum
ginarrayextract(PG_FUNCTION_ARGS)
{
- ArrayType *array;
- int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
- Datum *entries = NULL;
+ /* Make copy of array input to ensure it doesn't disappear while in use */
+ ArrayType *array = PG_GETARG_ARRAYTYPE_P_COPY(0);
+ int32 *nkeys = (int32 *) PG_GETARG_POINTER(1);
+ bool **nullFlags = (bool **) PG_GETARG_POINTER(2);
int16 elmlen;
bool elmbyval;
char elmalign;
-
- /*
- * we should guarantee that array will not be destroyed during all
- * operation
- */
- array = PG_GETARG_ARRAYTYPE_P_COPY(0);
-
- ARRAYCHECK(array);
+ Datum *elems;
+ bool *nulls;
+ int nelems;
get_typlenbyvalalign(ARR_ELEMTYPE(array),
&elmlen, &elmbyval, &elmalign);
@@ -58,89 +49,159 @@ ginarrayextract(PG_FUNCTION_ARGS)
deconstruct_array(array,
ARR_ELEMTYPE(array),
elmlen, elmbyval, elmalign,
- &entries, NULL, (int *) nentries);
+ &elems, &nulls, &nelems);
- if (*nentries == 0 && PG_NARGS() == 3)
- {
- switch (PG_GETARG_UINT16(2)) /* StrategyNumber */
- {
- case GinOverlapStrategy:
- *nentries = -1; /* nobody can be found */
- break;
- case GinContainsStrategy:
- case GinContainedStrategy:
- case GinEqualStrategy:
- default: /* require fullscan: GIN can't find void
- * arrays */
- break;
- }
- }
+ *nkeys = nelems;
+ *nullFlags = nulls;
- /* we should not free array, entries[i] points into it */
- PG_RETURN_POINTER(entries);
+ /* we should not free array, elems[i] points into it */
+ PG_RETURN_POINTER(elems);
}
+/*
+ * Formerly, ginarrayextract had only two arguments. Now it has three,
+ * but we still need a pg_proc entry with two args to support reloading
+ * pre-9.1 contrib/intarray opclass declarations. This compatibility
+ * function should go away eventually.
+ */
+Datum
+ginarrayextract_2args(PG_FUNCTION_ARGS)
+{
+ if (PG_NARGS() < 3) /* should not happen */
+ elog(ERROR, "ginarrayextract requires three arguments");
+ return ginarrayextract(fcinfo);
+}
+
+/*
+ * extractQuery support function
+ */
Datum
ginqueryarrayextract(PG_FUNCTION_ARGS)
{
- PG_RETURN_DATUM(DirectFunctionCall3(ginarrayextract,
- PG_GETARG_DATUM(0),
- PG_GETARG_DATUM(1),
- PG_GETARG_DATUM(2)));
+ /* Make copy of array input to ensure it doesn't disappear while in use */
+ ArrayType *array = PG_GETARG_ARRAYTYPE_P_COPY(0);
+ int32 *nkeys = (int32 *) PG_GETARG_POINTER(1);
+ StrategyNumber strategy = PG_GETARG_UINT16(2);
+
+ /* bool **pmatch = (bool **) PG_GETARG_POINTER(3); */
+ /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
+ bool **nullFlags = (bool **) PG_GETARG_POINTER(5);
+ int32 *searchMode = (int32 *) PG_GETARG_POINTER(6);
+ int16 elmlen;
+ bool elmbyval;
+ char elmalign;
+ Datum *elems;
+ bool *nulls;
+ int nelems;
+
+ get_typlenbyvalalign(ARR_ELEMTYPE(array),
+ &elmlen, &elmbyval, &elmalign);
+
+ deconstruct_array(array,
+ ARR_ELEMTYPE(array),
+ elmlen, elmbyval, elmalign,
+ &elems, &nulls, &nelems);
+
+ *nkeys = nelems;
+ *nullFlags = nulls;
+
+ switch (strategy)
+ {
+ case GinOverlapStrategy:
+ *searchMode = GIN_SEARCH_MODE_DEFAULT;
+ break;
+ case GinContainsStrategy:
+ if (nelems > 0)
+ *searchMode = GIN_SEARCH_MODE_DEFAULT;
+ else /* everything contains the empty set */
+ *searchMode = GIN_SEARCH_MODE_ALL;
+ break;
+ case GinContainedStrategy:
+ /* empty set is contained in everything */
+ *searchMode = GIN_SEARCH_MODE_INCLUDE_EMPTY;
+ break;
+ case GinEqualStrategy:
+ if (nelems > 0)
+ *searchMode = GIN_SEARCH_MODE_DEFAULT;
+ else
+ *searchMode = GIN_SEARCH_MODE_INCLUDE_EMPTY;
+ break;
+ default:
+ elog(ERROR, "ginqueryarrayextract: unknown strategy number: %d",
+ strategy);
+ }
+
+ /* we should not free array, elems[i] points into it */
+ PG_RETURN_POINTER(elems);
}
+/*
+ * consistent support function
+ */
Datum
ginarrayconsistent(PG_FUNCTION_ARGS)
{
bool *check = (bool *) PG_GETARG_POINTER(0);
StrategyNumber strategy = PG_GETARG_UINT16(1);
- ArrayType *query = PG_GETARG_ARRAYTYPE_P(2);
- /* int32 nkeys = PG_GETARG_INT32(3); */
+ /* ArrayType *query = PG_GETARG_ARRAYTYPE_P(2); */
+ int32 nkeys = PG_GETARG_INT32(3);
+
/* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
bool *recheck = (bool *) PG_GETARG_POINTER(5);
- bool res;
- int i,
- nentries;
- /* ARRAYCHECK was already done by previous ginarrayextract call */
+ /* Datum *queryKeys = (Datum *) PG_GETARG_POINTER(6); */
+ bool *nullFlags = (bool *) PG_GETARG_POINTER(7);
+ bool res;
+ int32 i;
switch (strategy)
{
case GinOverlapStrategy:
/* result is not lossy */
*recheck = false;
- /* at least one element in check[] is true, so result = true */
- res = true;
- break;
- case GinContainedStrategy:
- /* we will need recheck */
- *recheck = true;
- /* at least one element in check[] is true, so result = true */
- res = true;
+ /* must have a match for at least one non-null element */
+ res = false;
+ for (i = 0; i < nkeys; i++)
+ {
+ if (check[i] && !nullFlags[i])
+ {
+ res = true;
+ break;
+ }
+ }
break;
case GinContainsStrategy:
/* result is not lossy */
*recheck = false;
- /* must have all elements in check[] true */
- nentries = ArrayGetNItems(ARR_NDIM(query), ARR_DIMS(query));
+ /* must have all elements in check[] true, and no nulls */
res = true;
- for (i = 0; i < nentries; i++)
+ for (i = 0; i < nkeys; i++)
{
- if (!check[i])
+ if (!check[i] || nullFlags[i])
{
res = false;
break;
}
}
break;
+ case GinContainedStrategy:
+ /* we will need recheck */
+ *recheck = true;
+ /* can't do anything else useful here */
+ res = true;
+ break;
case GinEqualStrategy:
/* we will need recheck */
*recheck = true;
- /* must have all elements in check[] true */
- nentries = ArrayGetNItems(ARR_NDIM(query), ARR_DIMS(query));
+
+ /*
+ * Must have all elements in check[] true; no discrimination
+ * against nulls here. This is because array_contain_compare and
+ * array_eq handle nulls differently ...
+ */
res = true;
- for (i = 0; i < nentries; i++)
+ for (i = 0; i < nkeys; i++)
{
if (!check[i])
{
diff --git a/src/backend/access/gin/ginbtree.c b/src/backend/access/gin/ginbtree.c
index bb150a4689..739fa8afff 100644
--- a/src/backend/access/gin/ginbtree.c
+++ b/src/backend/access/gin/ginbtree.c
@@ -4,17 +4,17 @@
* page utilities routines for the postgres inverted index access method.
*
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/gin/ginbtree.c,v 1.15 2010/01/02 16:57:33 momjian Exp $
+ * src/backend/access/gin/ginbtree.c
*-------------------------------------------------------------------------
*/
#include "postgres.h"
-#include "access/gin.h"
+#include "access/gin_private.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"
@@ -104,7 +104,8 @@ ginFindLeafPage(GinBtree btree, GinBtreeStack *stack)
* ok, page is correctly locked, we should check to move right ..,
* root never has a right link, so small optimization
*/
- while (btree->fullScan == FALSE && stack->blkno != rootBlkno && btree->isMoveRight(btree, page))
+ while (btree->fullScan == FALSE && stack->blkno != rootBlkno &&
+ btree->isMoveRight(btree, page))
{
BlockNumber rightlink = GinPageGetOpaque(page)->rightlink;
@@ -173,8 +174,8 @@ freeGinBtreeStack(GinBtreeStack *stack)
* with vacuum process
*/
void
-findParents(GinBtree btree, GinBtreeStack *stack,
- BlockNumber rootBlkno)
+ginFindParents(GinBtree btree, GinBtreeStack *stack,
+ BlockNumber rootBlkno)
{
Page page;
@@ -226,7 +227,6 @@ findParents(GinBtree btree, GinBtreeStack *stack,
LockBuffer(root->buffer, GIN_UNLOCK);
Assert(blkno != InvalidBlockNumber);
-
for (;;)
{
buffer = ReadBuffer(btree->index, blkno);
@@ -267,9 +267,14 @@ findParents(GinBtree btree, GinBtreeStack *stack,
/*
* Insert value (stored in GinBtree) to tree described by stack
+ *
+ * During an index build, buildStats is non-null and the counters
+ * it contains should be incremented as needed.
+ *
+ * NB: the passed-in stack is freed, as though by freeGinBtreeStack.
*/
void
-ginInsertValue(GinBtree btree, GinBtreeStack *stack)
+ginInsertValue(GinBtree btree, GinBtreeStack *stack, GinStatsData *buildStats)
{
GinBtreeStack *parent = stack;
BlockNumber rootBlkno = InvalidBuffer;
@@ -299,7 +304,7 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack)
MarkBufferDirty(stack->buffer);
- if (!btree->index->rd_istemp)
+ if (RelationNeedsWAL(btree->index))
{
XLogRecPtr recptr;
@@ -308,10 +313,11 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack)
PageSetTLI(page, ThisTimeLineID);
}
- UnlockReleaseBuffer(stack->buffer);
+ LockBuffer(stack->buffer, GIN_UNLOCK);
END_CRIT_SECTION();
- freeGinBtreeStack(stack->parent);
+ freeGinBtreeStack(stack);
+
return;
}
else
@@ -325,9 +331,17 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack)
*/
newlpage = btree->splitPage(btree, stack->buffer, rbuffer, stack->off, &rdata);
-
((ginxlogSplit *) (rdata->data))->rootBlkno = rootBlkno;
+ /* During index build, count the newly-split page */
+ if (buildStats)
+ {
+ if (btree->isData)
+ buildStats->nDataPages++;
+ else
+ buildStats->nEntryPages++;
+ }
+
parent = stack->parent;
if (parent == NULL)
@@ -341,7 +355,6 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack)
((ginxlogSplit *) (rdata->data))->isRootSplit = TRUE;
((ginxlogSplit *) (rdata->data))->rrlink = InvalidBlockNumber;
-
page = BufferGetPage(stack->buffer);
lpage = BufferGetPage(lbuffer);
rpage = BufferGetPage(rbuffer);
@@ -360,7 +373,7 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack)
MarkBufferDirty(lbuffer);
MarkBufferDirty(stack->buffer);
- if (!btree->index->rd_istemp)
+ if (RelationNeedsWAL(btree->index))
{
XLogRecPtr recptr;
@@ -375,10 +388,20 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack)
UnlockReleaseBuffer(rbuffer);
UnlockReleaseBuffer(lbuffer);
- UnlockReleaseBuffer(stack->buffer);
-
+ LockBuffer(stack->buffer, GIN_UNLOCK);
END_CRIT_SECTION();
+ freeGinBtreeStack(stack);
+
+ /* During index build, count the newly-added root page */
+ if (buildStats)
+ {
+ if (btree->isData)
+ buildStats->nDataPages++;
+ else
+ buildStats->nEntryPages++;
+ }
+
return;
}
else
@@ -399,7 +422,7 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack)
MarkBufferDirty(rbuffer);
MarkBufferDirty(stack->buffer);
- if (!btree->index->rd_istemp)
+ if (RelationNeedsWAL(btree->index))
{
XLogRecPtr recptr;
@@ -433,7 +456,7 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack)
* rightmost page, but we don't find parent, we should use
* plain search...
*/
- findParents(btree, stack, rootBlkno);
+ ginFindParents(btree, stack, rootBlkno);
parent = stack->parent;
page = BufferGetPage(parent->buffer);
break;
diff --git a/src/backend/access/gin/ginbulk.c b/src/backend/access/gin/ginbulk.c
index bb726e69f4..9e5bab194d 100644
--- a/src/backend/access/gin/ginbulk.c
+++ b/src/backend/access/gin/ginbulk.c
@@ -4,83 +4,117 @@
* routines for fast build of inverted index
*
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/gin/ginbulk.c,v 1.19 2010/02/26 02:00:33 momjian Exp $
+ * src/backend/access/gin/ginbulk.c
*-------------------------------------------------------------------------
*/
#include "postgres.h"
-#include "access/gin.h"
+#include "access/gin_private.h"
#include "utils/datum.h"
#include "utils/memutils.h"
-#define DEF_NENTRY 2048
-#define DEF_NPTR 4
+#define DEF_NENTRY 2048 /* GinEntryAccumulator allocation quantum */
+#define DEF_NPTR 5 /* ItemPointer initial allocation quantum */
-static void *
-ginAppendData(void *old, void *new, void *arg)
-{
- EntryAccumulator *eo = (EntryAccumulator *) old,
- *en = (EntryAccumulator *) new;
+/* Combiner function for rbtree.c */
+static void
+ginCombineData(RBNode *existing, const RBNode *newdata, void *arg)
+{
+ GinEntryAccumulator *eo = (GinEntryAccumulator *) existing;
+ const GinEntryAccumulator *en = (const GinEntryAccumulator *) newdata;
BuildAccumulator *accum = (BuildAccumulator *) arg;
- if (eo->number >= eo->length)
+ /*
+ * Note this code assumes that newdata contains only one itempointer.
+ */
+ if (eo->count >= eo->maxcount)
{
accum->allocatedMemory -= GetMemoryChunkSpace(eo->list);
- eo->length *= 2;
- eo->list = (ItemPointerData *) repalloc(eo->list,
- sizeof(ItemPointerData) * eo->length);
+ eo->maxcount *= 2;
+ eo->list = (ItemPointerData *)
+ repalloc(eo->list, sizeof(ItemPointerData) * eo->maxcount);
accum->allocatedMemory += GetMemoryChunkSpace(eo->list);
}
- /* If item pointers are not ordered, they will need to be sorted. */
+ /* If item pointers are not ordered, they will need to be sorted later */
if (eo->shouldSort == FALSE)
{
int res;
- res = compareItemPointers(eo->list + eo->number - 1, en->list);
+ res = ginCompareItemPointers(eo->list + eo->count - 1, en->list);
Assert(res != 0);
if (res > 0)
eo->shouldSort = TRUE;
}
- eo->list[eo->number] = en->list[0];
- eo->number++;
-
- return old;
+ eo->list[eo->count] = en->list[0];
+ eo->count++;
}
+/* Comparator function for rbtree.c */
static int
-cmpEntryAccumulator(const void *a, const void *b, void *arg)
+cmpEntryAccumulator(const RBNode *a, const RBNode *b, void *arg)
+{
+ const GinEntryAccumulator *ea = (const GinEntryAccumulator *) a;
+ const GinEntryAccumulator *eb = (const GinEntryAccumulator *) b;
+ BuildAccumulator *accum = (BuildAccumulator *) arg;
+
+ return ginCompareAttEntries(accum->ginstate,
+ ea->attnum, ea->key, ea->category,
+ eb->attnum, eb->key, eb->category);
+}
+
+/* Allocator function for rbtree.c */
+static RBNode *
+ginAllocEntryAccumulator(void *arg)
{
- EntryAccumulator *ea = (EntryAccumulator *) a;
- EntryAccumulator *eb = (EntryAccumulator *) b;
BuildAccumulator *accum = (BuildAccumulator *) arg;
+ GinEntryAccumulator *ea;
+
+ /*
+ * Allocate memory by rather big chunks to decrease overhead. We have no
+ * need to reclaim RBNodes individually, so this costs nothing.
+ */
+ if (accum->entryallocator == NULL || accum->eas_used >= DEF_NENTRY)
+ {
+ accum->entryallocator = palloc(sizeof(GinEntryAccumulator) * DEF_NENTRY);
+ accum->allocatedMemory += GetMemoryChunkSpace(accum->entryallocator);
+ accum->eas_used = 0;
+ }
+
+ /* Allocate new RBNode from current chunk */
+ ea = accum->entryallocator + accum->eas_used;
+ accum->eas_used++;
- return compareAttEntries(accum->ginstate, ea->attnum, ea->value,
- eb->attnum, eb->value);
+ return (RBNode *) ea;
}
void
ginInitBA(BuildAccumulator *accum)
{
+ /* accum->ginstate is intentionally not set here */
accum->allocatedMemory = 0;
accum->entryallocator = NULL;
- accum->tree = rb_create(cmpEntryAccumulator, ginAppendData, NULL, accum);
- accum->iterator = NULL;
- accum->tmpList = NULL;
+ accum->eas_used = 0;
+ accum->tree = rb_create(sizeof(GinEntryAccumulator),
+ cmpEntryAccumulator,
+ ginCombineData,
+ ginAllocEntryAccumulator,
+ NULL, /* no freefunc needed */
+ (void *) accum);
}
/*
- * This is basically the same as datumCopy(), but modified to count
- * palloc'd space in accum.
+ * This is basically the same as datumCopy(), but extended to count
+ * palloc'd space in accum->allocatedMemory.
*/
static Datum
getDatumCopy(BuildAccumulator *accum, OffsetNumber attnum, Datum value)
@@ -102,62 +136,53 @@ getDatumCopy(BuildAccumulator *accum, OffsetNumber attnum, Datum value)
* Find/store one entry from indexed value.
*/
static void
-ginInsertEntry(BuildAccumulator *accum, ItemPointer heapptr, OffsetNumber attnum, Datum entry)
+ginInsertBAEntry(BuildAccumulator *accum,
+ ItemPointer heapptr, OffsetNumber attnum,
+ Datum key, GinNullCategory category)
{
- EntryAccumulator *key,
- *ea;
+ GinEntryAccumulator eatmp;
+ GinEntryAccumulator *ea;
+ bool isNew;
/*
- * Allocate memory by rather big chunk to decrease overhead, we don't keep
- * pointer to previously allocated chunks because they will free by
- * MemoryContextReset() call.
+ * For the moment, fill only the fields of eatmp that will be looked at by
+ * cmpEntryAccumulator or ginCombineData.
*/
- if (accum->entryallocator == NULL || accum->length >= DEF_NENTRY)
- {
- accum->entryallocator = palloc(sizeof(EntryAccumulator) * DEF_NENTRY);
- accum->allocatedMemory += GetMemoryChunkSpace(accum->entryallocator);
- accum->length = 0;
- }
-
- /* "Allocate" new key in chunk */
- key = accum->entryallocator + accum->length;
- accum->length++;
-
- key->attnum = attnum;
- key->value = entry;
- /* To prevent multiple palloc/pfree cycles, we reuse array */
- if (accum->tmpList == NULL)
- accum->tmpList =
- (ItemPointerData *) palloc(sizeof(ItemPointerData) * DEF_NPTR);
- key->list = accum->tmpList;
- key->list[0] = *heapptr;
+ eatmp.attnum = attnum;
+ eatmp.key = key;
+ eatmp.category = category;
+ /* temporarily set up single-entry itempointer list */
+ eatmp.list = heapptr;
- ea = rb_insert(accum->tree, key);
+ ea = (GinEntryAccumulator *) rb_insert(accum->tree, (RBNode *) &eatmp,
+ &isNew);
- if (ea == NULL)
+ if (isNew)
{
/*
- * The key has been inserted, so continue initialization.
+ * Finish initializing new tree entry, including making permanent
+ * copies of the datum (if it's not null) and itempointer.
*/
- key->value = getDatumCopy(accum, attnum, entry);
- key->length = DEF_NPTR;
- key->number = 1;
- key->shouldSort = FALSE;
- accum->allocatedMemory += GetMemoryChunkSpace(key->list);
- accum->tmpList = NULL;
+ if (category == GIN_CAT_NORM_KEY)
+ ea->key = getDatumCopy(accum, attnum, key);
+ ea->maxcount = DEF_NPTR;
+ ea->count = 1;
+ ea->shouldSort = FALSE;
+ ea->list =
+ (ItemPointerData *) palloc(sizeof(ItemPointerData) * DEF_NPTR);
+ ea->list[0] = *heapptr;
+ accum->allocatedMemory += GetMemoryChunkSpace(ea->list);
}
else
{
/*
- * The key has been appended, so "free" allocated key by decrementing
- * chunk's counter.
+ * ginCombineData did everything needed.
*/
- accum->length--;
}
}
/*
- * Insert one heap pointer.
+ * Insert the entries for one heap pointer.
*
* Since the entries are being inserted into a balanced binary tree, you
* might think that the order of insertion wouldn't be critical, but it turns
@@ -169,22 +194,24 @@ ginInsertEntry(BuildAccumulator *accum, ItemPointer heapptr, OffsetNumber attnum
* We do this as follows. First, we imagine that we have an array whose size
* is the smallest power of two greater than or equal to the actual array
* size. Second, we insert the middle entry of our virtual array into the
- * tree; then, we insert the middles of each half of out virtual array, then
+ * tree; then, we insert the middles of each half of our virtual array, then
* middles of quarters, etc.
*/
void
-ginInsertRecordBA(BuildAccumulator *accum, ItemPointer heapptr, OffsetNumber attnum,
- Datum *entries, int32 nentry)
+ginInsertBAEntries(BuildAccumulator *accum,
+ ItemPointer heapptr, OffsetNumber attnum,
+ Datum *entries, GinNullCategory *categories,
+ int32 nentries)
{
- uint32 step = nentry;
+ uint32 step = nentries;
- if (nentry <= 0)
+ if (nentries <= 0)
return;
Assert(ItemPointerIsValid(heapptr) && attnum >= FirstOffsetNumber);
/*
- * step will contain largest power of 2 and <= nentry
+ * step will contain largest power of 2 and <= nentries
*/
step |= (step >> 1);
step |= (step >> 2);
@@ -198,8 +225,9 @@ ginInsertRecordBA(BuildAccumulator *accum, ItemPointer heapptr, OffsetNumber att
{
int i;
- for (i = step - 1; i < nentry && i >= 0; i += step << 1 /* *2 */ )
- ginInsertEntry(accum, heapptr, attnum, entries[i]);
+ for (i = step - 1; i < nentries && i >= 0; i += step << 1 /* *2 */ )
+ ginInsertBAEntry(accum, heapptr, attnum,
+ entries[i], categories[i]);
step >>= 1; /* /2 */
}
@@ -208,35 +236,49 @@ ginInsertRecordBA(BuildAccumulator *accum, ItemPointer heapptr, OffsetNumber att
static int
qsortCompareItemPointers(const void *a, const void *b)
{
- int res = compareItemPointers((ItemPointer) a, (ItemPointer) b);
+ int res = ginCompareItemPointers((ItemPointer) a, (ItemPointer) b);
+ /* Assert that there are no equal item pointers being sorted */
Assert(res != 0);
return res;
}
+/* Prepare to read out the rbtree contents using ginGetBAEntry */
+void
+ginBeginBAScan(BuildAccumulator *accum)
+{
+ rb_begin_iterate(accum->tree, LeftRightWalk);
+}
+
+/*
+ * Get the next entry in sequence from the BuildAccumulator's rbtree.
+ * This consists of a single key datum and a list (array) of one or more
+ * heap TIDs in which that key is found. The list is guaranteed sorted.
+ */
ItemPointerData *
-ginGetEntry(BuildAccumulator *accum, OffsetNumber *attnum, Datum *value, uint32 *n)
+ginGetBAEntry(BuildAccumulator *accum,
+ OffsetNumber *attnum, Datum *key, GinNullCategory *category,
+ uint32 *n)
{
- EntryAccumulator *entry;
+ GinEntryAccumulator *entry;
ItemPointerData *list;
- if (accum->iterator == NULL)
- accum->iterator = rb_begin_iterate(accum->tree, LeftRightWalk);
-
- entry = rb_iterate(accum->iterator);
+ entry = (GinEntryAccumulator *) rb_iterate(accum->tree);
if (entry == NULL)
- return NULL;
+ return NULL; /* no more entries */
- *n = entry->number;
*attnum = entry->attnum;
- *value = entry->value;
+ *key = entry->key;
+ *category = entry->category;
list = entry->list;
+ *n = entry->count;
- Assert(list != NULL);
+ Assert(list != NULL && entry->count > 0);
- if (entry->shouldSort && entry->number > 1)
- qsort(list, *n, sizeof(ItemPointerData), qsortCompareItemPointers);
+ if (entry->shouldSort && entry->count > 1)
+ qsort(list, entry->count, sizeof(ItemPointerData),
+ qsortCompareItemPointers);
return list;
}
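
For context, a hedged sketch of the read-out loop the two functions above are designed for; accum is assumed to be a BuildAccumulator that has already been filled.

    ItemPointerData *list;
    OffsetNumber     attnum;
    Datum            key;
    GinNullCategory  category;
    uint32           nlist;

    ginBeginBAScan(accum);
    while ((list = ginGetBAEntry(accum, &attnum, &key, &category, &nlist)) != NULL)
    {
        /* one key per iteration, with its sorted array of nlist heap TIDs */
    }
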
diff --git a/src/backend/access/gin/gindatapage.c b/src/backend/access/gin/gindatapage.c
index b456e167ce..41dbe9fd11 100644
--- a/src/backend/access/gin/gindatapage.c
+++ b/src/backend/access/gin/gindatapage.c
@@ -4,31 +4,37 @@
* page utilities routines for the postgres inverted index access method.
*
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/gin/gindatapage.c,v 1.17 2010/01/02 16:57:33 momjian Exp $
+ * src/backend/access/gin/gindatapage.c
*-------------------------------------------------------------------------
*/
#include "postgres.h"
-#include "access/gin.h"
+#include "access/gin_private.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"
int
-compareItemPointers(ItemPointer a, ItemPointer b)
+ginCompareItemPointers(ItemPointer a, ItemPointer b)
{
- if (GinItemPointerGetBlockNumber(a) == GinItemPointerGetBlockNumber(b))
+ BlockNumber ba = GinItemPointerGetBlockNumber(a);
+ BlockNumber bb = GinItemPointerGetBlockNumber(b);
+
+ if (ba == bb)
{
- if (GinItemPointerGetOffsetNumber(a) == GinItemPointerGetOffsetNumber(b))
+ OffsetNumber oa = GinItemPointerGetOffsetNumber(a);
+ OffsetNumber ob = GinItemPointerGetOffsetNumber(b);
+
+ if (oa == ob)
return 0;
- return (GinItemPointerGetOffsetNumber(a) > GinItemPointerGetOffsetNumber(b)) ? 1 : -1;
+ return (oa > ob) ? 1 : -1;
}
- return (GinItemPointerGetBlockNumber(a) > GinItemPointerGetBlockNumber(b)) ? 1 : -1;
+ return (ba > bb) ? 1 : -1;
}
/*
@@ -37,9 +43,9 @@ compareItemPointers(ItemPointer a, ItemPointer b)
* Caller is responsible that there is enough space at *dst.
*/
uint32
-MergeItemPointers(ItemPointerData *dst,
- ItemPointerData *a, uint32 na,
- ItemPointerData *b, uint32 nb)
+ginMergeItemPointers(ItemPointerData *dst,
+ ItemPointerData *a, uint32 na,
+ ItemPointerData *b, uint32 nb)
{
ItemPointerData *dptr = dst;
ItemPointerData *aptr = a,
@@ -47,7 +53,7 @@ MergeItemPointers(ItemPointerData *dst,
while (aptr - a < na && bptr - b < nb)
{
- int cmp = compareItemPointers(aptr, bptr);
+ int cmp = ginCompareItemPointers(aptr, bptr);
if (cmp > 0)
*dptr++ = *bptr++;
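
For readers new to TID ordering, here is a self-contained model (not from the patch) of what ginCompareItemPointers and the merge loop above do: item pointers order first by block number, then by offset, and two already-sorted posting lists are merged into a destination the caller has sized for na + nb entries. The Tid struct is a stand-in rather than the backend's ItemPointerData, and equal TIDs get no special treatment here (the real function's handling of duplicates is not visible in this hunk).

#include <stdio.h>
#include <stdint.h>

typedef struct Tid
{
    uint32_t block;             /* models GinItemPointerGetBlockNumber() */
    uint16_t offset;            /* models GinItemPointerGetOffsetNumber() */
} Tid;

/* same shape as ginCompareItemPointers: block number first, then offset */
static int
tid_cmp(const Tid *a, const Tid *b)
{
    if (a->block == b->block)
    {
        if (a->offset == b->offset)
            return 0;
        return (a->offset > b->offset) ? 1 : -1;
    }
    return (a->block > b->block) ? 1 : -1;
}

/* merge two sorted arrays; dst must have room for na + nb entries */
static uint32_t
tid_merge(Tid *dst, const Tid *a, uint32_t na, const Tid *b, uint32_t nb)
{
    Tid *d = dst;

    while (na > 0 && nb > 0)
    {
        if (tid_cmp(a, b) > 0)
        {
            *d++ = *b++;
            nb--;
        }
        else
        {
            *d++ = *a++;
            na--;
        }
    }
    while (na > 0)
    {
        *d++ = *a++;
        na--;
    }
    while (nb > 0)
    {
        *d++ = *b++;
        nb--;
    }
    return (uint32_t) (d - dst);
}

int
main(void)
{
    Tid a[] = {{1, 2}, {1, 5}, {3, 1}};
    Tid b[] = {{1, 3}, {2, 7}};
    Tid out[5];
    uint32_t n = tid_merge(out, a, 3, b, 2);
    uint32_t i;

    for (i = 0; i < n; i++)
        printf("(%u,%u) ", out[i].block, out[i].offset);
    printf("\n");               /* (1,2) (1,3) (1,5) (2,7) (3,1) */
    return 0;
}
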
@@ -82,7 +88,7 @@ dataIsMoveRight(GinBtree btree, Page page)
if (GinPageRightMost(page))
return FALSE;
- return (compareItemPointers(btree->items + btree->curitem, iptr) > 0) ? TRUE : FALSE;
+ return (ginCompareItemPointers(btree->items + btree->curitem, iptr) > 0) ? TRUE : FALSE;
}
/*
@@ -122,16 +128,17 @@ dataLocateItem(GinBtree btree, GinBtreeStack *stack)
pitem = (PostingItem *) GinDataPageGetItem(page, mid);
if (mid == maxoff)
-
+ {
/*
* Right infinity, page already correctly chosen with a help of
* dataIsMoveRight
*/
result = -1;
+ }
else
{
pitem = (PostingItem *) GinDataPageGetItem(page, mid);
- result = compareItemPointers(btree->items + btree->curitem, &(pitem->key));
+ result = ginCompareItemPointers(btree->items + btree->curitem, &(pitem->key));
}
if (result == 0)
@@ -189,7 +196,7 @@ dataLocateLeafItem(GinBtree btree, GinBtreeStack *stack)
{
OffsetNumber mid = low + ((high - low) / 2);
- result = compareItemPointers(btree->items + btree->curitem, (ItemPointer) GinDataPageGetItem(page, mid));
+ result = ginCompareItemPointers(btree->items + btree->curitem, (ItemPointer) GinDataPageGetItem(page, mid));
if (result == 0)
{
@@ -220,7 +227,7 @@ dataFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber stor
Assert(!GinPageIsLeaf(page));
Assert(GinPageIsData(page));
- /* if page isn't changed, we returns storedOff */
+ /* if page isn't changed, we return storedOff */
if (storedOff >= FirstOffsetNumber && storedOff <= maxoff)
{
pitem = (PostingItem *) GinDataPageGetItem(page, storedOff);
@@ -286,9 +293,11 @@ GinDataPageAddItem(Page page, void *data, OffsetNumber offset)
{
ptr = GinDataPageGetItem(page, offset);
if (maxoff + 1 - offset != 0)
- memmove(ptr + GinSizeOfItem(page), ptr, (maxoff - offset + 1) * GinSizeOfItem(page));
+ memmove(ptr + GinSizeOfDataPageItem(page),
+ ptr,
+ (maxoff - offset + 1) * GinSizeOfDataPageItem(page));
}
- memcpy(ptr, data, GinSizeOfItem(page));
+ memcpy(ptr, data, GinSizeOfDataPageItem(page));
GinPageGetOpaque(page)->maxoff++;
}
@@ -297,7 +306,7 @@ GinDataPageAddItem(Page page, void *data, OffsetNumber offset)
* Deletes posting item from non-leaf page
*/
void
-PageDeletePostingItem(Page page, OffsetNumber offset)
+GinPageDeletePostingItem(Page page, OffsetNumber offset)
{
OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff;
@@ -372,10 +381,12 @@ static void
dataPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prdata)
{
Page page = BufferGetPage(buf);
+ int sizeofitem = GinSizeOfDataPageItem(page);
+ int cnt = 0;
+
+ /* these must be static so they can be returned to caller */
static XLogRecData rdata[3];
- int sizeofitem = GinSizeOfItem(page);
static ginxlogInsert data;
- int cnt = 0;
*prdata = rdata;
Assert(GinPageIsData(page));
@@ -453,21 +464,23 @@ dataPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prda
static Page
dataSplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogRecData **prdata)
{
- static ginxlogSplit data;
- static XLogRecData rdata[4];
- static char vector[2 * BLCKSZ];
char *ptr;
OffsetNumber separator;
ItemPointer bound;
Page lpage = PageGetTempPageCopy(BufferGetPage(lbuf));
ItemPointerData oldbound = *GinDataPageGetRightBound(lpage);
- int sizeofitem = GinSizeOfItem(lpage);
+ int sizeofitem = GinSizeOfDataPageItem(lpage);
OffsetNumber maxoff = GinPageGetOpaque(lpage)->maxoff;
Page rpage = BufferGetPage(rbuf);
Size pageSize = PageGetPageSize(lpage);
Size freeSpace;
uint32 nCopied = 1;
+ /* these must be static so they can be returned to caller */
+ static ginxlogSplit data;
+ static XLogRecData rdata[4];
+ static char vector[2 * BLCKSZ];
+
GinInitPage(rpage, GinPageGetOpaque(lpage)->flags, pageSize);
freeSpace = GinDataPageGetFreeSpace(rpage);
@@ -482,9 +495,11 @@ dataSplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogRe
if (GinPageIsLeaf(lpage) && GinPageRightMost(lpage) && off > GinPageGetOpaque(lpage)->maxoff)
{
nCopied = 0;
- while (btree->curitem < btree->nitem && maxoff * sizeof(ItemPointerData) < 2 * (freeSpace - sizeof(ItemPointerData)))
+ while (btree->curitem < btree->nitem &&
+ maxoff * sizeof(ItemPointerData) < 2 * (freeSpace - sizeof(ItemPointerData)))
{
- memcpy(vector + maxoff * sizeof(ItemPointerData), btree->items + btree->curitem,
+ memcpy(vector + maxoff * sizeof(ItemPointerData),
+ btree->items + btree->curitem,
sizeof(ItemPointerData));
maxoff++;
nCopied++;
@@ -571,7 +586,7 @@ dataSplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogRe
* Also called from ginxlog, should not use btree
*/
void
-dataFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf)
+ginDataFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf)
{
Page page = BufferGetPage(root),
lpage = BufferGetPage(lbuf),
@@ -589,20 +604,23 @@ dataFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf)
}
void
-prepareDataScan(GinBtree btree, Relation index)
+ginPrepareDataScan(GinBtree btree, Relation index)
{
memset(btree, 0, sizeof(GinBtreeData));
+
btree->index = index;
- btree->isMoveRight = dataIsMoveRight;
+
btree->findChildPage = dataLocateItem;
+ btree->isMoveRight = dataIsMoveRight;
btree->findItem = dataLocateLeafItem;
btree->findChildPtr = dataFindChildPtr;
btree->getLeftMostPage = dataGetLeftMostPage;
btree->isEnoughSpace = dataIsEnoughSpace;
btree->placeToPage = dataPlaceToPage;
btree->splitPage = dataSplitPage;
- btree->fillRoot = dataFillRoot;
+ btree->fillRoot = ginDataFillRoot;
+ btree->isData = TRUE;
btree->searchMode = FALSE;
btree->isDelete = FALSE;
btree->fullScan = FALSE;
@@ -610,11 +628,11 @@ prepareDataScan(GinBtree btree, Relation index)
}
GinPostingTreeScan *
-prepareScanPostingTree(Relation index, BlockNumber rootBlkno, bool searchMode)
+ginPrepareScanPostingTree(Relation index, BlockNumber rootBlkno, bool searchMode)
{
GinPostingTreeScan *gdi = (GinPostingTreeScan *) palloc0(sizeof(GinPostingTreeScan));
- prepareDataScan(&gdi->btree, index);
+ ginPrepareDataScan(&gdi->btree, index);
gdi->btree.searchMode = searchMode;
gdi->btree.fullScan = searchMode;
@@ -628,7 +646,9 @@ prepareScanPostingTree(Relation index, BlockNumber rootBlkno, bool searchMode)
* Inserts array of item pointers, may execute several tree scan (very rare)
*/
void
-insertItemPointer(GinPostingTreeScan *gdi, ItemPointerData *items, uint32 nitem)
+ginInsertItemPointers(GinPostingTreeScan *gdi,
+ ItemPointerData *items, uint32 nitem,
+ GinStatsData *buildStats)
{
BlockNumber rootBlkno = gdi->stack->blkno;
@@ -653,14 +673,14 @@ insertItemPointer(GinPostingTreeScan *gdi, ItemPointerData *items, uint32 nitem)
freeGinBtreeStack(gdi->stack);
}
else
- ginInsertValue(&(gdi->btree), gdi->stack);
+ ginInsertValue(&(gdi->btree), gdi->stack, buildStats);
gdi->stack = NULL;
}
}
Buffer
-scanBeginPostingTree(GinPostingTreeScan *gdi)
+ginScanBeginPostingTree(GinPostingTreeScan *gdi)
{
gdi->stack = ginFindLeafPage(&gdi->btree, gdi->stack);
return gdi->stack->buffer;
diff --git a/src/backend/access/gin/ginentrypage.c b/src/backend/access/gin/ginentrypage.c
index 6d307c8d59..fa134f9fc3 100644
--- a/src/backend/access/gin/ginentrypage.c
+++ b/src/backend/access/gin/ginentrypage.c
@@ -4,17 +4,17 @@
* page utilities routines for the postgres inverted index access method.
*
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/gin/ginentrypage.c,v 1.24 2010/02/26 02:00:33 momjian Exp $
+ * src/backend/access/gin/ginentrypage.c
*-------------------------------------------------------------------------
*/
#include "postgres.h"
-#include "access/gin.h"
+#include "access/gin_private.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"
@@ -24,107 +24,116 @@
* If the tuple would be too big to be stored, function throws a suitable
* error if errorTooBig is TRUE, or returns NULL if errorTooBig is FALSE.
*
- * On leaf pages, Index tuple has non-traditional layout. Tuple may contain
- * posting list or root blocknumber of posting tree.
- * Macros: GinIsPostingTree(itup) / GinSetPostingTree(itup, blkno)
- * 1) Posting list
- * - itup->t_info & INDEX_SIZE_MASK contains total size of tuple as usual
- * - ItemPointerGetBlockNumber(&itup->t_tid) contains original
- * size of tuple (without posting list).
- * Macros: GinGetOrigSizePosting(itup) / GinSetOrigSizePosting(itup,n)
- * - ItemPointerGetOffsetNumber(&itup->t_tid) contains number
- * of elements in posting list (number of heap itempointers)
- * Macros: GinGetNPosting(itup) / GinSetNPosting(itup,n)
- * - After standard part of tuple there is a posting list, ie, array
- * of heap itempointers
- * Macros: GinGetPosting(itup)
- * 2) Posting tree
- * - itup->t_info & INDEX_SIZE_MASK contains size of tuple as usual
- * - ItemPointerGetBlockNumber(&itup->t_tid) contains block number of
- * root of posting tree
- * - ItemPointerGetOffsetNumber(&itup->t_tid) contains magic number
- * GIN_TREE_POSTING, which distinguishes this from posting-list case
- *
- * Attributes of an index tuple are different for single and multicolumn index.
- * For single-column case, index tuple stores only value to be indexed.
- * For multicolumn case, it stores two attributes: column number of value
- * and value.
+ * See src/backend/access/gin/README for a description of the index tuple
+ * format that is being built here. We build on the assumption that we
+ * are making a leaf-level key entry containing a posting list of nipd items.
+ * If the caller is actually trying to make a posting-tree entry, non-leaf
+ * entry, or pending-list entry, it should pass nipd = 0 and then overwrite
+ * the t_tid fields as necessary. In any case, ipd can be NULL to skip
+ * copying any itempointers into the posting list; the caller is responsible
+ * for filling the posting list afterwards, if ipd = NULL and nipd > 0.
*/
IndexTuple
-GinFormTuple(Relation index, GinState *ginstate,
- OffsetNumber attnum, Datum key,
- ItemPointerData *ipd, uint32 nipd, bool errorTooBig)
+GinFormTuple(GinState *ginstate,
+ OffsetNumber attnum, Datum key, GinNullCategory category,
+ ItemPointerData *ipd, uint32 nipd,
+ bool errorTooBig)
{
- bool isnull[2] = {FALSE, FALSE};
+ Datum datums[2];
+ bool isnull[2];
IndexTuple itup;
uint32 newsize;
+ /* Build the basic tuple: optional column number, plus key datum */
if (ginstate->oneCol)
- itup = index_form_tuple(ginstate->origTupdesc, &key, isnull);
+ {
+ datums[0] = key;
+ isnull[0] = (category != GIN_CAT_NORM_KEY);
+ }
else
{
- Datum datums[2];
-
datums[0] = UInt16GetDatum(attnum);
+ isnull[0] = false;
datums[1] = key;
- itup = index_form_tuple(ginstate->tupdesc[attnum - 1], datums, isnull);
+ isnull[1] = (category != GIN_CAT_NORM_KEY);
}
- GinSetOrigSizePosting(itup, IndexTupleSize(itup));
+ itup = index_form_tuple(ginstate->tupdesc[attnum - 1], datums, isnull);
+
+ /*
+ * Determine and store offset to the posting list, making sure there is
+ * room for the category byte if needed.
+ *
+ * Note: because index_form_tuple MAXALIGNs the tuple size, there may well
+ * be some wasted pad space. Is it worth recomputing the data length to
+ * prevent that? That would also allow us to Assert that the real data
+ * doesn't overlap the GinNullCategory byte, which this code currently
+ * takes on faith.
+ */
+ newsize = IndexTupleSize(itup);
- if (nipd > 0)
+ if (IndexTupleHasNulls(itup))
{
- newsize = MAXALIGN(SHORTALIGN(IndexTupleSize(itup)) + sizeof(ItemPointerData) * nipd);
- if (newsize > Min(INDEX_SIZE_MASK, GinMaxItemSize))
- {
- if (errorTooBig)
- ereport(ERROR,
- (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
- errmsg("index row size %lu exceeds maximum %lu for index \"%s\"",
- (unsigned long) newsize,
- (unsigned long) Min(INDEX_SIZE_MASK,
- GinMaxItemSize),
- RelationGetRelationName(index))));
- return NULL;
- }
+ uint32 minsize;
+
+ Assert(category != GIN_CAT_NORM_KEY);
+ minsize = GinCategoryOffset(itup, ginstate) + sizeof(GinNullCategory);
+ newsize = Max(newsize, minsize);
+ }
+
+ newsize = SHORTALIGN(newsize);
+
+ GinSetPostingOffset(itup, newsize);
+ GinSetNPosting(itup, nipd);
+
+ /*
+ * Add space needed for posting list, if any. Then check that the tuple
+ * won't be too big to store.
+ */
+ newsize += sizeof(ItemPointerData) * nipd;
+ newsize = MAXALIGN(newsize);
+ if (newsize > Min(INDEX_SIZE_MASK, GinMaxItemSize))
+ {
+ if (errorTooBig)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("index row size %lu exceeds maximum %lu for index \"%s\"",
+ (unsigned long) newsize,
+ (unsigned long) Min(INDEX_SIZE_MASK,
+ GinMaxItemSize),
+ RelationGetRelationName(ginstate->index))));
+ pfree(itup);
+ return NULL;
+ }
+
+ /*
+ * Resize tuple if needed
+ */
+ if (newsize != IndexTupleSize(itup))
+ {
itup = repalloc(itup, newsize);
- /* set new size */
+ /* set new size in tuple header */
itup->t_info &= ~INDEX_SIZE_MASK;
itup->t_info |= newsize;
-
- if (ipd)
- memcpy(GinGetPosting(itup), ipd, sizeof(ItemPointerData) * nipd);
- GinSetNPosting(itup, nipd);
}
- else
- {
- /*
- * Gin tuple without any ItemPointers should be large enough to keep
- * one ItemPointer, to prevent inconsistency between
- * ginHeapTupleFastCollect and ginEntryInsert called by
- * ginHeapTupleInsert. ginHeapTupleFastCollect forms tuple without
- * extra pointer to heap, but ginEntryInsert (called for pending list
- * cleanup during vacuum) will form the same tuple with one
- * ItemPointer.
- */
- newsize = MAXALIGN(SHORTALIGN(IndexTupleSize(itup)) + sizeof(ItemPointerData));
- if (newsize > Min(INDEX_SIZE_MASK, GinMaxItemSize))
- {
- if (errorTooBig)
- ereport(ERROR,
- (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
- errmsg("index row size %lu exceeds maximum %lu for index \"%s\"",
- (unsigned long) newsize,
- (unsigned long) Min(INDEX_SIZE_MASK,
- GinMaxItemSize),
- RelationGetRelationName(index))));
- return NULL;
- }
- GinSetNPosting(itup, 0);
+ /*
+ * Insert category byte, if needed
+ */
+ if (category != GIN_CAT_NORM_KEY)
+ {
+ Assert(IndexTupleHasNulls(itup));
+ GinSetNullCategory(itup, ginstate, category);
}
+
+ /*
+ * Copy in the posting list, if provided
+ */
+ if (ipd)
+ memcpy(GinGetPosting(itup), ipd, sizeof(ItemPointerData) * nipd);
+
return itup;
}
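
The size bookkeeping in the rewritten GinFormTuple boils down to a few lines of integer arithmetic: take the size index_form_tuple() produced, bump it to leave room for the one-byte null-category flag when the key is null, SHORTALIGN it to get the posting-list offset, add nipd item pointers, MAXALIGN the total, and only then compare against the per-tuple limit. The sketch below (not part of the patch) runs that arithmetic on made-up numbers; the alignment widths and the 6-byte item-pointer size are assumptions about a typical platform, not values taken from the backend headers, and in the real function the starting size is already MAXALIGNed by index_form_tuple.

#include <stdio.h>

#define TYPEALIGN(a, len)   (((len) + (a) - 1) & ~((a) - 1))
#define SHORTALIGN(len)     TYPEALIGN(2, (len))
#define MAXALIGN(len)       TYPEALIGN(8, (len))     /* assumed 8-byte max alignment */

int
main(void)
{
    unsigned keysize = 21;      /* pretend size of the key part of the tuple */
    unsigned nipd = 3;          /* posting-list entries to store */
    unsigned itemptr = 6;       /* assumed sizeof(ItemPointerData) */
    unsigned postingoff;
    unsigned newsize;

    /* the posting list begins at the SHORTALIGNed end of the key data */
    postingoff = SHORTALIGN(keysize);

    /* append the posting list, then pad the whole tuple */
    newsize = MAXALIGN(postingoff + itemptr * nipd);

    printf("posting offset = %u, tuple size = %u\n", postingoff, newsize);
    return 0;
}
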
@@ -140,7 +149,8 @@ GinShortenTuple(IndexTuple itup, uint32 nipd)
Assert(nipd <= GinGetNPosting(itup));
- newsize = MAXALIGN(SHORTALIGN(GinGetOrigSizePosting(itup)) + sizeof(ItemPointerData) * nipd);
+ newsize = GinGetPostingOffset(itup) + sizeof(ItemPointerData) * nipd;
+ newsize = MAXALIGN(newsize);
Assert(newsize <= (itup->t_info & INDEX_SIZE_MASK));
@@ -151,8 +161,45 @@ GinShortenTuple(IndexTuple itup, uint32 nipd)
}
/*
+ * Form a non-leaf entry tuple by copying the key data from the given tuple,
+ * which can be either a leaf or non-leaf entry tuple.
+ *
+ * Any posting list in the source tuple is not copied. The specified child
+ * block number is inserted into t_tid.
+ */
+static IndexTuple
+GinFormInteriorTuple(IndexTuple itup, Page page, BlockNumber childblk)
+{
+ IndexTuple nitup;
+
+ if (GinPageIsLeaf(page) && !GinIsPostingTree(itup))
+ {
+ /* Tuple contains a posting list, just copy stuff before that */
+ uint32 origsize = GinGetPostingOffset(itup);
+
+ origsize = MAXALIGN(origsize);
+ nitup = (IndexTuple) palloc(origsize);
+ memcpy(nitup, itup, origsize);
+ /* ... be sure to fix the size header field ... */
+ nitup->t_info &= ~INDEX_SIZE_MASK;
+ nitup->t_info |= origsize;
+ }
+ else
+ {
+ /* Copy the tuple as-is */
+ nitup = (IndexTuple) palloc(IndexTupleSize(itup));
+ memcpy(nitup, itup, IndexTupleSize(itup));
+ }
+
+ /* Now insert the correct downlink */
+ GinSetDownlink(nitup, childblk);
+
+ return nitup;
+}
+
+/*
* Entry tree is a "static", ie tuple never deletes from it,
- * so we don't use right bound, we use rightest key instead.
+ * so we don't use right bound, we use rightmost key instead.
*/
static IndexTuple
getRightMostTuple(Page page)
@@ -166,16 +213,20 @@ static bool
entryIsMoveRight(GinBtree btree, Page page)
{
IndexTuple itup;
+ OffsetNumber attnum;
+ Datum key;
+ GinNullCategory category;
if (GinPageRightMost(page))
return FALSE;
itup = getRightMostTuple(page);
+ attnum = gintuple_get_attrnum(btree->ginstate, itup);
+ key = gintuple_get_key(btree->ginstate, itup, &category);
- if (compareAttEntries(btree->ginstate,
- btree->entryAttnum, btree->entryValue,
- gintuple_get_attrnum(btree->ginstate, itup),
- gin_index_getattr(btree->ginstate, itup)) > 0)
+ if (ginCompareAttEntries(btree->ginstate,
+ btree->entryAttnum, btree->entryKey, btree->entryCategory,
+ attnum, key, category) > 0)
return TRUE;
return FALSE;
@@ -183,7 +234,7 @@ entryIsMoveRight(GinBtree btree, Page page)
/*
* Find correct tuple in non-leaf page. It supposed that
- * page correctly choosen and searching value SHOULD be on page
+ * page correctly chosen and searching value SHOULD be on page
*/
static BlockNumber
entryLocateEntry(GinBtree btree, GinBtreeStack *stack)
@@ -216,22 +267,31 @@ entryLocateEntry(GinBtree btree, GinBtreeStack *stack)
OffsetNumber mid = low + ((high - low) / 2);
if (mid == maxoff && GinPageRightMost(page))
+ {
/* Right infinity */
result = -1;
+ }
else
{
+ OffsetNumber attnum;
+ Datum key;
+ GinNullCategory category;
+
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, mid));
- result = compareAttEntries(btree->ginstate,
- btree->entryAttnum, btree->entryValue,
- gintuple_get_attrnum(btree->ginstate, itup),
- gin_index_getattr(btree->ginstate, itup));
+ attnum = gintuple_get_attrnum(btree->ginstate, itup);
+ key = gintuple_get_key(btree->ginstate, itup, &category);
+ result = ginCompareAttEntries(btree->ginstate,
+ btree->entryAttnum,
+ btree->entryKey,
+ btree->entryCategory,
+ attnum, key, category);
}
if (result == 0)
{
stack->off = mid;
- Assert(GinItemPointerGetBlockNumber(&(itup)->t_tid) != GIN_ROOT_BLKNO);
- return GinItemPointerGetBlockNumber(&(itup)->t_tid);
+ Assert(GinGetDownlink(itup) != GIN_ROOT_BLKNO);
+ return GinGetDownlink(itup);
}
else if (result > 0)
low = mid + 1;
@@ -243,13 +303,13 @@ entryLocateEntry(GinBtree btree, GinBtreeStack *stack)
stack->off = high;
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, high));
- Assert(GinItemPointerGetBlockNumber(&(itup)->t_tid) != GIN_ROOT_BLKNO);
- return GinItemPointerGetBlockNumber(&(itup)->t_tid);
+ Assert(GinGetDownlink(itup) != GIN_ROOT_BLKNO);
+ return GinGetDownlink(itup);
}
/*
* Searches correct position for value on leaf page.
- * Page should be corrrectly choosen.
+ * Page should be correctly chosen.
* Returns true if value found on page.
*/
static bool
@@ -258,7 +318,6 @@ entryLocateLeafEntry(GinBtree btree, GinBtreeStack *stack)
Page page = BufferGetPage(stack->buffer);
OffsetNumber low,
high;
- IndexTuple itup;
Assert(GinPageIsLeaf(page));
Assert(!GinPageIsData(page));
@@ -283,13 +342,20 @@ entryLocateLeafEntry(GinBtree btree, GinBtreeStack *stack)
while (high > low)
{
OffsetNumber mid = low + ((high - low) / 2);
+ IndexTuple itup;
+ OffsetNumber attnum;
+ Datum key;
+ GinNullCategory category;
int result;
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, mid));
- result = compareAttEntries(btree->ginstate,
- btree->entryAttnum, btree->entryValue,
- gintuple_get_attrnum(btree->ginstate, itup),
- gin_index_getattr(btree->ginstate, itup));
+ attnum = gintuple_get_attrnum(btree->ginstate, itup);
+ key = gintuple_get_key(btree->ginstate, itup, &category);
+ result = ginCompareAttEntries(btree->ginstate,
+ btree->entryAttnum,
+ btree->entryKey,
+ btree->entryCategory,
+ attnum, key, category);
if (result == 0)
{
stack->off = mid;
@@ -319,7 +385,7 @@ entryFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber sto
if (storedOff >= FirstOffsetNumber && storedOff <= maxoff)
{
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, storedOff));
- if (GinItemPointerGetBlockNumber(&(itup)->t_tid) == blkno)
+ if (GinGetDownlink(itup) == blkno)
return storedOff;
/*
@@ -329,7 +395,7 @@ entryFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber sto
for (i = storedOff + 1; i <= maxoff; i++)
{
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
- if (GinItemPointerGetBlockNumber(&(itup)->t_tid) == blkno)
+ if (GinGetDownlink(itup) == blkno)
return i;
}
maxoff = storedOff - 1;
@@ -339,7 +405,7 @@ entryFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber sto
for (i = FirstOffsetNumber; i <= maxoff; i++)
{
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
- if (GinItemPointerGetBlockNumber(&(itup)->t_tid) == blkno)
+ if (GinGetDownlink(itup) == blkno)
return i;
}
@@ -356,7 +422,7 @@ entryGetLeftMostPage(GinBtree btree, Page page)
Assert(PageGetMaxOffsetNumber(page) >= FirstOffsetNumber);
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, FirstOffsetNumber));
- return GinItemPointerGetBlockNumber(&(itup)->t_tid);
+ return GinGetDownlink(itup);
}
static bool
@@ -404,7 +470,7 @@ entryPreparePage(GinBtree btree, Page page, OffsetNumber off)
{
IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
- ItemPointerSet(&itup->t_tid, btree->rightblkno, InvalidOffsetNumber);
+ GinSetDownlink(itup, btree->rightblkno);
ret = btree->rightblkno;
}
@@ -420,11 +486,13 @@ static void
entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prdata)
{
Page page = BufferGetPage(buf);
- static XLogRecData rdata[3];
OffsetNumber placed;
- static ginxlogInsert data;
int cnt = 0;
+ /* these must be static so they can be returned to caller */
+ static XLogRecData rdata[3];
+ static ginxlogInsert data;
+
*prdata = rdata;
data.updateBlkno = entryPreparePage(btree, page, off);
@@ -473,31 +541,6 @@ entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prd
}
/*
- * Returns new tuple with copied value from source tuple.
- * New tuple will not store posting list
- */
-static IndexTuple
-copyIndexTuple(IndexTuple itup, Page page)
-{
- IndexTuple nitup;
-
- if (GinPageIsLeaf(page) && !GinIsPostingTree(itup))
- {
- nitup = (IndexTuple) palloc(MAXALIGN(GinGetOrigSizePosting(itup)));
- memcpy(nitup, itup, GinGetOrigSizePosting(itup));
- nitup->t_info &= ~INDEX_SIZE_MASK;
- nitup->t_info |= GinGetOrigSizePosting(itup);
- }
- else
- {
- nitup = (IndexTuple) palloc(MAXALIGN(IndexTupleSize(itup)));
- memcpy(nitup, itup, IndexTupleSize(itup));
- }
-
- return nitup;
-}
-
-/*
* Place tuple and split page, original buffer(lbuf) leaves untouched,
* returns shadow page of lbuf filled new data.
* Tuples are distributed between pages by equal size on its, not
@@ -506,26 +549,28 @@ copyIndexTuple(IndexTuple itup, Page page)
static Page
entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogRecData **prdata)
{
- static XLogRecData rdata[2];
OffsetNumber i,
maxoff,
separator = InvalidOffsetNumber;
Size totalsize = 0;
Size lsize = 0,
size;
- static char tupstore[2 * BLCKSZ];
char *ptr;
IndexTuple itup,
leftrightmost = NULL;
- static ginxlogSplit data;
Page page;
Page lpage = PageGetTempPageCopy(BufferGetPage(lbuf));
Page rpage = BufferGetPage(rbuf);
Size pageSize = PageGetPageSize(lpage);
+ /* these must be static so they can be returned to caller */
+ static XLogRecData rdata[2];
+ static ginxlogSplit data;
+ static char tupstore[2 * BLCKSZ];
+
*prdata = rdata;
data.leftChildBlkno = (GinPageIsLeaf(lpage)) ?
- InvalidOffsetNumber : GinItemPointerGetBlockNumber(&(btree->entry->t_tid));
+ InvalidOffsetNumber : GinGetDownlink(btree->entry);
data.updateBlkno = entryPreparePage(btree, lpage, off);
maxoff = PageGetMaxOffsetNumber(lpage);
@@ -586,8 +631,8 @@ entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogR
ptr += MAXALIGN(IndexTupleSize(itup));
}
- btree->entry = copyIndexTuple(leftrightmost, lpage);
- ItemPointerSet(&(btree->entry)->t_tid, BufferGetBlockNumber(lbuf), InvalidOffsetNumber);
+ btree->entry = GinFormInteriorTuple(leftrightmost, lpage,
+ BufferGetBlockNumber(lbuf));
btree->rightblkno = BufferGetBlockNumber(rbuf);
@@ -615,7 +660,7 @@ entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogR
}
/*
- * return newly allocate rightmost tuple
+ * return newly allocated rightmost tuple
*/
IndexTuple
ginPageGetLinkItup(Buffer buf)
@@ -625,8 +670,7 @@ ginPageGetLinkItup(Buffer buf)
Page page = BufferGetPage(buf);
itup = getRightMostTuple(page);
- nitup = copyIndexTuple(itup, page);
- ItemPointerSet(&nitup->t_tid, BufferGetBlockNumber(buf), InvalidOffsetNumber);
+ nitup = GinFormInteriorTuple(itup, page, BufferGetBlockNumber(buf));
return nitup;
}
@@ -636,7 +680,7 @@ ginPageGetLinkItup(Buffer buf)
* Also called from ginxlog, should not use btree
*/
void
-entryFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf)
+ginEntryFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf)
{
Page page;
IndexTuple itup;
@@ -646,34 +690,47 @@ entryFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf)
itup = ginPageGetLinkItup(lbuf);
if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
elog(ERROR, "failed to add item to index root page");
+ pfree(itup);
itup = ginPageGetLinkItup(rbuf);
if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
elog(ERROR, "failed to add item to index root page");
+ pfree(itup);
}
+/*
+ * Set up GinBtree for entry page access
+ *
+ * Note: during WAL recovery, there may be no valid data in ginstate
+ * other than a faked-up Relation pointer; the key datum is bogus too.
+ */
void
-prepareEntryScan(GinBtree btree, Relation index, OffsetNumber attnum, Datum value, GinState *ginstate)
+ginPrepareEntryScan(GinBtree btree, OffsetNumber attnum,
+ Datum key, GinNullCategory category,
+ GinState *ginstate)
{
memset(btree, 0, sizeof(GinBtreeData));
- btree->isMoveRight = entryIsMoveRight;
+ btree->index = ginstate->index;
+ btree->ginstate = ginstate;
+
btree->findChildPage = entryLocateEntry;
+ btree->isMoveRight = entryIsMoveRight;
btree->findItem = entryLocateLeafEntry;
btree->findChildPtr = entryFindChildPtr;
btree->getLeftMostPage = entryGetLeftMostPage;
btree->isEnoughSpace = entryIsEnoughSpace;
btree->placeToPage = entryPlaceToPage;
btree->splitPage = entrySplitPage;
- btree->fillRoot = entryFillRoot;
+ btree->fillRoot = ginEntryFillRoot;
- btree->index = index;
- btree->ginstate = ginstate;
- btree->entryAttnum = attnum;
- btree->entryValue = value;
-
- btree->isDelete = FALSE;
+ btree->isData = FALSE;
btree->searchMode = FALSE;
btree->fullScan = FALSE;
btree->isBuild = FALSE;
+
+ btree->entryAttnum = attnum;
+ btree->entryKey = key;
+ btree->entryCategory = category;
+ btree->isDelete = FALSE;
}
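
Both ginPrepareDataScan (earlier in this patch) and ginPrepareEntryScan follow the same pattern: they fill one GinBtreeData descriptor with callbacks, and the shared search and insert code (ginFindLeafPage, ginInsertValue) only ever works through that table; the new isData flag records which flavor of tree the descriptor describes. Below is a toy model of the pattern (not part of the patch; all types and callbacks are made up).

#include <stdbool.h>
#include <stdio.h>

/* toy stand-in for GinBtreeData: a table of callbacks plus the isData flag */
typedef struct ToyBtree
{
    bool isData;
    int  (*findChildPage) (const struct ToyBtree *btree, int blkno);
} ToyBtree;

static int
entryChild(const ToyBtree *btree, int blkno)
{
    return blkno + 1;           /* pretend entry-tree descent */
}

static int
dataChild(const ToyBtree *btree, int blkno)
{
    return blkno + 100;         /* pretend posting-tree descent */
}

static void
prepareEntryScan(ToyBtree *btree)
{
    btree->isData = false;
    btree->findChildPage = entryChild;
}

static void
prepareDataScan(ToyBtree *btree)
{
    btree->isData = true;
    btree->findChildPage = dataChild;
}

/* stand-in for the shared code: it only calls through the table */
static int
descendOneLevel(const ToyBtree *btree, int blkno)
{
    return btree->findChildPage(btree, blkno);
}

int
main(void)
{
    ToyBtree btree;

    prepareEntryScan(&btree);
    printf("entry tree child: %d\n", descendOneLevel(&btree, 0));
    prepareDataScan(&btree);
    printf("data tree child: %d\n", descendOneLevel(&btree, 0));
    return 0;
}
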
diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c
index f8e0b5ad40..82419e37ac 100644
--- a/src/backend/access/gin/ginfast.c
+++ b/src/backend/access/gin/ginfast.c
@@ -7,19 +7,18 @@
* transfer pending entries into the regular index structure. This
* wins because bulk insertion is much more efficient than retail.
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/gin/ginfast.c,v 1.7 2010/02/11 14:29:50 teodor Exp $
+ * src/backend/access/gin/ginfast.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
-#include "access/genam.h"
-#include "access/gin.h"
+#include "access/gin_private.h"
#include "catalog/index.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
@@ -30,12 +29,13 @@
#define GIN_PAGE_FREESIZE \
( BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - MAXALIGN(sizeof(GinPageOpaqueData)) )
-typedef struct DatumArray
+typedef struct KeyArray
{
- Datum *values; /* expansible array */
+ Datum *keys; /* expansible array */
+ GinNullCategory *categories; /* another expansible array */
int32 nvalues; /* current number of valid entries */
- int32 maxvalues; /* allocated size of array */
-} DatumArray;
+ int32 maxvalues; /* allocated size of arrays */
+} KeyArray;
/*
@@ -88,8 +88,9 @@ writeListPage(Relation index, Buffer buffer,
GinPageGetOpaque(page)->rightlink = rightlink;
/*
- * tail page may contain only the whole row(s) or final part of row placed
- * on previous pages
+ * tail page may contain only whole row(s) or the final part of a row placed on
+ * previous pages (a "row" here meaning all the index tuples generated for
+ * one heap tuple)
*/
if (rightlink == InvalidBlockNumber)
{
@@ -103,7 +104,7 @@ writeListPage(Relation index, Buffer buffer,
MarkBufferDirty(buffer);
- if (!index->rd_istemp)
+ if (RelationNeedsWAL(index))
{
XLogRecData rdata[2];
ginxlogInsertListPage data;
@@ -210,13 +211,16 @@ makeSublist(Relation index, IndexTuple *tuples, int32 ntuples,
}
/*
- * Inserts collected values during normal insertion. Function guarantees
- * that all values of heap will be stored sequentially, preserving order
+ * Write the index tuples contained in *collector into the index's
+ * pending list.
+ *
+ * The function guarantees that all these tuples will be inserted consecutively,
+ * preserving their order.
*/
void
-ginHeapTupleFastInsert(Relation index, GinState *ginstate,
- GinTupleCollector *collector)
+ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector)
{
+ Relation index = ginstate->index;
Buffer metabuffer;
Page metapage;
GinMetaPageData *metadata = NULL;
@@ -291,7 +295,12 @@ ginHeapTupleFastInsert(Relation index, GinState *ginstate,
*/
START_CRIT_SECTION();
- memcpy(metadata, &sublist, sizeof(GinMetaPageData));
+ metadata->head = sublist.head;
+ metadata->tail = sublist.tail;
+ metadata->tailFreeSize = sublist.tailFreeSize;
+
+ metadata->nPendingPages = sublist.nPendingPages;
+ metadata->nPendingHeapTuples = sublist.nPendingHeapTuples;
}
else
{
@@ -384,7 +393,7 @@ ginHeapTupleFastInsert(Relation index, GinState *ginstate,
*/
MarkBufferDirty(metabuffer);
- if (!index->rd_istemp)
+ if (RelationNeedsWAL(index))
{
XLogRecPtr recptr;
@@ -421,34 +430,40 @@ ginHeapTupleFastInsert(Relation index, GinState *ginstate,
END_CRIT_SECTION();
if (needCleanup)
- ginInsertCleanup(index, ginstate, false, NULL);
+ ginInsertCleanup(ginstate, false, NULL);
}
/*
- * Collect values from one tuples to be indexed. All values for
- * one tuples should be written at once - to guarantee consistent state
+ * Create temporary index tuples for a single indexable item (one index column
+ * for the heap tuple specified by ht_ctid), and append them to the array
+ * in *collector. They will subsequently be written out using
+ * ginHeapTupleFastInsert. Note that to guarantee consistent state, all
+ * temp tuples for a given heap tuple must be written in one call to
+ * ginHeapTupleFastInsert.
*/
-uint32
-ginHeapTupleFastCollect(Relation index, GinState *ginstate,
+void
+ginHeapTupleFastCollect(GinState *ginstate,
GinTupleCollector *collector,
- OffsetNumber attnum, Datum value, ItemPointer item)
+ OffsetNumber attnum, Datum value, bool isNull,
+ ItemPointer ht_ctid)
{
Datum *entries;
+ GinNullCategory *categories;
int32 i,
nentries;
- entries = extractEntriesSU(ginstate, attnum, value, &nentries);
-
- if (nentries == 0)
- /* nothing to insert */
- return 0;
+ /*
+ * Extract the key values that need to be inserted in the index
+ */
+ entries = ginExtractEntries(ginstate, attnum, value, isNull,
+ &nentries, &categories);
/*
* Allocate/reallocate memory for storing collected tuples
*/
if (collector->tuples == NULL)
{
- collector->lentuples = nentries * index->rd_att->natts;
+ collector->lentuples = nentries * ginstate->origTupdesc->natts;
collector->tuples = (IndexTuple *) palloc(sizeof(IndexTuple) * collector->lentuples);
}
@@ -460,19 +475,19 @@ ginHeapTupleFastCollect(Relation index, GinState *ginstate,
}
/*
- * Creates tuple's array
+ * Build an index tuple for each key value, and add to array. In pending
+ * tuples we just stick the heap TID into t_tid.
*/
for (i = 0; i < nentries; i++)
{
- collector->tuples[collector->ntuples + i] =
- GinFormTuple(index, ginstate, attnum, entries[i], NULL, 0, true);
- collector->tuples[collector->ntuples + i]->t_tid = *item;
- collector->sumsize += IndexTupleSize(collector->tuples[collector->ntuples + i]);
- }
+ IndexTuple itup;
- collector->ntuples += nentries;
-
- return nentries;
+ itup = GinFormTuple(ginstate, attnum, entries[i], categories[i],
+ NULL, 0, true);
+ itup->t_tid = *ht_ctid;
+ collector->tuples[collector->ntuples++] = itup;
+ collector->sumsize += IndexTupleSize(itup);
+ }
}
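
The comment above spells out the contract this function relies on: collect every temporary tuple generated for one heap row first, then hand the whole batch to ginHeapTupleFastInsert in a single call, so that the pending list stays consistent (no partially written rows). The toy below (not from the patch; the types are made up) only shows that calling shape.

#include <stdio.h>

#define MAXTUPLES 16

/* toy stand-in for GinTupleCollector: batch everything for one heap row */
typedef struct ToyCollector
{
    int tuples[MAXTUPLES];
    int ntuples;
} ToyCollector;

static void
collect(ToyCollector *c, int tuple)
{
    c->tuples[c->ntuples++] = tuple;    /* one entry per extracted key */
}

static void
flushRow(ToyCollector *c)
{
    /* everything collected for one heap row leaves together, in order */
    printf("pending list gets %d tuples in one call\n", c->ntuples);
    c->ntuples = 0;
}

int
main(void)
{
    ToyCollector c = {{0}, 0};
    int col;

    for (col = 0; col < 3; col++)   /* one heap row, three indexed columns */
        collect(&c, col);
    flushRow(&c);                   /* mirrors one ginHeapTupleFastInsert call */
    return 0;
}
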
/*
@@ -564,7 +579,7 @@ shiftList(Relation index, Buffer metabuffer, BlockNumber newHead,
MarkBufferDirty(buffers[i]);
}
- if (!index->rd_istemp)
+ if (RelationNeedsWAL(index))
{
XLogRecPtr recptr;
@@ -591,38 +606,55 @@ shiftList(Relation index, Buffer metabuffer, BlockNumber newHead,
return false;
}
-/* Add datum to DatumArray, resizing if needed */
+/* Initialize empty KeyArray */
static void
-addDatum(DatumArray *datums, Datum datum)
+initKeyArray(KeyArray *keys, int32 maxvalues)
{
- if (datums->nvalues >= datums->maxvalues)
+ keys->keys = (Datum *) palloc(sizeof(Datum) * maxvalues);
+ keys->categories = (GinNullCategory *)
+ palloc(sizeof(GinNullCategory) * maxvalues);
+ keys->nvalues = 0;
+ keys->maxvalues = maxvalues;
+}
+
+/* Add datum to KeyArray, resizing if needed */
+static void
+addDatum(KeyArray *keys, Datum datum, GinNullCategory category)
+{
+ if (keys->nvalues >= keys->maxvalues)
{
- datums->maxvalues *= 2;
- datums->values = (Datum *) repalloc(datums->values,
- sizeof(Datum) * datums->maxvalues);
+ keys->maxvalues *= 2;
+ keys->keys = (Datum *)
+ repalloc(keys->keys, sizeof(Datum) * keys->maxvalues);
+ keys->categories = (GinNullCategory *)
+ repalloc(keys->categories, sizeof(GinNullCategory) * keys->maxvalues);
}
- datums->values[datums->nvalues++] = datum;
+ keys->keys[keys->nvalues] = datum;
+ keys->categories[keys->nvalues] = category;
+ keys->nvalues++;
}
/*
- * Go through all tuples >= startoff on page and collect values in memory
+ * Collect data from a pending-list page in preparation for insertion into
+ * the main index.
+ *
+ * Go through all tuples >= startoff on page and collect values in accum
*
- * Note that da is just workspace --- it does not carry any state across
+ * Note that ka is just workspace --- it does not carry any state across
* calls.
*/
static void
-processPendingPage(BuildAccumulator *accum, DatumArray *da,
+processPendingPage(BuildAccumulator *accum, KeyArray *ka,
Page page, OffsetNumber startoff)
{
ItemPointerData heapptr;
OffsetNumber i,
maxoff;
- OffsetNumber attrnum,
- curattnum;
+ OffsetNumber attrnum;
- /* reset *da to empty */
- da->nvalues = 0;
+ /* reset *ka to empty */
+ ka->nvalues = 0;
maxoff = PageGetMaxOffsetNumber(page);
Assert(maxoff >= FirstOffsetNumber);
@@ -632,7 +664,11 @@ processPendingPage(BuildAccumulator *accum, DatumArray *da,
for (i = startoff; i <= maxoff; i = OffsetNumberNext(i))
{
IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
+ OffsetNumber curattnum;
+ Datum curkey;
+ GinNullCategory curcategory;
+ /* Check for change of heap TID or attnum */
curattnum = gintuple_get_attrnum(accum->ginstate, itup);
if (!ItemPointerIsValid(&heapptr))
@@ -644,18 +680,25 @@ processPendingPage(BuildAccumulator *accum, DatumArray *da,
curattnum == attrnum))
{
/*
- * We can insert several datums per call, but only for one heap
- * tuple and one column.
+ * ginInsertBAEntries can insert several datums per call, but only
+ * for one heap tuple and one column. So call it at a boundary,
+ * and reset ka.
*/
- ginInsertRecordBA(accum, &heapptr, attrnum, da->values, da->nvalues);
- da->nvalues = 0;
+ ginInsertBAEntries(accum, &heapptr, attrnum,
+ ka->keys, ka->categories, ka->nvalues);
+ ka->nvalues = 0;
heapptr = itup->t_tid;
attrnum = curattnum;
}
- addDatum(da, gin_index_getattr(accum->ginstate, itup));
+
+ /* Add key to KeyArray */
+ curkey = gintuple_get_key(accum->ginstate, itup, &curcategory);
+ addDatum(ka, curkey, curcategory);
}
- ginInsertRecordBA(accum, &heapptr, attrnum, da->values, da->nvalues);
+ /* Dump out all remaining keys */
+ ginInsertBAEntries(accum, &heapptr, attrnum,
+ ka->keys, ka->categories, ka->nvalues);
}
/*
@@ -679,9 +722,10 @@ processPendingPage(BuildAccumulator *accum, DatumArray *da,
* If stats isn't null, we count deleted pending pages into the counts.
*/
void
-ginInsertCleanup(Relation index, GinState *ginstate,
+ginInsertCleanup(GinState *ginstate,
bool vac_delay, IndexBulkDeleteResult *stats)
{
+ Relation index = ginstate->index;
Buffer metabuffer,
buffer;
Page metapage,
@@ -690,7 +734,7 @@ ginInsertCleanup(Relation index, GinState *ginstate,
MemoryContext opCtx,
oldCtx;
BuildAccumulator accum;
- DatumArray datums;
+ KeyArray datums;
BlockNumber blkno;
metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO);
@@ -726,10 +770,7 @@ ginInsertCleanup(Relation index, GinState *ginstate,
oldCtx = MemoryContextSwitchTo(opCtx);
- datums.maxvalues = 128;
- datums.nvalues = 0;
- datums.values = (Datum *) palloc(sizeof(Datum) * datums.maxvalues);
-
+ initKeyArray(&datums, 128);
ginInitBA(&accum);
accum.ginstate = ginstate;
@@ -748,7 +789,7 @@ ginInsertCleanup(Relation index, GinState *ginstate,
}
/*
- * read page's datums into memory
+ * read page's datums into accum
*/
processPendingPage(&accum, &datums, page, FirstOffsetNumber);
@@ -769,7 +810,8 @@ ginInsertCleanup(Relation index, GinState *ginstate,
{
ItemPointerData *list;
uint32 nlist;
- Datum entry;
+ Datum key;
+ GinNullCategory category;
OffsetNumber maxoff,
attnum;
@@ -786,9 +828,12 @@ ginInsertCleanup(Relation index, GinState *ginstate,
* significant amount of time - so, run it without locking pending
* list.
*/
- while ((list = ginGetEntry(&accum, &attnum, &entry, &nlist)) != NULL)
+ ginBeginBAScan(&accum);
+ while ((list = ginGetBAEntry(&accum,
+ &attnum, &key, &category, &nlist)) != NULL)
{
- ginEntryInsert(index, ginstate, attnum, entry, list, nlist, FALSE);
+ ginEntryInsert(ginstate, attnum, key, category,
+ list, nlist, NULL);
if (vac_delay)
vacuum_delay_point();
}
@@ -820,8 +865,11 @@ ginInsertCleanup(Relation index, GinState *ginstate,
ginInitBA(&accum);
processPendingPage(&accum, &datums, page, maxoff + 1);
- while ((list = ginGetEntry(&accum, &attnum, &entry, &nlist)) != NULL)
- ginEntryInsert(index, ginstate, attnum, entry, list, nlist, FALSE);
+ ginBeginBAScan(&accum);
+ while ((list = ginGetBAEntry(&accum,
+ &attnum, &key, &category, &nlist)) != NULL)
+ ginEntryInsert(ginstate, attnum, key, category,
+ list, nlist, NULL);
}
/*
@@ -855,9 +903,8 @@ ginInsertCleanup(Relation index, GinState *ginstate,
* release memory used so far and reinit state
*/
MemoryContextReset(opCtx);
+ initKeyArray(&datums, datums.maxvalues);
ginInitBA(&accum);
- datums.nvalues = 0;
- datums.values = (Datum *) palloc(sizeof(Datum) * datums.maxvalues);
}
else
{
diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c
index 705d167963..866785837f 100644
--- a/src/backend/access/gin/ginget.c
+++ b/src/backend/access/gin/ginget.c
@@ -4,17 +4,17 @@
* fetch tuples from a GIN scan.
*
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/gin/ginget.c,v 1.30 2010/02/26 02:00:33 momjian Exp $
+ * src/backend/access/gin/ginget.c
*-------------------------------------------------------------------------
*/
#include "postgres.h"
-#include "access/gin.h"
+#include "access/gin_private.h"
#include "access/relscan.h"
#include "catalog/index.h"
#include "miscadmin.h"
@@ -34,25 +34,58 @@ typedef struct pendingPosition
/*
- * Tries to refind previously taken ItemPointer on page.
+ * Convenience function for invoking a key's consistentFn
*/
static bool
-findItemInPage(Page page, ItemPointer item, OffsetNumber *off)
+callConsistentFn(GinState *ginstate, GinScanKey key)
+{
+ /*
+ * If we're dealing with a dummy EVERYTHING key, we don't want to call the
+ * consistentFn; just claim it matches.
+ */
+ if (key->searchMode == GIN_SEARCH_MODE_EVERYTHING)
+ {
+ key->recheckCurItem = false;
+ return true;
+ }
+
+ /*
+ * Initialize recheckCurItem in case the consistentFn doesn't know it
+ * should set it. The safe assumption in that case is to force recheck.
+ */
+ key->recheckCurItem = true;
+
+ return DatumGetBool(FunctionCall8Coll(&ginstate->consistentFn[key->attnum - 1],
+ ginstate->supportCollation[key->attnum - 1],
+ PointerGetDatum(key->entryRes),
+ UInt16GetDatum(key->strategy),
+ key->query,
+ UInt32GetDatum(key->nuserentries),
+ PointerGetDatum(key->extra_data),
+ PointerGetDatum(&key->recheckCurItem),
+ PointerGetDatum(key->queryValues),
+ PointerGetDatum(key->queryCategories)));
+}
+
+/*
+ * Tries to re-find a previously-taken ItemPointer on a posting page.
+ */
+static bool
+findItemInPostingPage(Page page, ItemPointer item, OffsetNumber *off)
{
OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff;
int res;
if (GinPageGetOpaque(page)->flags & GIN_DELETED)
- /* page was deleted by concurrent vacuum */
+ /* page was deleted by concurrent vacuum */
return false;
/*
* scan page to find equal or first greater value
*/
-
for (*off = FirstOffsetNumber; *off <= maxoff; (*off)++)
{
- res = compareItemPointers(item, (ItemPointer) GinDataPageGetItem(page, *off));
+ res = ginCompareItemPointers(item, (ItemPointer) GinDataPageGetItem(page, *off));
if (res <= 0)
return true;
@@ -80,7 +113,9 @@ moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack)
return false; /* no more pages */
LockBuffer(stack->buffer, GIN_UNLOCK);
- stack->buffer = ReleaseAndReadBuffer(stack->buffer, btree->index, stack->blkno);
+ stack->buffer = ReleaseAndReadBuffer(stack->buffer,
+ btree->index,
+ stack->blkno);
LockBuffer(stack->buffer, GIN_SHARE);
stack->off = FirstOffsetNumber;
}
@@ -89,71 +124,94 @@ moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack)
}
/*
- * Does fullscan of posting tree and saves ItemPointers
- * in scanEntry->partialMatch TIDBitmap
+ * Scan all pages of a posting tree and save all its heap ItemPointers
+ * in scanEntry->matchBitmap
*/
static void
-scanForItems(Relation index, GinScanEntry scanEntry, BlockNumber rootPostingTree)
+scanPostingTree(Relation index, GinScanEntry scanEntry,
+ BlockNumber rootPostingTree)
{
GinPostingTreeScan *gdi;
Buffer buffer;
Page page;
BlockNumber blkno;
- gdi = prepareScanPostingTree(index, rootPostingTree, TRUE);
+ /* Descend to the leftmost leaf page */
+ gdi = ginPrepareScanPostingTree(index, rootPostingTree, TRUE);
- buffer = scanBeginPostingTree(gdi);
+ buffer = ginScanBeginPostingTree(gdi);
IncrBufferRefCount(buffer); /* prevent unpin in freeGinBtreeStack */
freeGinBtreeStack(gdi->stack);
pfree(gdi);
/*
- * Goes through all leaves
+ * Loop iterates through all leaf pages of posting tree
*/
for (;;)
{
page = BufferGetPage(buffer);
- if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0 && GinPageGetOpaque(page)->maxoff >= FirstOffsetNumber)
+ if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0 &&
+ GinPageGetOpaque(page)->maxoff >= FirstOffsetNumber)
{
- tbm_add_tuples(scanEntry->partialMatch,
+ tbm_add_tuples(scanEntry->matchBitmap,
(ItemPointer) GinDataPageGetItem(page, FirstOffsetNumber),
GinPageGetOpaque(page)->maxoff, false);
scanEntry->predictNumberResult += GinPageGetOpaque(page)->maxoff;
}
- blkno = GinPageGetOpaque(page)->rightlink;
if (GinPageRightMost(page))
- {
- UnlockReleaseBuffer(buffer);
- return; /* no more pages */
- }
+ break; /* no more pages */
+ blkno = GinPageGetOpaque(page)->rightlink;
LockBuffer(buffer, GIN_UNLOCK);
buffer = ReleaseAndReadBuffer(buffer, index, blkno);
LockBuffer(buffer, GIN_SHARE);
}
+
+ UnlockReleaseBuffer(buffer);
}
/*
- * Collects all ItemPointer into the TIDBitmap struct
- * for entries partially matched to search entry.
+ * Collects TIDs into scanEntry->matchBitmap for all heap tuples that
+ * match the search entry. This supports three different match modes:
*
- * Returns true if done, false if it's needed to restart scan from scratch
+ * 1. Partial-match support: scan from current point until the
+ * comparePartialFn says we're done.
+ * 2. SEARCH_MODE_ALL: scan from current point (which should be first
+ * key for the current attnum) until we hit null items or end of attnum
+ * 3. SEARCH_MODE_EVERYTHING: scan from current point (which should be first
+ * key for the current attnum) until we hit end of attnum
+ *
+ * Returns true if done, false if it's necessary to restart scan from scratch
*/
static bool
-computePartialMatchList(GinBtreeData *btree, GinBtreeStack *stack, GinScanEntry scanEntry)
+collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
+ GinScanEntry scanEntry)
{
- Page page;
- IndexTuple itup;
- Datum idatum;
- int32 cmp;
+ OffsetNumber attnum;
+ Form_pg_attribute attr;
+
+ /* Initialize empty bitmap result */
+ scanEntry->matchBitmap = tbm_create(work_mem * 1024L);
- scanEntry->partialMatch = tbm_create(work_mem * 1024L);
+ /* Null query cannot partial-match anything */
+ if (scanEntry->isPartialMatch &&
+ scanEntry->queryCategory != GIN_CAT_NORM_KEY)
+ return true;
+
+ /* Locate tupdesc entry for key column (for attbyval/attlen data) */
+ attnum = scanEntry->attnum;
+ attr = btree->ginstate->origTupdesc->attrs[attnum - 1];
for (;;)
{
+ Page page;
+ IndexTuple itup;
+ Datum idatum;
+ GinNullCategory icategory;
+
/*
* stack->off points to the interested entry, buffer is already locked
*/
@@ -166,56 +224,85 @@ computePartialMatchList(GinBtreeData *btree, GinBtreeStack *stack, GinScanEntry
/*
* If tuple stores another attribute then stop scan
*/
- if (gintuple_get_attrnum(btree->ginstate, itup) != scanEntry->attnum)
+ if (gintuple_get_attrnum(btree->ginstate, itup) != attnum)
return true;
- idatum = gin_index_getattr(btree->ginstate, itup);
+ /* Safe to fetch attribute value */
+ idatum = gintuple_get_key(btree->ginstate, itup, &icategory);
-
- /*----------
- * Check of partial match.
- * case cmp == 0 => match
- * case cmp > 0 => not match and finish scan
- * case cmp < 0 => not match and continue scan
- *----------
+ /*
+ * Check for appropriate scan stop conditions
*/
- cmp = DatumGetInt32(FunctionCall4(&btree->ginstate->comparePartialFn[scanEntry->attnum - 1],
- scanEntry->entry,
- idatum,
- UInt16GetDatum(scanEntry->strategy),
+ if (scanEntry->isPartialMatch)
+ {
+ int32 cmp;
+
+ /*
+ * In partial match, stop scan at any null (including
+ * placeholders); partial matches never match nulls
+ */
+ if (icategory != GIN_CAT_NORM_KEY)
+ return true;
+
+ /*----------
+ * Check of partial match.
+ * case cmp == 0 => match
+ * case cmp > 0 => not match and finish scan
+ * case cmp < 0 => not match and continue scan
+ *----------
+ */
+ cmp = DatumGetInt32(FunctionCall4Coll(&btree->ginstate->comparePartialFn[attnum - 1],
+ btree->ginstate->supportCollation[attnum - 1],
+ scanEntry->queryKey,
+ idatum,
+ UInt16GetDatum(scanEntry->strategy),
PointerGetDatum(scanEntry->extra_data)));
- if (cmp > 0)
- return true;
- else if (cmp < 0)
+ if (cmp > 0)
+ return true;
+ else if (cmp < 0)
+ {
+ stack->off++;
+ continue;
+ }
+ }
+ else if (scanEntry->searchMode == GIN_SEARCH_MODE_ALL)
{
- stack->off++;
- continue;
+ /*
+ * In ALL mode, we are not interested in null items, so we can
+ * stop if we get to a null-item placeholder (which will be the
+ * last entry for a given attnum). We do want to include NULL_KEY
+ * and EMPTY_ITEM entries, though.
+ */
+ if (icategory == GIN_CAT_NULL_ITEM)
+ return true;
}
+ /*
+ * OK, we want to return the TIDs listed in this entry.
+ */
if (GinIsPostingTree(itup))
{
BlockNumber rootPostingTree = GinGetPostingTree(itup);
- Datum newDatum,
- savedDatum = datumCopy(
- idatum,
- btree->ginstate->origTupdesc->attrs[scanEntry->attnum - 1]->attbyval,
- btree->ginstate->origTupdesc->attrs[scanEntry->attnum - 1]->attlen
- );
/*
* We should unlock current page (but not unpin) during tree scan
* to prevent deadlock with vacuum processes.
*
- * We save current entry value (savedDatum) to be able to refind
- * our tuple after re-locking
+ * We save current entry value (idatum) to be able to re-find our
+ * tuple after re-locking
*/
+ if (icategory == GIN_CAT_NORM_KEY)
+ idatum = datumCopy(idatum, attr->attbyval, attr->attlen);
+
LockBuffer(stack->buffer, GIN_UNLOCK);
- scanForItems(btree->index, scanEntry, rootPostingTree);
+
+ /* Collect all the TIDs in this entry's posting tree */
+ scanPostingTree(btree->index, scanEntry, rootPostingTree);
/*
* We lock again the entry page and while it was unlocked insert
- * might occured, so we need to refind our position
+ * might have occurred, so we need to re-find our position.
*/
LockBuffer(stack->buffer, GIN_SHARE);
page = BufferGetPage(stack->buffer);
@@ -223,44 +310,49 @@ computePartialMatchList(GinBtreeData *btree, GinBtreeStack *stack, GinScanEntry
{
/*
* Root page becomes non-leaf while we unlock it. We will
- * start again, this situation doesn't cause often - root can
- * became a non-leaf only one per life of index.
+ * start again; this situation doesn't occur often - a root can
+ * become a non-leaf only once in the life of the index.
*/
-
return false;
}
+ /* Search forward to re-find idatum */
for (;;)
{
+ Datum newDatum;
+ GinNullCategory newCategory;
+
if (moveRightIfItNeeded(btree, stack) == false)
elog(ERROR, "lost saved point in index"); /* must not happen !!! */
page = BufferGetPage(stack->buffer);
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));
- newDatum = gin_index_getattr(btree->ginstate, itup);
- if (gintuple_get_attrnum(btree->ginstate, itup) != scanEntry->attnum)
+ if (gintuple_get_attrnum(btree->ginstate, itup) != attnum)
elog(ERROR, "lost saved point in index"); /* must not happen !!! */
+ newDatum = gintuple_get_key(btree->ginstate, itup,
+ &newCategory);
- if (compareEntries(btree->ginstate, scanEntry->attnum, newDatum, savedDatum) == 0)
- {
- /* Found! */
- if (btree->ginstate->origTupdesc->attrs[scanEntry->attnum - 1]->attbyval == false)
- pfree(DatumGetPointer(savedDatum));
- break;
- }
+ if (ginCompareEntries(btree->ginstate, attnum,
+ newDatum, newCategory,
+ idatum, icategory) == 0)
+ break; /* Found! */
stack->off++;
}
+
+ if (icategory == GIN_CAT_NORM_KEY && !attr->attbyval)
+ pfree(DatumGetPointer(idatum));
}
else
{
- tbm_add_tuples(scanEntry->partialMatch, GinGetPosting(itup), GinGetNPosting(itup), false);
+ tbm_add_tuples(scanEntry->matchBitmap,
+ GinGetPosting(itup), GinGetNPosting(itup), false);
scanEntry->predictNumberResult += GinGetNPosting(itup);
}
/*
- * Ok, we save ItemPointers, go to the next entry
+ * Done with this entry, go to the next
*/
stack->off++;
}
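
The tri-state protocol documented above for comparePartialFn (zero = match, greater than zero = stop, less than zero = keep scanning) is easy to see with ordinary strings. The toy below (not from the patch; the comparator and data are made up) runs a prefix scan over sorted keys the same way collectMatchBitmap drives the real support function.

#include <stdio.h>
#include <string.h>

/*
 * Toy comparePartialFn for prefix search: 0 = entry matches the prefix,
 * > 0 = entry sorts past every possible match (stop), < 0 = keep scanning.
 */
static int
compare_partial(const char *query, const char *entry)
{
    int cmp = strncmp(entry, query, strlen(query));

    return (cmp == 0) ? 0 : cmp;
}

int
main(void)
{
    /* pretend entry-tree keys in index order, from where the scan starts */
    const char *entries[] = {"quack", "quark", "quartz", "queen", "rust"};
    const char *query = "quar";
    int i;

    for (i = 0; i < 5; i++)
    {
        int cmp = compare_partial(query, entries[i]);

        if (cmp > 0)
            break;              /* sorted order guarantees no more matches */
        if (cmp == 0)
            printf("match: %s\n", entries[i]);
        /* cmp < 0: not a match yet, keep scanning */
    }
    return 0;
}
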
@@ -272,72 +364,72 @@ computePartialMatchList(GinBtreeData *btree, GinBtreeStack *stack, GinScanEntry
* Start* functions setup beginning state of searches: finds correct buffer and pins it.
*/
static void
-startScanEntry(Relation index, GinState *ginstate, GinScanEntry entry)
+startScanEntry(GinState *ginstate, GinScanEntry entry)
{
GinBtreeData btreeEntry;
GinBtreeStack *stackEntry;
Page page;
- bool needUnlock = TRUE;
+ bool needUnlock;
+restartScanEntry:
entry->buffer = InvalidBuffer;
+ ItemPointerSetMin(&entry->curItem);
entry->offset = InvalidOffsetNumber;
entry->list = NULL;
entry->nlist = 0;
- entry->partialMatch = NULL;
- entry->partialMatchResult = NULL;
+ entry->matchBitmap = NULL;
+ entry->matchResult = NULL;
entry->reduceResult = FALSE;
entry->predictNumberResult = 0;
- if (entry->master != NULL)
- {
- entry->isFinished = entry->master->isFinished;
- return;
- }
-
/*
* we should find entry, and begin scan of posting tree or just store
* posting list in memory
*/
-
- prepareEntryScan(&btreeEntry, index, entry->attnum, entry->entry, ginstate);
+ ginPrepareEntryScan(&btreeEntry, entry->attnum,
+ entry->queryKey, entry->queryCategory,
+ ginstate);
btreeEntry.searchMode = TRUE;
stackEntry = ginFindLeafPage(&btreeEntry, NULL);
page = BufferGetPage(stackEntry->buffer);
+ needUnlock = TRUE;
entry->isFinished = TRUE;
- if (entry->isPartialMatch)
+ if (entry->isPartialMatch ||
+ entry->queryCategory == GIN_CAT_EMPTY_QUERY)
{
/*
- * btreeEntry.findItem points to the first equal or greater value than
- * needed. So we will scan further and collect all ItemPointers
+ * btreeEntry.findItem locates the first item >= given search key.
+ * (For GIN_CAT_EMPTY_QUERY, it will find the leftmost index item
+ * because of the way the GIN_CAT_EMPTY_QUERY category code is
+ * assigned.) We scan forward from there and collect all TIDs needed
+ * for the entry type.
*/
btreeEntry.findItem(&btreeEntry, stackEntry);
- if (computePartialMatchList(&btreeEntry, stackEntry, entry) == false)
+ if (collectMatchBitmap(&btreeEntry, stackEntry, entry) == false)
{
/*
* GIN tree was seriously restructured, so we will cleanup all
* found data and rescan. See comments near 'return false' in
- * computePartialMatchList()
+ * collectMatchBitmap()
*/
- if (entry->partialMatch)
+ if (entry->matchBitmap)
{
- if (entry->partialMatchIterator)
- tbm_end_iterate(entry->partialMatchIterator);
- entry->partialMatchIterator = NULL;
- tbm_free(entry->partialMatch);
- entry->partialMatch = NULL;
+ if (entry->matchIterator)
+ tbm_end_iterate(entry->matchIterator);
+ entry->matchIterator = NULL;
+ tbm_free(entry->matchBitmap);
+ entry->matchBitmap = NULL;
}
LockBuffer(stackEntry->buffer, GIN_UNLOCK);
freeGinBtreeStack(stackEntry);
-
- startScanEntry(index, ginstate, entry);
- return;
+ goto restartScanEntry;
}
- if (entry->partialMatch && !tbm_is_empty(entry->partialMatch))
+ if (entry->matchBitmap && !tbm_is_empty(entry->matchBitmap))
{
- entry->partialMatchIterator = tbm_begin_iterate(entry->partialMatch);
+ entry->matchIterator = tbm_begin_iterate(entry->matchBitmap);
entry->isFinished = FALSE;
}
}
@@ -352,17 +444,17 @@ startScanEntry(Relation index, GinState *ginstate, GinScanEntry entry)
Page page;
/*
- * We should unlock entry page before make deal with posting tree
- * to prevent deadlocks with vacuum processes. Because entry is
- * never deleted from page and posting tree is never reduced to
- * the posting list, we can unlock page after getting BlockNumber
- * of root of posting tree.
+ * We should unlock entry page before touching posting tree to
+ * prevent deadlocks with vacuum processes. Because entry is never
+ * deleted from page and posting tree is never reduced to the
+ * posting list, we can unlock page after getting BlockNumber of
+ * root of posting tree.
*/
LockBuffer(stackEntry->buffer, GIN_UNLOCK);
needUnlock = FALSE;
- gdi = prepareScanPostingTree(index, rootPostingTree, TRUE);
+ gdi = ginPrepareScanPostingTree(ginstate->index, rootPostingTree, TRUE);
- entry->buffer = scanBeginPostingTree(gdi);
+ entry->buffer = ginScanBeginPostingTree(gdi);
/*
* We keep buffer pinned because we need to prevent deletion of
@@ -402,19 +494,23 @@ startScanEntry(Relation index, GinState *ginstate, GinScanEntry entry)
}
static void
-startScanKey(Relation index, GinState *ginstate, GinScanKey key)
+startScanKey(GinState *ginstate, GinScanKey key)
{
- uint32 i;
-
- if (!key->firstCall)
- return;
+ ItemPointerSetMin(&key->curItem);
+ key->curItemMatches = false;
+ key->recheckCurItem = false;
+ key->isFinished = false;
+}
- for (i = 0; i < key->nentries; i++)
- startScanEntry(index, ginstate, key->scanEntry + i);
+static void
+startScan(IndexScanDesc scan)
+{
+ GinScanOpaque so = (GinScanOpaque) scan->opaque;
+ GinState *ginstate = &so->ginstate;
+ uint32 i;
- memset(key->entryRes, TRUE, sizeof(bool) * key->nentries);
- key->isFinished = FALSE;
- key->firstCall = FALSE;
+ for (i = 0; i < so->totalentries; i++)
+ startScanEntry(ginstate, so->entries[i]);
if (GinFuzzySearchLimit > 0)
{
@@ -425,27 +521,20 @@ startScanKey(Relation index, GinState *ginstate, GinScanKey key)
* minimal predictNumberResult.
*/
- for (i = 0; i < key->nentries; i++)
- if (key->scanEntry[i].predictNumberResult <= key->nentries * GinFuzzySearchLimit)
+ for (i = 0; i < so->totalentries; i++)
+ if (so->entries[i]->predictNumberResult <= so->totalentries * GinFuzzySearchLimit)
return;
- for (i = 0; i < key->nentries; i++)
- if (key->scanEntry[i].predictNumberResult > key->nentries * GinFuzzySearchLimit)
+ for (i = 0; i < so->totalentries; i++)
+ if (so->entries[i]->predictNumberResult > so->totalentries * GinFuzzySearchLimit)
{
- key->scanEntry[i].predictNumberResult /= key->nentries;
- key->scanEntry[i].reduceResult = TRUE;
+ so->entries[i]->predictNumberResult /= so->totalentries;
+ so->entries[i]->reduceResult = TRUE;
}
}
-}
-
-static void
-startScan(IndexScanDesc scan)
-{
- uint32 i;
- GinScanOpaque so = (GinScanOpaque) scan->opaque;
for (i = 0; i < so->nkeys; i++)
- startScanKey(scan->indexRelation, &so->ginstate, so->keys + i);
+ startScanKey(ginstate, so->keys + i);
}
/*
@@ -454,18 +543,16 @@ startScan(IndexScanDesc scan)
* to prevent interference with vacuum
*/
static void
-entryGetNextItem(Relation index, GinScanEntry entry)
+entryGetNextItem(GinState *ginstate, GinScanEntry entry)
{
Page page;
BlockNumber blkno;
for (;;)
{
- entry->offset++;
-
- if (entry->offset <= entry->nlist)
+ if (entry->offset < entry->nlist)
{
- entry->curItem = entry->list[entry->offset - 1];
+ entry->curItem = entry->list[entry->offset++];
return;
}
@@ -484,18 +571,21 @@ entryGetNextItem(Relation index, GinScanEntry entry)
if (blkno == InvalidBlockNumber)
{
ReleaseBuffer(entry->buffer);
- ItemPointerSet(&entry->curItem, InvalidBlockNumber, InvalidOffsetNumber);
+ ItemPointerSetInvalid(&entry->curItem);
entry->buffer = InvalidBuffer;
entry->isFinished = TRUE;
return;
}
- entry->buffer = ReleaseAndReadBuffer(entry->buffer, index, blkno);
+ entry->buffer = ReleaseAndReadBuffer(entry->buffer,
+ ginstate->index,
+ blkno);
LockBuffer(entry->buffer, GIN_SHARE);
page = BufferGetPage(entry->buffer);
entry->offset = InvalidOffsetNumber;
- if (!ItemPointerIsValid(&entry->curItem) || findItemInPage(page, &entry->curItem, &entry->offset))
+ if (!ItemPointerIsValid(&entry->curItem) ||
+ findItemInPostingPage(page, &entry->curItem, &entry->offset))
{
/*
* Found position equal to or greater than stored
@@ -507,13 +597,13 @@ entryGetNextItem(Relation index, GinScanEntry entry)
LockBuffer(entry->buffer, GIN_UNLOCK);
if (!ItemPointerIsValid(&entry->curItem) ||
- compareItemPointers(&entry->curItem, entry->list + entry->offset - 1) == 0)
+ ginCompareItemPointers(&entry->curItem,
+ entry->list + entry->offset - 1) == 0)
{
/*
* First pages are deleted or empty, or we found exact
* position, so break inner loop and continue outer one.
*/
-
break;
}
@@ -532,52 +622,57 @@ entryGetNextItem(Relation index, GinScanEntry entry)
#define dropItem(e) ( gin_rand() > ((double)GinFuzzySearchLimit)/((double)((e)->predictNumberResult)) )
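+
+/*
+ * dropItem keeps each candidate TID with probability
+ * GinFuzzySearchLimit / predictNumberResult (assuming gin_rand() is
+ * uniform on [0, 1)), so an entry stream expected to produce about
+ * predictNumberResult items is thinned to roughly GinFuzzySearchLimit
+ * items on average.
+ */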
/*
- * Sets entry->curItem to new found heap item pointer for one
- * entry of one scan key
+ * Sets entry->curItem to the next heap item pointer for one entry of one
+ * scan key, or sets entry->isFinished to TRUE if there are no more.
+ *
+ * Item pointers must be returned in ascending order.
+ *
+ * Note: this can return a "lossy page" item pointer, indicating that the
+ * entry potentially matches all items on that heap page. However, it is
+ * not allowed to return both a lossy page pointer and exact (regular)
+ * item pointers for the same page. (Doing so would break the key-combination
+ * logic in keyGetItem and scanGetItem; see comment in scanGetItem.) In the
+ * current implementation this is guaranteed by the behavior of tidbitmaps.
*/
-static bool
-entryGetItem(Relation index, GinScanEntry entry)
+static void
+entryGetItem(GinState *ginstate, GinScanEntry entry)
{
- if (entry->master)
- {
- entry->isFinished = entry->master->isFinished;
- entry->curItem = entry->master->curItem;
- }
- else if (entry->partialMatch)
+ Assert(!entry->isFinished);
+
+ if (entry->matchBitmap)
{
do
{
- if (entry->partialMatchResult == NULL ||
- entry->offset >= entry->partialMatchResult->ntuples)
+ if (entry->matchResult == NULL ||
+ entry->offset >= entry->matchResult->ntuples)
{
- entry->partialMatchResult = tbm_iterate(entry->partialMatchIterator);
+ entry->matchResult = tbm_iterate(entry->matchIterator);
- if (entry->partialMatchResult == NULL)
+ if (entry->matchResult == NULL)
{
- ItemPointerSet(&entry->curItem, InvalidBlockNumber, InvalidOffsetNumber);
- tbm_end_iterate(entry->partialMatchIterator);
- entry->partialMatchIterator = NULL;
+ ItemPointerSetInvalid(&entry->curItem);
+ tbm_end_iterate(entry->matchIterator);
+ entry->matchIterator = NULL;
entry->isFinished = TRUE;
break;
}
/*
- * reset counter to the beginning of
- * entry->partialMatchResult. Note: entry->offset is still
- * greater than partialMatchResult->ntuples if
- * partialMatchResult is lossy. So, on next call we will get
- * next result from TIDBitmap.
+ * Reset the counter to the beginning of entry->matchResult. Note:
+ * entry->offset is still greater than matchResult->ntuples if
+ * matchResult is lossy, so on the next call we will fetch the next
+ * result from the TIDBitmap.
*/
entry->offset = 0;
}
- if (entry->partialMatchResult->ntuples < 0)
+ if (entry->matchResult->ntuples < 0)
{
/*
* lossy result, so we need to check the whole page
*/
ItemPointerSetLossyPage(&entry->curItem,
- entry->partialMatchResult->blockno);
+ entry->matchResult->blockno);
/*
* We might as well fall out of the loop; we could not
@@ -588,8 +683,8 @@ entryGetItem(Relation index, GinScanEntry entry)
}
ItemPointerSet(&entry->curItem,
- entry->partialMatchResult->blockno,
- entry->partialMatchResult->offsets[entry->offset]);
+ entry->matchResult->blockno,
+ entry->matchResult->offsets[entry->offset]);
entry->offset++;
} while (entry->reduceResult == TRUE && dropItem(entry));
}
@@ -600,7 +695,7 @@ entryGetItem(Relation index, GinScanEntry entry)
entry->curItem = entry->list[entry->offset - 1];
else
{
- ItemPointerSet(&entry->curItem, InvalidBlockNumber, InvalidOffsetNumber);
+ ItemPointerSetInvalid(&entry->curItem);
entry->isFinished = TRUE;
}
}
@@ -608,137 +703,350 @@ entryGetItem(Relation index, GinScanEntry entry)
{
do
{
- entryGetNextItem(index, entry);
- } while (entry->isFinished == FALSE && entry->reduceResult == TRUE && dropItem(entry));
+ entryGetNextItem(ginstate, entry);
+ } while (entry->isFinished == FALSE &&
+ entry->reduceResult == TRUE &&
+ dropItem(entry));
}
-
- return entry->isFinished;
}
/*
- * Sets key->curItem to new found heap item pointer for one scan key
- * Returns isFinished, ie TRUE means we did NOT get a new item pointer!
- * Also, *keyrecheck is set true if recheck is needed for this scan key.
- * Note: lossy page could be returned after items from the same page.
+ * Identify the "current" item among the input entry streams for this scan key,
+ * and test whether it passes the scan key qual condition.
+ *
+ * The current item is the smallest curItem among the inputs. key->curItem
+ * is set to that value. key->curItemMatches is set to indicate whether that
+ * TID passes the consistentFn test. If so, key->recheckCurItem is set true
+ * iff recheck is needed for this item pointer (including the case where the
+ * item pointer is a lossy page pointer).
+ *
+ * If all entry streams are exhausted, sets key->isFinished to TRUE.
+ *
+ * Item pointers must be returned in ascending order.
+ *
+ * Note: this can return a "lossy page" item pointer, indicating that the
+ * key potentially matches all items on that heap page. However, it is
+ * not allowed to return both a lossy page pointer and exact (regular)
+ * item pointers for the same page. (Doing so would break the key-combination
+ * logic in scanGetItem.)
*/
-static bool
-keyGetItem(Relation index, GinState *ginstate, MemoryContext tempCtx,
- GinScanKey key, bool *keyrecheck)
+static void
+keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
{
+ ItemPointerData minItem;
+ ItemPointerData curPageLossy;
uint32 i;
+ uint32 lossyEntry;
+ bool haveLossyEntry;
GinScanEntry entry;
bool res;
MemoryContext oldCtx;
- if (key->isFinished)
- return TRUE;
+ Assert(!key->isFinished);
- do
+ /*
+ * Find the minimum of the active entry curItems.
+ *
+ * Note: a lossy-page entry is encoded by an ItemPointer with the maximum
+ * value for offset (0xffff), so that it will sort after any exact entries
+ * for the same page. So we'll prefer to return exact pointers, not lossy
+ * pointers, which is good.
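+ *
+ * For example, an exact TID such as (42,7) sorts before the lossy
+ * whole-page pointer (42,0xffff) for the same block.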
+ */
+ ItemPointerSetMax(&minItem);
+
+ for (i = 0; i < key->nentries; i++)
+ {
+ entry = key->scanEntry[i];
+ if (entry->isFinished == FALSE &&
+ ginCompareItemPointers(&entry->curItem, &minItem) < 0)
+ minItem = entry->curItem;
+ }
+
+ if (ItemPointerIsMax(&minItem))
+ {
+ /* all entries are finished */
+ key->isFinished = TRUE;
+ return;
+ }
+
+ /*
+ * We might have already tested this item; if so, no need to repeat work.
+ * (Note: the ">" case can happen, if minItem is exact but we previously
+ * had to set curItem to a lossy-page pointer.)
+ */
+ if (ginCompareItemPointers(&key->curItem, &minItem) >= 0)
+ return;
+
+ /*
+ * OK, advance key->curItem and perform consistentFn test.
+ */
+ key->curItem = minItem;
+
+ /*
+ * Lossy-page entries pose a problem, since we don't know the correct
+ * entryRes state to pass to the consistentFn, and we also don't know what
+ * its combining logic will be (could be AND, OR, or even NOT). If the
+ * logic is OR then the consistentFn might succeed for all items in the
+ * lossy page even when none of the other entries match.
+ *
+ * If we have a single lossy-page entry then we check to see if the
+ * consistentFn will succeed with only that entry TRUE. If so, we return
+ * a lossy-page pointer to indicate that the whole heap page must be
+ * checked. (On subsequent calls, we'll do nothing until minItem is past
+ * the page altogether, thus ensuring that we never return both regular
+ * and lossy pointers for the same page.)
+ *
+ * This idea could be generalized to more than one lossy-page entry, but
+ * ideally lossy-page entries should be infrequent so it would seldom be
+ * the case that we have more than one at once. So it doesn't seem worth
+ * the extra complexity to optimize that case. If we do find more than
+ * one, we just punt and return a lossy-page pointer always.
+ *
+ * Note that only lossy-page entries pointing to the current item's page
+ * should trigger this processing; we might have future lossy pages in the
+ * entry array, but they aren't relevant yet.
+ */
+ ItemPointerSetLossyPage(&curPageLossy,
+ GinItemPointerGetBlockNumber(&key->curItem));
+
+ lossyEntry = 0;
+ haveLossyEntry = false;
+ for (i = 0; i < key->nentries; i++)
+ {
+ entry = key->scanEntry[i];
+ if (entry->isFinished == FALSE &&
+ ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0)
+ {
+ if (haveLossyEntry)
+ {
+ /* Multiple lossy entries, punt */
+ key->curItem = curPageLossy;
+ key->curItemMatches = true;
+ key->recheckCurItem = true;
+ return;
+ }
+ lossyEntry = i;
+ haveLossyEntry = true;
+ }
+ }
+
+ /* prepare for calling consistentFn in temp context */
+ oldCtx = MemoryContextSwitchTo(tempCtx);
+
+ if (haveLossyEntry)
+ {
+ /* Single lossy-page entry, so see if whole page matches */
+ memset(key->entryRes, FALSE, key->nentries);
+ key->entryRes[lossyEntry] = TRUE;
+
+ if (callConsistentFn(ginstate, key))
+ {
+ /* Yes, so clean up ... */
+ MemoryContextSwitchTo(oldCtx);
+ MemoryContextReset(tempCtx);
+
+ /* and return lossy pointer for whole page */
+ key->curItem = curPageLossy;
+ key->curItemMatches = true;
+ key->recheckCurItem = true;
+ return;
+ }
+ }
+
+ /*
+ * At this point we know that we don't need to return a lossy whole-page
+ * pointer, but we might have matches for individual exact item pointers,
+ * possibly in combination with a lossy pointer. Our strategy if there's
+ * a lossy pointer is to try the consistentFn both ways and return a hit
+ * if it accepts either one (forcing the hit to be marked lossy so it will
+ * be rechecked). An exception is that we don't need to try it both ways
+ * if the lossy pointer is in a "hidden" entry, because the consistentFn's
+ * result can't depend on that.
+ *
+ * Prepare entryRes array to be passed to consistentFn.
+ */
+ for (i = 0; i < key->nentries; i++)
+ {
+ entry = key->scanEntry[i];
+ if (entry->isFinished == FALSE &&
+ ginCompareItemPointers(&entry->curItem, &key->curItem) == 0)
+ key->entryRes[i] = TRUE;
+ else
+ key->entryRes[i] = FALSE;
+ }
+ if (haveLossyEntry)
+ key->entryRes[lossyEntry] = TRUE;
+
+ res = callConsistentFn(ginstate, key);
+
+ if (!res && haveLossyEntry && lossyEntry < key->nuserentries)
+ {
+ /* try the other way for the lossy item */
+ key->entryRes[lossyEntry] = FALSE;
+
+ res = callConsistentFn(ginstate, key);
+ }
+
+ key->curItemMatches = res;
+ /* If we matched a lossy entry, force recheckCurItem = true */
+ if (haveLossyEntry)
+ key->recheckCurItem = true;
+
+ /* clean up after consistentFn calls */
+ MemoryContextSwitchTo(oldCtx);
+ MemoryContextReset(tempCtx);
+}
+
+/*
+ * Get the next heap item pointer (after advancePast) from the scan.
+ * Returns true if anything was found.
+ * On success, *item and *recheck are set.
+ *
+ * Note: this is very nearly the same logic as in keyGetItem(), except
+ * that we know the keys are to be combined with AND logic, whereas in
+ * keyGetItem() the combination logic is known only to the consistentFn.
+ */
+static bool
+scanGetItem(IndexScanDesc scan, ItemPointer advancePast,
+ ItemPointerData *item, bool *recheck)
+{
+ GinScanOpaque so = (GinScanOpaque) scan->opaque;
+ GinState *ginstate = &so->ginstate;
+ ItemPointerData myAdvancePast = *advancePast;
+ uint32 i;
+ bool allFinished;
+ bool match;
+
+ for (;;)
{
/*
- * move forward from previously value and set new curItem, which is
- * minimal from entries->curItems. Lossy page is encoded by
- * ItemPointer with max value for offset (0xffff), so if there is an
- * non-lossy entries on lossy page they will returned too and after
- * that the whole page. That's not a problem for resulting tidbitmap.
+ * Advance any entries that are <= myAdvancePast. In particular,
+ * since entry->curItem was initialized with ItemPointerSetMin, this
+ * ensures we fetch the first item for each entry on the first call.
*/
- ItemPointerSetMax(&key->curItem);
- for (i = 0; i < key->nentries; i++)
+ allFinished = TRUE;
+
+ for (i = 0; i < so->totalentries; i++)
{
- entry = key->scanEntry + i;
+ GinScanEntry entry = so->entries[i];
- if (key->entryRes[i])
- {
- /*
- * Move forward only entries which was the least on previous
- * call, key->entryRes[i] points that current entry was a
- * result of loop/call.
- */
- if (entry->isFinished == FALSE && entryGetItem(index, entry) == FALSE)
- {
- if (compareItemPointers(&entry->curItem, &key->curItem) < 0)
- key->curItem = entry->curItem;
- }
- else
- key->entryRes[i] = FALSE;
- }
- else if (entry->isFinished == FALSE)
- {
- if (compareItemPointers(&entry->curItem, &key->curItem) < 0)
- key->curItem = entry->curItem;
- }
+ while (entry->isFinished == FALSE &&
+ ginCompareItemPointers(&entry->curItem,
+ &myAdvancePast) <= 0)
+ entryGetItem(ginstate, entry);
+
+ if (entry->isFinished == FALSE)
+ allFinished = FALSE;
}
- if (ItemPointerIsMax(&key->curItem))
+ if (allFinished)
{
- /* all entries are finished */
- key->isFinished = TRUE;
- return TRUE;
+ /* all entries exhausted, so we're done */
+ return false;
}
/*
- * Now key->curItem contains closest ItemPointer to previous result.
- *
- * if key->nentries == 1 then the consistentFn should always succeed,
- * but we must call it anyway to find out the recheck status.
+ * Perform the consistentFn test for each scan key. If any key
+ * reports isFinished, meaning its subset of the entries is exhausted,
+ * we can stop. Otherwise, set *item to the minimum of the key
+ * curItems.
*/
+ ItemPointerSetMax(item);
+
+ for (i = 0; i < so->nkeys; i++)
+ {
+ GinScanKey key = so->keys + i;
+
+ keyGetItem(&so->ginstate, so->tempCtx, key);
+
+ if (key->isFinished)
+ return false; /* finished one of keys */
+
+ if (ginCompareItemPointers(&key->curItem, item) < 0)
+ *item = key->curItem;
+ }
+
+ Assert(!ItemPointerIsMax(item));
/*----------
- * entryRes array is used for:
- * - as an argument for consistentFn
- * - entry->curItem with corresponding key->entryRes[i] == false are
- * greater than key->curItem, so next loop/call they should be
- * renewed by entryGetItem(). So, we need to set up an array before
- * checking of lossy page.
+ * Now *item contains the first ItemPointer after the previous result.
+ *
+ * The item is a valid hit only if all the keys succeeded for either
+ * that exact TID, or a lossy reference to the same page.
+ *
+ * This logic works only if a keyGetItem stream can never contain both
+ * exact and lossy pointers for the same page. Else we could have a
+ * case like
+ *
+ * stream 1 stream 2
+ * ... ...
+ * 42/6 42/7
+ * 50/1 42/0xffff
+ * ... ...
+ *
+ * We would conclude that 42/6 is not a match and advance stream 1,
+ * thus never detecting the match to the lossy pointer in stream 2.
+ * (keyGetItem has a similar problem versus entryGetItem.)
*----------
*/
- for (i = 0; i < key->nentries; i++)
+ match = true;
+ for (i = 0; i < so->nkeys; i++)
{
- entry = key->scanEntry + i;
+ GinScanKey key = so->keys + i;
- if (entry->isFinished == FALSE &&
- compareItemPointers(&entry->curItem, &key->curItem) == 0)
- key->entryRes[i] = TRUE;
- else
- key->entryRes[i] = FALSE;
+ if (key->curItemMatches)
+ {
+ if (ginCompareItemPointers(item, &key->curItem) == 0)
+ continue;
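+ /* a lossy pointer to the same page as *item also counts as a match */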
+ if (ItemPointerIsLossyPage(&key->curItem) &&
+ GinItemPointerGetBlockNumber(&key->curItem) ==
+ GinItemPointerGetBlockNumber(item))
+ continue;
+ }
+ match = false;
+ break;
}
- /*
- * Initialize *keyrecheck in case the consistentFn doesn't know it
- * should set it. The safe assumption in that case is to force
- * recheck.
- */
- *keyrecheck = true;
+ if (match)
+ break;
/*
- * If one of the entry's scans returns lossy result, return it without
- * further checking - we can't call consistentFn for lack of data.
+ * No hit. Update myAdvancePast to this TID, so that on the next pass
+ * we'll move to the next possible entry.
*/
- if (ItemPointerIsLossyPage(&key->curItem))
- return FALSE;
-
- oldCtx = MemoryContextSwitchTo(tempCtx);
- res = DatumGetBool(FunctionCall6(&ginstate->consistentFn[key->attnum - 1],
- PointerGetDatum(key->entryRes),
- UInt16GetDatum(key->strategy),
- key->query,
- UInt32GetDatum(key->nentries),
- PointerGetDatum(key->extra_data),
- PointerGetDatum(keyrecheck)));
- MemoryContextSwitchTo(oldCtx);
- MemoryContextReset(tempCtx);
- } while (!res);
+ myAdvancePast = *item;
+ }
- return FALSE;
+ /*
+ * We must return recheck = true if any of the keys are marked recheck.
+ */
+ *recheck = false;
+ for (i = 0; i < so->nkeys; i++)
+ {
+ GinScanKey key = so->keys + i;
+
+ if (key->recheckCurItem)
+ {
+ *recheck = true;
+ break;
+ }
+ }
+
+ return TRUE;
}
/*
+ * Functions for scanning the pending list
+ */
+
+
+/*
* Get ItemPointer of next heap row to be checked from pending list.
- * Returns false if there are no more. On pages with several rows
+ * Returns false if there are no more. On pages with several heap rows
* it returns each row separately, on page with part of heap row returns
- * per page data. pos->firstOffset and pos->lastOffset points
- * fraction of tuples for current heap row.
+ * per page data. pos->firstOffset and pos->lastOffset are set to identify
+ * the range of pending-list tuples belonging to this heap row.
*
* The pendingBuffer is presumed pinned and share-locked on entry, and is
* pinned and share-locked on success exit. On failure exit it's released.
@@ -810,10 +1118,9 @@ scanGetCandidate(IndexScanDesc scan, pendingPosition *pos)
/*
* Now pos->firstOffset points to the first tuple of current heap
- * row, pos->lastOffset points to the first tuple of second heap
- * row (or to the end of page)
+ * row, pos->lastOffset points to the first tuple of next heap row
+ * (or to the end of page)
*/
-
break;
}
}
@@ -822,35 +1129,47 @@ scanGetCandidate(IndexScanDesc scan, pendingPosition *pos)
}
/*
- * Scan page from current tuple (off) up till the first of:
+ * Scan the pending-list page from the current tuple (off) until the first of:
* - match is found (then returns true)
* - no later match is possible
* - tuple's attribute number is not equal to entry's attrnum
* - reach end of page
+ *
+ * datum[]/category[]/datumExtracted[] arrays are used to cache the results
+ * of gintuple_get_key() on the current page.
*/
static bool
matchPartialInPendingList(GinState *ginstate, Page page,
OffsetNumber off, OffsetNumber maxoff,
- Datum value, OffsetNumber attrnum,
- Datum *datum, bool *datumExtracted,
- StrategyNumber strategy,
- Pointer extra_data)
+ GinScanEntry entry,
+ Datum *datum, GinNullCategory *category,
+ bool *datumExtracted)
{
IndexTuple itup;
int32 cmp;
+ /* Partial match to a null is not possible */
+ if (entry->queryCategory != GIN_CAT_NORM_KEY)
+ return false;
+
while (off < maxoff)
{
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
- if (attrnum != gintuple_get_attrnum(ginstate, itup))
+
+ if (gintuple_get_attrnum(ginstate, itup) != entry->attnum)
return false;
if (datumExtracted[off - 1] == false)
{
- datum[off - 1] = gin_index_getattr(ginstate, itup);
+ datum[off - 1] = gintuple_get_key(ginstate, itup,
+ &category[off - 1]);
datumExtracted[off - 1] = true;
}
+ /* Once we hit nulls (which sort after all ordinary keys), no further match is possible */
+ if (category[off - 1] != GIN_CAT_NORM_KEY)
+ return false;
+
/*----------
* Check partial match.
* case cmp == 0 => match
@@ -858,11 +1177,12 @@ matchPartialInPendingList(GinState *ginstate, Page page,
* case cmp < 0 => not match and continue scan
*----------
*/
- cmp = DatumGetInt32(FunctionCall4(&ginstate->comparePartialFn[attrnum - 1],
- value,
- datum[off - 1],
- UInt16GetDatum(strategy),
- PointerGetDatum(extra_data)));
+ cmp = DatumGetInt32(FunctionCall4Coll(&ginstate->comparePartialFn[entry->attnum - 1],
+ ginstate->supportCollation[entry->attnum - 1],
+ entry->queryKey,
+ datum[off - 1],
+ UInt16GetDatum(entry->strategy),
+ PointerGetDatum(entry->extra_data)));
if (cmp == 0)
return true;
else if (cmp > 0)
@@ -874,27 +1194,20 @@ matchPartialInPendingList(GinState *ginstate, Page page,
return false;
}
-static bool
-hasAllMatchingKeys(GinScanOpaque so, pendingPosition *pos)
-{
- int i;
-
- for (i = 0; i < so->nkeys; i++)
- if (pos->hasMatchKey[i] == false)
- return false;
-
- return true;
-}
-
/*
- * Sets entryRes array for each key by looking at
- * every entry per indexed value (heap's row) in pending list.
- * returns true if at least one of datum was matched by key's entry
+ * Set up the entryRes array for each key by looking at
+ * every entry for the current heap row in the pending list.
+ *
+ * Returns true if each scan key has at least one entryRes match.
+ * This corresponds to the situations where the normal index search will
+ * try to apply the key's consistentFn. (A tuple not meeting that requirement
+ * cannot be returned by the normal search since no entry stream will
+ * source its TID.)
*
* The pendingBuffer is presumed pinned and share-locked on entry.
*/
static bool
-collectDatumForItem(IndexScanDesc scan, pendingPosition *pos)
+collectMatchesForHeapRow(IndexScanDesc scan, pendingPosition *pos)
{
GinScanOpaque so = (GinScanOpaque) scan->opaque;
OffsetNumber attrnum;
@@ -904,7 +1217,7 @@ collectDatumForItem(IndexScanDesc scan, pendingPosition *pos)
j;
/*
- * Resets entryRes
+ * Reset all entryRes and hasMatchKey flags
*/
for (i = 0; i < so->nkeys; i++)
{
@@ -914,13 +1227,19 @@ collectDatumForItem(IndexScanDesc scan, pendingPosition *pos)
}
memset(pos->hasMatchKey, FALSE, so->nkeys);
+ /*
+ * Outer loop iterates over multiple pending-list pages when a single heap
+ * row has entries spanning those pages.
+ */
for (;;)
{
Datum datum[BLCKSZ / sizeof(IndexTupleData)];
+ GinNullCategory category[BLCKSZ / sizeof(IndexTupleData)];
bool datumExtracted[BLCKSZ / sizeof(IndexTupleData)];
Assert(pos->lastOffset > pos->firstOffset);
- memset(datumExtracted + pos->firstOffset - 1, 0, sizeof(bool) * (pos->lastOffset - pos->firstOffset));
+ memset(datumExtracted + pos->firstOffset - 1, 0,
+ sizeof(bool) * (pos->lastOffset - pos->firstOffset));
page = BufferGetPage(pos->pendingBuffer);
@@ -930,128 +1249,174 @@ collectDatumForItem(IndexScanDesc scan, pendingPosition *pos)
for (j = 0; j < key->nentries; j++)
{
+ GinScanEntry entry = key->scanEntry[j];
OffsetNumber StopLow = pos->firstOffset,
StopHigh = pos->lastOffset,
StopMiddle;
- GinScanEntry entry = key->scanEntry + j;
- /* already true - do not extra work */
+ /* If already matched on earlier page, do no extra work */
if (key->entryRes[j])
continue;
/*
- * Interested tuples are from pos->firstOffset to
+ * Interesting tuples are from pos->firstOffset to
* pos->lastOffset and they are ordered by (attnum, Datum) as
- * it's done in entry tree So we could use binary search to
- * prevent linear scanning
+ * it's done in entry tree. So we can use binary search to
+ * avoid linear scanning.
*/
while (StopLow < StopHigh)
{
+ int res;
+
StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, StopMiddle));
+
attrnum = gintuple_get_attrnum(&so->ginstate, itup);
if (key->attnum < attrnum)
+ {
StopHigh = StopMiddle;
- else if (key->attnum > attrnum)
+ continue;
+ }
+ if (key->attnum > attrnum)
+ {
StopLow = StopMiddle + 1;
- else
+ continue;
+ }
+
+ if (datumExtracted[StopMiddle - 1] == false)
{
- int res;
+ datum[StopMiddle - 1] =
+ gintuple_get_key(&so->ginstate, itup,
+ &category[StopMiddle - 1]);
+ datumExtracted[StopMiddle - 1] = true;
+ }
- if (datumExtracted[StopMiddle - 1] == false)
+ if (entry->queryCategory == GIN_CAT_EMPTY_QUERY)
+ {
+ /* special behavior depending on searchMode */
+ if (entry->searchMode == GIN_SEARCH_MODE_ALL)
{
- datum[StopMiddle - 1] = gin_index_getattr(&so->ginstate, itup);
- datumExtracted[StopMiddle - 1] = true;
+ /* match anything except NULL_ITEM */
+ if (category[StopMiddle - 1] == GIN_CAT_NULL_ITEM)
+ res = -1;
+ else
+ res = 0;
}
- res = compareEntries(&so->ginstate,
- entry->attnum,
- entry->entry,
- datum[StopMiddle - 1]);
-
- if (res == 0)
+ else
{
- /*
- * The exact match causes, so we just scan from
- * current position to find a partial match. See
- * comment above about tuple's ordering.
- */
- if (entry->isPartialMatch)
- key->entryRes[j] =
- matchPartialInPendingList(&so->ginstate,
- page, StopMiddle,
- pos->lastOffset,
- entry->entry,
- entry->attnum,
- datum,
- datumExtracted,
- entry->strategy,
- entry->extra_data);
- else
- key->entryRes[j] = true;
- break;
+ /* match everything */
+ res = 0;
}
- else if (res < 0)
- StopHigh = StopMiddle;
+ }
+ else
+ {
+ res = ginCompareEntries(&so->ginstate,
+ entry->attnum,
+ entry->queryKey,
+ entry->queryCategory,
+ datum[StopMiddle - 1],
+ category[StopMiddle - 1]);
+ }
+
+ if (res == 0)
+ {
+ /*
+ * Found exact match (there can be only one, except in
+ * EMPTY_QUERY mode).
+ *
+ * If doing partial match, scan forward from here to
+ * end of page to check for matches.
+ *
+ * See comment above about tuple's ordering.
+ */
+ if (entry->isPartialMatch)
+ key->entryRes[j] =
+ matchPartialInPendingList(&so->ginstate,
+ page,
+ StopMiddle,
+ pos->lastOffset,
+ entry,
+ datum,
+ category,
+ datumExtracted);
else
- StopLow = StopMiddle + 1;
+ key->entryRes[j] = true;
+
+ /* done with binary search */
+ break;
}
+ else if (res < 0)
+ StopHigh = StopMiddle;
+ else
+ StopLow = StopMiddle + 1;
}
if (StopLow >= StopHigh && entry->isPartialMatch)
{
/*
- * The exact match wasn't found, so we need to start scan
- * from first tuple greater then current entry See comment
- * above about tuple's ordering.
+ * No exact match on this page. If doing partial match,
+ * scan from the first tuple greater than target value to
+ * end of page. Note that since we don't remember whether
+ * the comparePartialFn told us to stop early on a
+ * previous page, we will uselessly apply comparePartialFn
+ * to the first tuple on each subsequent page.
*/
key->entryRes[j] =
matchPartialInPendingList(&so->ginstate,
- page, StopHigh,
+ page,
+ StopHigh,
pos->lastOffset,
- entry->entry,
- entry->attnum,
+ entry,
datum,
- datumExtracted,
- entry->strategy,
- entry->extra_data);
+ category,
+ datumExtracted);
}
pos->hasMatchKey[i] |= key->entryRes[j];
}
}
+ /* Advance firstOffset over the scanned tuples */
pos->firstOffset = pos->lastOffset;
if (GinPageHasFullRow(page))
{
/*
- * We scan all values from one tuple, go to next one
+ * We have examined all pending entries for the current heap row.
+ * Break out of loop over pages.
*/
-
- return hasAllMatchingKeys(so, pos);
+ break;
}
else
{
- ItemPointerData item = pos->item;
-
/*
- * need to get next portion of tuples of row containing on several
- * pages
+ * Advance to next page of pending entries for the current heap
+ * row. Complain if there isn't one.
*/
+ ItemPointerData item = pos->item;
- if (scanGetCandidate(scan, pos) == false || !ItemPointerEquals(&pos->item, &item))
- elog(ERROR, "Could not process tuple"); /* XXX should not be
- * here ! */
+ if (scanGetCandidate(scan, pos) == false ||
+ !ItemPointerEquals(&pos->item, &item))
+ elog(ERROR, "could not find additional pending pages for same heap tuple");
}
}
- return hasAllMatchingKeys(so, pos);
+ /*
+ * Now return "true" if all scan keys have at least one matching datum
+ */
+ for (i = 0; i < so->nkeys; i++)
+ {
+ if (pos->hasMatchKey[i] == false)
+ return false;
+ }
+
+ return true;
}
/*
- * Collect all matched rows from pending list in bitmap
+ * Collect all matched rows from pending list into bitmap
*/
static void
scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
@@ -1059,7 +1424,6 @@ scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
GinScanOpaque so = (GinScanOpaque) scan->opaque;
MemoryContext oldCtx;
bool recheck,
- keyrecheck,
match;
int i;
pendingPosition pos;
@@ -1095,16 +1459,18 @@ scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
while (scanGetCandidate(scan, &pos))
{
/*
- * Check entries in tuple and setup entryRes array If tuples of heap's
- * row are placed on several pages collectDatumForItem will read all
- * of that pages.
+ * Check entries in tuple and set up entryRes array.
+ *
+ * If pending tuples belonging to the current heap row are spread
+ * across several pages, collectMatchesForHeapRow will read all of
+ * those pages.
*/
- if (!collectDatumForItem(scan, &pos))
+ if (!collectMatchesForHeapRow(scan, &pos))
continue;
/*
- * Matching of entries of one row is finished, so check row by
- * consistent function.
+ * Matching of entries of one row is finished, so check row using
+ * consistent functions.
*/
oldCtx = MemoryContextSwitchTo(so->tempCtx);
recheck = false;
@@ -1114,21 +1480,12 @@ scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
{
GinScanKey key = so->keys + i;
- keyrecheck = true;
-
- if (!DatumGetBool(FunctionCall6(&so->ginstate.consistentFn[key->attnum - 1],
- PointerGetDatum(key->entryRes),
- UInt16GetDatum(key->strategy),
- key->query,
- UInt32GetDatum(key->nentries),
- PointerGetDatum(key->extra_data),
- PointerGetDatum(&keyrecheck))))
+ if (!callConsistentFn(&so->ginstate, key))
{
match = false;
break;
}
-
- recheck |= keyrecheck;
+ recheck |= key->recheckCurItem;
}
MemoryContextSwitchTo(oldCtx);
@@ -1144,84 +1501,9 @@ scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
pfree(pos.hasMatchKey);
}
-/*
- * Get heap item pointer from scan
- * returns true if found
- */
-static bool
-scanGetItem(IndexScanDesc scan, ItemPointerData *item, bool *recheck)
-{
- GinScanOpaque so = (GinScanOpaque) scan->opaque;
- uint32 i;
- bool keyrecheck;
-
- /*
- * We return recheck = true if any of the keyGetItem calls return
- * keyrecheck = true. Note that because the second loop might advance
- * some keys, this could theoretically be too conservative. In practice
- * though, we expect that a consistentFn's recheck result will depend only
- * on the operator and the query, so for any one key it should stay the
- * same regardless of advancing to new items. So it's not worth working
- * harder.
- */
- *recheck = false;
-
- ItemPointerSetMin(item);
- for (i = 0; i < so->nkeys; i++)
- {
- GinScanKey key = so->keys + i;
-
- if (keyGetItem(scan->indexRelation, &so->ginstate, so->tempCtx,
- key, &keyrecheck))
- return FALSE; /* finished one of keys */
- if (compareItemPointers(item, &key->curItem) < 0)
- *item = key->curItem;
- *recheck |= keyrecheck;
- }
-
- for (i = 1; i <= so->nkeys; i++)
- {
- GinScanKey key = so->keys + i - 1;
-
- for (;;)
- {
- int cmp = compareItemPointers(item, &key->curItem);
-
- if (cmp != 0 && (ItemPointerIsLossyPage(item) || ItemPointerIsLossyPage(&key->curItem)))
- {
- /*
- * if one of ItemPointers points to the whole page then
- * compare only page's number
- */
- if (ItemPointerGetBlockNumber(item) == ItemPointerGetBlockNumber(&key->curItem))
- cmp = 0;
- else
- cmp = (ItemPointerGetBlockNumber(item) > ItemPointerGetBlockNumber(&key->curItem)) ? 1 : -1;
- }
-
- if (cmp == 0)
- break;
- else if (cmp > 0)
- {
- if (keyGetItem(scan->indexRelation, &so->ginstate, so->tempCtx,
- key, &keyrecheck))
- return FALSE; /* finished one of keys */
- *recheck |= keyrecheck;
- }
- else
- { /* returns to begin */
- *item = key->curItem;
- i = 0;
- break;
- }
- }
- }
-
- return TRUE;
-}
#define GinIsNewKey(s) ( ((GinScanOpaque) scan->opaque)->keys == NULL )
-#define GinIsVoidRes(s) ( ((GinScanOpaque) scan->opaque)->isVoidRes == true )
+#define GinIsVoidRes(s) ( ((GinScanOpaque) scan->opaque)->isVoidRes )
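+
+/*
+ * Note that both macros above reference the local variable "scan" rather
+ * than their argument, so they can be used only where such a variable is
+ * in scope.
+ */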
Datum
gingetbitmap(PG_FUNCTION_ARGS)
@@ -1229,9 +1511,14 @@ gingetbitmap(PG_FUNCTION_ARGS)
IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
TIDBitmap *tbm = (TIDBitmap *) PG_GETARG_POINTER(1);
int64 ntids;
+ ItemPointerData iptr;
+ bool recheck;
+ /*
+ * Set up the scan keys, and check for unsatisfiable query.
+ */
if (GinIsNewKey(scan))
- newScanKey(scan);
+ ginNewScanKey(scan);
if (GinIsVoidRes(scan))
PG_RETURN_INT64(0);
@@ -1255,14 +1542,13 @@ gingetbitmap(PG_FUNCTION_ARGS)
*/
startScan(scan);
+ ItemPointerSetMin(&iptr);
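+ /*
+ * iptr doubles as scanGetItem's advancePast argument and its result, so
+ * start it out below any valid heap TID.
+ */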
+
for (;;)
{
- ItemPointerData iptr;
- bool recheck;
-
CHECK_FOR_INTERRUPTS();
- if (!scanGetItem(scan, &iptr, &recheck))
+ if (!scanGetItem(scan, &iptr, &iptr, &recheck))
break;
if (ItemPointerIsLossyPage(&iptr))
diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c
index e2a5e8b013..3e32af94a9 100644
--- a/src/backend/access/gin/gininsert.c
+++ b/src/backend/access/gin/gininsert.c
@@ -4,21 +4,21 @@
* insert routines for the postgres inverted index access method.
*
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/gin/gininsert.c,v 1.26 2010/02/11 14:29:50 teodor Exp $
+ * src/backend/access/gin/gininsert.c
*-------------------------------------------------------------------------
*/
#include "postgres.h"
-#include "access/genam.h"
-#include "access/gin.h"
+#include "access/gin_private.h"
#include "catalog/index.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
+#include "storage/smgr.h"
#include "storage/indexfsm.h"
#include "utils/memutils.h"
@@ -27,14 +27,17 @@ typedef struct
{
GinState ginstate;
double indtuples;
+ GinStatsData buildStats;
MemoryContext tmpCtx;
MemoryContext funcCtx;
BuildAccumulator accum;
} GinBuildState;
/*
- * Creates posting tree with one page. Function
- * suppose that items[] fits to page
+ * Creates a new posting tree with a single page, containing the given TIDs.
+ * Returns the page number (which will be the root of this posting tree).
+ *
+ * items[] must be in sorted order with no duplicates.
*/
static BlockNumber
createPostingTree(Relation index, ItemPointerData *items, uint32 nitems)
@@ -43,6 +46,9 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems)
Buffer buffer = GinNewBuffer(index);
Page page;
+ /* Assert that the items[] array will fit on one page */
+ Assert(nitems <= GinMaxLeafDataItems);
+
START_CRIT_SECTION();
GinInitBuffer(buffer, GIN_DATA | GIN_LEAF);
@@ -54,7 +60,7 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems)
MarkBufferDirty(buffer);
- if (!index->rd_istemp)
+ if (RelationNeedsWAL(index))
{
XLogRecPtr recptr;
XLogRecData rdata[2];
@@ -74,12 +80,9 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems)
rdata[1].len = sizeof(ItemPointerData) * nitems;
rdata[1].next = NULL;
-
-
recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_PTREE, rdata);
PageSetLSN(page, recptr);
PageSetTLI(page, ThisTimeLineID);
-
}
UnlockReleaseBuffer(buffer);
@@ -91,143 +94,245 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems)
/*
- * Adds array of item pointers to tuple's posting list or
- * creates posting tree and tuple pointed to tree in a case
+ * Adds an array of item pointers to the tuple's posting list, or
+ * creates a posting tree and a tuple pointing to that tree in the case
* of not enough space. Max size of tuple is defined in
- * GinFormTuple().
+ * GinFormTuple(). Returns a new, modified index tuple.
+ * items[] must be in sorted order with no duplicates.
*/
static IndexTuple
-addItemPointersToTuple(Relation index, GinState *ginstate, GinBtreeStack *stack,
- IndexTuple old, ItemPointerData *items, uint32 nitem, bool isBuild)
+addItemPointersToLeafTuple(GinState *ginstate,
+ IndexTuple old,
+ ItemPointerData *items, uint32 nitem,
+ GinStatsData *buildStats)
{
- Datum key = gin_index_getattr(ginstate, old);
- OffsetNumber attnum = gintuple_get_attrnum(ginstate, old);
- IndexTuple res = GinFormTuple(index, ginstate, attnum, key,
- NULL, nitem + GinGetNPosting(old),
- false);
+ OffsetNumber attnum;
+ Datum key;
+ GinNullCategory category;
+ IndexTuple res;
+
+ Assert(!GinIsPostingTree(old));
+
+ attnum = gintuple_get_attrnum(ginstate, old);
+ key = gintuple_get_key(ginstate, old, &category);
+
+ /* try to build tuple with room for all the items */
+ res = GinFormTuple(ginstate, attnum, key, category,
+ NULL, nitem + GinGetNPosting(old),
+ false);
if (res)
{
/* good, small enough */
uint32 newnitem;
- newnitem = MergeItemPointers(GinGetPosting(res),
- GinGetPosting(old), GinGetNPosting(old),
- items, nitem);
+ /* fill in the posting list with union of old and new TIDs */
+ newnitem = ginMergeItemPointers(GinGetPosting(res),
+ GinGetPosting(old),
+ GinGetNPosting(old),
+ items, nitem);
/* merge might have eliminated some duplicate items */
GinShortenTuple(res, newnitem);
}
else
{
+ /* posting list would be too big, convert to posting tree */
BlockNumber postingRoot;
GinPostingTreeScan *gdi;
- /* posting list becomes big, so we need to make posting's tree */
- res = GinFormTuple(index, ginstate, attnum, key, NULL, 0, true);
- postingRoot = createPostingTree(index, GinGetPosting(old), GinGetNPosting(old));
- GinSetPostingTree(res, postingRoot);
+ /*
+ * Initialize posting tree with the old tuple's posting list. It's
+ * surely small enough to fit on one posting-tree page, and should
+ * already be in order with no duplicates.
+ */
+ postingRoot = createPostingTree(ginstate->index,
+ GinGetPosting(old),
+ GinGetNPosting(old));
- gdi = prepareScanPostingTree(index, postingRoot, FALSE);
- gdi->btree.isBuild = isBuild;
+ /* During index build, count the newly-added data page */
+ if (buildStats)
+ buildStats->nDataPages++;
- insertItemPointer(gdi, items, nitem);
+ /* Now insert the TIDs-to-be-added into the posting tree */
+ gdi = ginPrepareScanPostingTree(ginstate->index, postingRoot, FALSE);
+ gdi->btree.isBuild = (buildStats != NULL);
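+ /* buildStats is non-NULL only during an index build, so it doubles as the isBuild flag */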
+
+ ginInsertItemPointers(gdi, items, nitem, buildStats);
pfree(gdi);
+
+ /* And build a new posting-tree-only result tuple */
+ res = GinFormTuple(ginstate, attnum, key, category, NULL, 0, true);
+ GinSetPostingTree(res, postingRoot);
+ }
+
+ return res;
+}
+
+/*
+ * Build a fresh leaf tuple, either posting-list or posting-tree format
+ * depending on whether the given items list will fit.
+ * items[] must be in sorted order with no duplicates.
+ *
+ * This is basically the same logic as in addItemPointersToLeafTuple,
+ * but working from slightly different input.
+ */
+static IndexTuple
+buildFreshLeafTuple(GinState *ginstate,
+ OffsetNumber attnum, Datum key, GinNullCategory category,
+ ItemPointerData *items, uint32 nitem,
+ GinStatsData *buildStats)
+{
+ IndexTuple res;
+
+ /* try to build tuple with room for all the items */
+ res = GinFormTuple(ginstate, attnum, key, category,
+ items, nitem, false);
+
+ if (!res)
+ {
+ /* posting list would be too big, build posting tree */
+ BlockNumber postingRoot;
+
+ /*
+ * Build posting-tree-only result tuple. We do this first so as to
+ * fail quickly if the key is too big.
+ */
+ res = GinFormTuple(ginstate, attnum, key, category, NULL, 0, true);
+
+ /*
+ * Initialize posting tree with as many TIDs as will fit on the first
+ * page.
+ */
+ postingRoot = createPostingTree(ginstate->index,
+ items,
+ Min(nitem, GinMaxLeafDataItems));
+
+ /* During index build, count the newly-added data page */
+ if (buildStats)
+ buildStats->nDataPages++;
+
+ /* Add any remaining TIDs to the posting tree */
+ if (nitem > GinMaxLeafDataItems)
+ {
+ GinPostingTreeScan *gdi;
+
+ gdi = ginPrepareScanPostingTree(ginstate->index, postingRoot, FALSE);
+ gdi->btree.isBuild = (buildStats != NULL);
+
+ ginInsertItemPointers(gdi,
+ items + GinMaxLeafDataItems,
+ nitem - GinMaxLeafDataItems,
+ buildStats);
+
+ pfree(gdi);
+ }
+
+ /* And save the root link in the result tuple */
+ GinSetPostingTree(res, postingRoot);
}
return res;
}
/*
- * Inserts only one entry to the index, but it can add more than 1 ItemPointer.
+ * Insert one or more heap TIDs associated with the given key value.
+ * This will either add a single key entry, or enlarge a pre-existing entry.
+ *
+ * During an index build, buildStats is non-null and the counters
+ * it contains should be incremented as needed.
*/
void
-ginEntryInsert(Relation index, GinState *ginstate,
- OffsetNumber attnum, Datum value,
+ginEntryInsert(GinState *ginstate,
+ OffsetNumber attnum, Datum key, GinNullCategory category,
ItemPointerData *items, uint32 nitem,
- bool isBuild)
+ GinStatsData *buildStats)
{
GinBtreeData btree;
GinBtreeStack *stack;
IndexTuple itup;
Page page;
- prepareEntryScan(&btree, index, attnum, value, ginstate);
+ /* During index build, count the to-be-inserted entry */
+ if (buildStats)
+ buildStats->nEntries++;
+
+ ginPrepareEntryScan(&btree, attnum, key, category, ginstate);
stack = ginFindLeafPage(&btree, NULL);
page = BufferGetPage(stack->buffer);
if (btree.findItem(&btree, stack))
{
- /* found entry */
+ /* found pre-existing entry */
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));
if (GinIsPostingTree(itup))
{
- /* lock root of posting tree */
- GinPostingTreeScan *gdi;
+ /* add entries to existing posting tree */
BlockNumber rootPostingTree = GinGetPostingTree(itup);
+ GinPostingTreeScan *gdi;
/* release all stack */
LockBuffer(stack->buffer, GIN_UNLOCK);
freeGinBtreeStack(stack);
/* insert into posting tree */
- gdi = prepareScanPostingTree(index, rootPostingTree, FALSE);
- gdi->btree.isBuild = isBuild;
- insertItemPointer(gdi, items, nitem);
+ gdi = ginPrepareScanPostingTree(ginstate->index, rootPostingTree, FALSE);
+ gdi->btree.isBuild = (buildStats != NULL);
+ ginInsertItemPointers(gdi, items, nitem, buildStats);
+ pfree(gdi);
return;
}
- itup = addItemPointersToTuple(index, ginstate, stack, itup, items, nitem, isBuild);
+ /* modify an existing leaf entry */
+ itup = addItemPointersToLeafTuple(ginstate, itup,
+ items, nitem, buildStats);
btree.isDelete = TRUE;
}
else
{
- /* We suppose that tuple can store at least one itempointer */
- itup = GinFormTuple(index, ginstate, attnum, value, items, 1, true);
-
- if (nitem > 1)
- {
- /* Add the rest, making a posting tree if necessary */
- IndexTuple previtup = itup;
-
- itup = addItemPointersToTuple(index, ginstate, stack, previtup, items + 1, nitem - 1, isBuild);
- pfree(previtup);
- }
+ /* no match, so construct a new leaf entry */
+ itup = buildFreshLeafTuple(ginstate, attnum, key, category,
+ items, nitem, buildStats);
}
+ /* Insert the new or modified leaf tuple */
btree.entry = itup;
- ginInsertValue(&btree, stack);
+ ginInsertValue(&btree, stack, buildStats);
pfree(itup);
}
/*
- * Saves indexed value in memory accumulator during index creation
- * Function isn't used during normal insert
+ * Extract index entries for a single indexable item, and add them to the
+ * BuildAccumulator's state.
+ *
+ * This function is used only during initial index creation.
*/
-static uint32
-ginHeapTupleBulkInsert(GinBuildState *buildstate, OffsetNumber attnum, Datum value, ItemPointer heapptr)
+static void
+ginHeapTupleBulkInsert(GinBuildState *buildstate, OffsetNumber attnum,
+ Datum value, bool isNull,
+ ItemPointer heapptr)
{
Datum *entries;
+ GinNullCategory *categories;
int32 nentries;
MemoryContext oldCtx;
oldCtx = MemoryContextSwitchTo(buildstate->funcCtx);
- entries = extractEntriesSU(buildstate->accum.ginstate, attnum, value, &nentries);
+ entries = ginExtractEntries(buildstate->accum.ginstate, attnum,
+ value, isNull,
+ &nentries, &categories);
MemoryContextSwitchTo(oldCtx);
- if (nentries == 0)
- /* nothing to insert */
- return 0;
+ ginInsertBAEntries(&buildstate->accum, heapptr, attnum,
+ entries, categories, nentries);
- ginInsertRecordBA(&buildstate->accum, heapptr, attnum, entries, nentries);
+ buildstate->indtuples += nentries;
MemoryContextReset(buildstate->funcCtx);
-
- return nentries;
}
static void
@@ -241,24 +346,27 @@ ginBuildCallback(Relation index, HeapTuple htup, Datum *values,
oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx);
for (i = 0; i < buildstate->ginstate.origTupdesc->natts; i++)
- if (!isnull[i])
- buildstate->indtuples += ginHeapTupleBulkInsert(buildstate,
- (OffsetNumber) (i + 1), values[i],
- &htup->t_self);
+ ginHeapTupleBulkInsert(buildstate, (OffsetNumber) (i + 1),
+ values[i], isnull[i],
+ &htup->t_self);
/* If we've maxed out our available memory, dump everything to the index */
if (buildstate->accum.allocatedMemory >= maintenance_work_mem * 1024L)
{
ItemPointerData *list;
- Datum entry;
+ Datum key;
+ GinNullCategory category;
uint32 nlist;
OffsetNumber attnum;
- while ((list = ginGetEntry(&buildstate->accum, &attnum, &entry, &nlist)) != NULL)
+ ginBeginBAScan(&buildstate->accum);
+ while ((list = ginGetBAEntry(&buildstate->accum,
+ &attnum, &key, &category, &nlist)) != NULL)
{
/* there could be many entries, so be willing to abort here */
CHECK_FOR_INTERRUPTS();
- ginEntryInsert(index, &buildstate->ginstate, attnum, entry, list, nlist, TRUE);
+ ginEntryInsert(&buildstate->ginstate, attnum, key, category,
+ list, nlist, &buildstate->buildStats);
}
MemoryContextReset(buildstate->tmpCtx);
@@ -280,7 +388,8 @@ ginbuild(PG_FUNCTION_ARGS)
Buffer RootBuffer,
MetaBuffer;
ItemPointerData *list;
- Datum entry;
+ Datum key;
+ GinNullCategory category;
uint32 nlist;
MemoryContext oldCtx;
OffsetNumber attnum;
@@ -290,6 +399,8 @@ ginbuild(PG_FUNCTION_ARGS)
RelationGetRelationName(index));
initGinState(&buildstate.ginstate, index);
+ buildstate.indtuples = 0;
+ memset(&buildstate.buildStats, 0, sizeof(GinStatsData));
/* initialize the meta page */
MetaBuffer = GinNewBuffer(index);
@@ -303,7 +414,7 @@ ginbuild(PG_FUNCTION_ARGS)
GinInitBuffer(RootBuffer, GIN_LEAF);
MarkBufferDirty(RootBuffer);
- if (!index->rd_istemp)
+ if (RelationNeedsWAL(index))
{
XLogRecPtr recptr;
XLogRecData rdata;
@@ -329,8 +440,8 @@ ginbuild(PG_FUNCTION_ARGS)
UnlockReleaseBuffer(RootBuffer);
END_CRIT_SECTION();
- /* build the index */
- buildstate.indtuples = 0;
+ /* count the root as first entry page */
+ buildstate.buildStats.nEntryPages++;
/*
* create a temporary memory context that is reset once for each tuple
@@ -360,17 +471,26 @@ ginbuild(PG_FUNCTION_ARGS)
/* dump remaining entries to the index */
oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx);
- while ((list = ginGetEntry(&buildstate.accum, &attnum, &entry, &nlist)) != NULL)
+ ginBeginBAScan(&buildstate.accum);
+ while ((list = ginGetBAEntry(&buildstate.accum,
+ &attnum, &key, &category, &nlist)) != NULL)
{
/* there could be many entries, so be willing to abort here */
CHECK_FOR_INTERRUPTS();
- ginEntryInsert(index, &buildstate.ginstate, attnum, entry, list, nlist, TRUE);
+ ginEntryInsert(&buildstate.ginstate, attnum, key, category,
+ list, nlist, &buildstate.buildStats);
}
MemoryContextSwitchTo(oldCtx);
MemoryContextDelete(buildstate.tmpCtx);
/*
+ * Update metapage stats
+ */
+ buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index);
+ ginUpdateStats(index, &buildstate.buildStats);
+
+ /*
* Return statistics
*/
result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
@@ -382,25 +502,66 @@ ginbuild(PG_FUNCTION_ARGS)
}
/*
- * Inserts value during normal insertion
+ * ginbuildempty() -- build an empty gin index in the initialization fork
*/
-static uint32
-ginHeapTupleInsert(Relation index, GinState *ginstate, OffsetNumber attnum, Datum value, ItemPointer item)
+Datum
+ginbuildempty(PG_FUNCTION_ARGS)
+{
+ Relation index = (Relation) PG_GETARG_POINTER(0);
+ Buffer RootBuffer,
+ MetaBuffer;
+
+ /* An empty GIN index has two pages: the metapage and an empty root page. */
+ MetaBuffer =
+ ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
+ LockBuffer(MetaBuffer, BUFFER_LOCK_EXCLUSIVE);
+ RootBuffer =
+ ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
+ LockBuffer(RootBuffer, BUFFER_LOCK_EXCLUSIVE);
+
+ /* Initialize both pages, mark them dirty, and XLOG them. */
+ START_CRIT_SECTION();
+ GinInitMetabuffer(MetaBuffer);
+ MarkBufferDirty(MetaBuffer);
+ GinInitBuffer(RootBuffer, GIN_LEAF);
+ MarkBufferDirty(RootBuffer);
+
+ /* XLOG the new pages */
+ log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
+ BufferGetBlockNumber(MetaBuffer),
+ BufferGetPage(MetaBuffer));
+ log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
+ BufferGetBlockNumber(RootBuffer),
+ BufferGetPage(RootBuffer));
+ END_CRIT_SECTION();
+
+ /* Unlock and release the buffers. */
+ UnlockReleaseBuffer(MetaBuffer);
+ UnlockReleaseBuffer(RootBuffer);
+
+ PG_RETURN_VOID();
+}
+
+/*
+ * Insert index entries for a single indexable item during "normal"
+ * (non-fast-update) insertion
+ */
+static void
+ginHeapTupleInsert(GinState *ginstate, OffsetNumber attnum,
+ Datum value, bool isNull,
+ ItemPointer item)
{
Datum *entries;
+ GinNullCategory *categories;
int32 i,
nentries;
- entries = extractEntriesSU(ginstate, attnum, value, &nentries);
-
- if (nentries == 0)
- /* nothing to insert */
- return 0;
+ entries = ginExtractEntries(ginstate, attnum, value, isNull,
+ &nentries, &categories);
for (i = 0; i < nentries; i++)
- ginEntryInsert(index, ginstate, attnum, entries[i], item, 1, FALSE);
-
- return nentries;
+ ginEntryInsert(ginstate, attnum, entries[i], categories[i],
+ item, 1, NULL);
}
Datum
@@ -435,20 +596,21 @@ gininsert(PG_FUNCTION_ARGS)
GinTupleCollector collector;
memset(&collector, 0, sizeof(GinTupleCollector));
+
for (i = 0; i < ginstate.origTupdesc->natts; i++)
- if (!isnull[i])
- ginHeapTupleFastCollect(index, &ginstate, &collector,
- (OffsetNumber) (i + 1), values[i], ht_ctid);
+ ginHeapTupleFastCollect(&ginstate, &collector,
+ (OffsetNumber) (i + 1),
+ values[i], isnull[i],
+ ht_ctid);
- ginHeapTupleFastInsert(index, &ginstate, &collector);
+ ginHeapTupleFastInsert(&ginstate, &collector);
}
else
{
for (i = 0; i < ginstate.origTupdesc->natts; i++)
- if (!isnull[i])
- ginHeapTupleInsert(index, &ginstate,
- (OffsetNumber) (i + 1), values[i], ht_ctid);
-
+ ginHeapTupleInsert(&ginstate, (OffsetNumber) (i + 1),
+ values[i], isnull[i],
+ ht_ctid);
}
MemoryContextSwitchTo(oldCtx);
diff --git a/src/backend/access/gin/ginscan.c b/src/backend/access/gin/ginscan.c
index 1aa4ea9082..f8d54b1b46 100644
--- a/src/backend/access/gin/ginscan.c
+++ b/src/backend/access/gin/ginscan.c
@@ -1,20 +1,20 @@
/*-------------------------------------------------------------------------
*
* ginscan.c
- * routines to manage scans inverted index relations
+ * routines to manage scans of inverted index relations
*
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/gin/ginscan.c,v 1.26 2010/01/18 11:50:43 teodor Exp $
+ * src/backend/access/gin/ginscan.c
*-------------------------------------------------------------------------
*/
#include "postgres.h"
-#include "access/gin.h"
+#include "access/gin_private.h"
#include "access/relscan.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
@@ -26,160 +26,276 @@ Datum
ginbeginscan(PG_FUNCTION_ARGS)
{
Relation rel = (Relation) PG_GETARG_POINTER(0);
- int keysz = PG_GETARG_INT32(1);
- ScanKey scankey = (ScanKey) PG_GETARG_POINTER(2);
+ int nkeys = PG_GETARG_INT32(1);
+ int norderbys = PG_GETARG_INT32(2);
IndexScanDesc scan;
+ GinScanOpaque so;
+
+ /* no order by operators allowed */
+ Assert(norderbys == 0);
+
+ scan = RelationGetIndexScan(rel, nkeys, norderbys);
- scan = RelationGetIndexScan(rel, keysz, scankey);
+ /* allocate private workspace */
+ so = (GinScanOpaque) palloc(sizeof(GinScanOpaqueData));
+ so->keys = NULL;
+ so->nkeys = 0;
+ so->tempCtx = AllocSetContextCreate(CurrentMemoryContext,
+ "Gin scan temporary context",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+ initGinState(&so->ginstate, scan->indexRelation);
+
+ scan->opaque = so;
PG_RETURN_POINTER(scan);
}
-static void
-fillScanKey(GinState *ginstate, GinScanKey key, OffsetNumber attnum, Datum query,
- Datum *entryValues, bool *partial_matches, uint32 nEntryValues,
- StrategyNumber strategy, Pointer *extra_data)
+/*
+ * Create a new GinScanEntry, unless an equivalent one already exists,
+ * in which case just return it
+ */
+static GinScanEntry
+ginFillScanEntry(GinScanOpaque so, OffsetNumber attnum,
+ StrategyNumber strategy, int32 searchMode,
+ Datum queryKey, GinNullCategory queryCategory,
+ bool isPartialMatch, Pointer extra_data)
{
- uint32 i,
- j;
-
- key->nentries = nEntryValues;
- key->entryRes = (bool *) palloc0(sizeof(bool) * nEntryValues);
- key->scanEntry = (GinScanEntry) palloc(sizeof(GinScanEntryData) * nEntryValues);
- key->strategy = strategy;
- key->attnum = attnum;
- key->extra_data = extra_data;
- key->query = query;
- key->firstCall = TRUE;
- ItemPointerSet(&(key->curItem), InvalidBlockNumber, InvalidOffsetNumber);
-
- for (i = 0; i < nEntryValues; i++)
+ GinState *ginstate = &so->ginstate;
+ GinScanEntry scanEntry;
+ uint32 i;
+
+ /*
+ * Look for an existing equivalent entry.
+ *
+ * Entries with non-null extra_data are never considered identical, since
+ * we can't know exactly what the opclass might be doing with that.
+ */
+ if (extra_data == NULL)
{
- key->scanEntry[i].pval = key->entryRes + i;
- key->scanEntry[i].entry = entryValues[i];
- key->scanEntry[i].attnum = attnum;
- key->scanEntry[i].extra_data = (extra_data) ? extra_data[i] : NULL;
- ItemPointerSet(&(key->scanEntry[i].curItem), InvalidBlockNumber, InvalidOffsetNumber);
- key->scanEntry[i].offset = InvalidOffsetNumber;
- key->scanEntry[i].buffer = InvalidBuffer;
- key->scanEntry[i].partialMatch = NULL;
- key->scanEntry[i].partialMatchIterator = NULL;
- key->scanEntry[i].partialMatchResult = NULL;
- key->scanEntry[i].strategy = strategy;
- key->scanEntry[i].list = NULL;
- key->scanEntry[i].nlist = 0;
- key->scanEntry[i].isPartialMatch = (ginstate->canPartialMatch[attnum - 1] && partial_matches)
- ? partial_matches[i] : false;
-
- /* link to the equals entry in current scan key */
- key->scanEntry[i].master = NULL;
- for (j = 0; j < i; j++)
- if (compareEntries(ginstate, attnum, entryValues[i], entryValues[j]) == 0 &&
- key->scanEntry[i].isPartialMatch == key->scanEntry[j].isPartialMatch &&
- key->scanEntry[i].strategy == key->scanEntry[j].strategy)
+ for (i = 0; i < so->totalentries; i++)
+ {
+ GinScanEntry prevEntry = so->entries[i];
+
+ if (prevEntry->extra_data == NULL &&
+ prevEntry->isPartialMatch == isPartialMatch &&
+ prevEntry->strategy == strategy &&
+ prevEntry->searchMode == searchMode &&
+ prevEntry->attnum == attnum &&
+ ginCompareEntries(ginstate, attnum,
+ prevEntry->queryKey,
+ prevEntry->queryCategory,
+ queryKey,
+ queryCategory) == 0)
{
- key->scanEntry[i].master = key->scanEntry + j;
- break;
+ /* Successful match */
+ return prevEntry;
}
+ }
}
-}
-#ifdef NOT_USED
+ /* Nope, create a new entry */
+ scanEntry = (GinScanEntry) palloc(sizeof(GinScanEntryData));
+ scanEntry->queryKey = queryKey;
+ scanEntry->queryCategory = queryCategory;
+ scanEntry->isPartialMatch = isPartialMatch;
+ scanEntry->extra_data = extra_data;
+ scanEntry->strategy = strategy;
+ scanEntry->searchMode = searchMode;
+ scanEntry->attnum = attnum;
+
+ scanEntry->buffer = InvalidBuffer;
+ ItemPointerSetMin(&scanEntry->curItem);
+ scanEntry->matchBitmap = NULL;
+ scanEntry->matchIterator = NULL;
+ scanEntry->matchResult = NULL;
+ scanEntry->list = NULL;
+ scanEntry->nlist = 0;
+ scanEntry->offset = InvalidOffsetNumber;
+ scanEntry->isFinished = false;
+ scanEntry->reduceResult = false;
+
+ /* Add it to so's array */
+ if (so->totalentries >= so->allocentries)
+ {
+ so->allocentries *= 2;
+ so->entries = (GinScanEntry *)
+ repalloc(so->entries, so->allocentries * sizeof(GinScanEntry));
+ }
+ so->entries[so->totalentries++] = scanEntry;
+
+ return scanEntry;
+}
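/*
 * Editor's note -- an illustrative example, not part of this patch: with the
 * sharing above, a query whose scan keys extract the same entry (say,
 * arr @> '{a,b}' AND arr @> '{b,c}' on one GIN-indexed array column, where
 * both keys extract 'b' with the same strategy and no extra_data) ends up
 * with a single GinScanEntry for 'b' in so->entries, so its posting data is
 * scanned only once.
 */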
+/*
+ * Initialize the next GinScanKey using the output from the extractQueryFn
+ */
static void
-resetScanKeys(GinScanKey keys, uint32 nkeys)
+ginFillScanKey(GinScanOpaque so, OffsetNumber attnum,
+ StrategyNumber strategy, int32 searchMode,
+ Datum query, uint32 nQueryValues,
+ Datum *queryValues, GinNullCategory *queryCategories,
+ bool *partial_matches, Pointer *extra_data)
{
- uint32 i,
- j;
+ GinScanKey key = &(so->keys[so->nkeys++]);
+ GinState *ginstate = &so->ginstate;
+ uint32 nUserQueryValues = nQueryValues;
+ uint32 i;
- if (keys == NULL)
- return;
+ /* Non-default search modes add one "hidden" entry to each key */
+ if (searchMode != GIN_SEARCH_MODE_DEFAULT)
+ nQueryValues++;
+ key->nentries = nQueryValues;
+ key->nuserentries = nUserQueryValues;
- for (i = 0; i < nkeys; i++)
- {
- GinScanKey key = keys + i;
+ key->scanEntry = (GinScanEntry *) palloc(sizeof(GinScanEntry) * nQueryValues);
+ key->entryRes = (bool *) palloc0(sizeof(bool) * nQueryValues);
- key->firstCall = TRUE;
- ItemPointerSet(&(key->curItem), InvalidBlockNumber, InvalidOffsetNumber);
+ key->query = query;
+ key->queryValues = queryValues;
+ key->queryCategories = queryCategories;
+ key->extra_data = extra_data;
+ key->strategy = strategy;
+ key->searchMode = searchMode;
+ key->attnum = attnum;
+
+ ItemPointerSetMin(&key->curItem);
+ key->curItemMatches = false;
+ key->recheckCurItem = false;
+ key->isFinished = false;
- for (j = 0; j < key->nentries; j++)
+ for (i = 0; i < nQueryValues; i++)
+ {
+ Datum queryKey;
+ GinNullCategory queryCategory;
+ bool isPartialMatch;
+ Pointer this_extra;
+
+ if (i < nUserQueryValues)
{
- if (key->scanEntry[j].buffer != InvalidBuffer)
- ReleaseBuffer(key->scanEntry[i].buffer);
-
- ItemPointerSet(&(key->scanEntry[j].curItem), InvalidBlockNumber, InvalidOffsetNumber);
- key->scanEntry[j].offset = InvalidOffsetNumber;
- key->scanEntry[j].buffer = InvalidBuffer;
- key->scanEntry[j].list = NULL;
- key->scanEntry[j].nlist = 0;
- key->scanEntry[j].partialMatch = NULL;
- key->scanEntry[j].partialMatchIterator = NULL;
- key->scanEntry[j].partialMatchResult = NULL;
+ /* set up normal entry using extractQueryFn's outputs */
+ queryKey = queryValues[i];
+ queryCategory = queryCategories[i];
+ isPartialMatch =
+ (ginstate->canPartialMatch[attnum - 1] && partial_matches)
+ ? partial_matches[i] : false;
+ this_extra = (extra_data) ? extra_data[i] : NULL;
}
+ else
+ {
+ /* set up hidden entry */
+ queryKey = (Datum) 0;
+ switch (searchMode)
+ {
+ case GIN_SEARCH_MODE_INCLUDE_EMPTY:
+ queryCategory = GIN_CAT_EMPTY_ITEM;
+ break;
+ case GIN_SEARCH_MODE_ALL:
+ queryCategory = GIN_CAT_EMPTY_QUERY;
+ break;
+ case GIN_SEARCH_MODE_EVERYTHING:
+ queryCategory = GIN_CAT_EMPTY_QUERY;
+ break;
+ default:
+ elog(ERROR, "unexpected searchMode: %d", searchMode);
+ queryCategory = 0; /* keep compiler quiet */
+ break;
+ }
+ isPartialMatch = false;
+ this_extra = NULL;
+
+ /*
+ * We set the strategy to a fixed value so that ginFillScanEntry
+ * can combine these entries for different scan keys. This is
+ * safe because the strategy value in the entry struct is only
+ * used for partial-match cases. It's OK to overwrite our local
+ * variable here because this is the last loop iteration.
+ */
+ strategy = InvalidStrategy;
+ }
+
+ key->scanEntry[i] = ginFillScanEntry(so, attnum,
+ strategy, searchMode,
+ queryKey, queryCategory,
+ isPartialMatch, this_extra);
}
}
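/*
 * Editor's note -- illustrative sketch, not part of this patch: a
 * hypothetical opclass extractQuery function selects a non-default search
 * mode through its seventh argument, e.g.
 *
 *     int32  *searchMode = (int32 *) PG_GETARG_POINTER(6);
 *
 *     *searchMode = GIN_SEARCH_MODE_INCLUDE_EMPTY;
 *
 * ginFillScanKey above then appends one "hidden" entry to the key; for
 * GIN_SEARCH_MODE_INCLUDE_EMPTY that entry carries GIN_CAT_EMPTY_ITEM, so
 * zero-key (empty) indexed items are also fetched as candidate matches.
 */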
-#endif
static void
-freeScanKeys(GinScanKey keys, uint32 nkeys)
+freeScanKeys(GinScanOpaque so)
{
- uint32 i,
- j;
+ uint32 i;
- if (keys == NULL)
+ if (so->keys == NULL)
return;
- for (i = 0; i < nkeys; i++)
+ for (i = 0; i < so->nkeys; i++)
{
- GinScanKey key = keys + i;
-
- for (j = 0; j < key->nentries; j++)
- {
- if (key->scanEntry[j].buffer != InvalidBuffer)
- ReleaseBuffer(key->scanEntry[j].buffer);
- if (key->scanEntry[j].list)
- pfree(key->scanEntry[j].list);
- if (key->scanEntry[j].partialMatchIterator)
- tbm_end_iterate(key->scanEntry[j].partialMatchIterator);
- if (key->scanEntry[j].partialMatch)
- tbm_free(key->scanEntry[j].partialMatch);
- }
+ GinScanKey key = so->keys + i;
- pfree(key->entryRes);
pfree(key->scanEntry);
+ pfree(key->entryRes);
}
- pfree(keys);
+ pfree(so->keys);
+ so->keys = NULL;
+ so->nkeys = 0;
+
+ for (i = 0; i < so->totalentries; i++)
+ {
+ GinScanEntry entry = so->entries[i];
+
+ if (entry->buffer != InvalidBuffer)
+ ReleaseBuffer(entry->buffer);
+ if (entry->list)
+ pfree(entry->list);
+ if (entry->matchIterator)
+ tbm_end_iterate(entry->matchIterator);
+ if (entry->matchBitmap)
+ tbm_free(entry->matchBitmap);
+ pfree(entry);
+ }
+
+ pfree(so->entries);
+ so->entries = NULL;
+ so->totalentries = 0;
}
void
-newScanKey(IndexScanDesc scan)
+ginNewScanKey(IndexScanDesc scan)
{
ScanKey scankey = scan->keyData;
GinScanOpaque so = (GinScanOpaque) scan->opaque;
int i;
- uint32 nkeys = 0;
+ bool hasNullQuery = false;
- if (scan->numberOfKeys < 1)
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("GIN indexes do not support whole-index scans")));
+ /* if no scan keys provided, allocate extra EVERYTHING GinScanKey */
+ so->keys = (GinScanKey)
+ palloc(Max(scan->numberOfKeys, 1) * sizeof(GinScanKeyData));
+ so->nkeys = 0;
- so->keys = (GinScanKey) palloc(scan->numberOfKeys * sizeof(GinScanKeyData));
+ /* initialize expansible array of GinScanEntry pointers */
+ so->totalentries = 0;
+ so->allocentries = 32;
+ so->entries = (GinScanEntry *)
+ palloc0(so->allocentries * sizeof(GinScanEntry));
so->isVoidRes = false;
for (i = 0; i < scan->numberOfKeys; i++)
{
ScanKey skey = &scankey[i];
- Datum *entryValues;
- int32 nEntryValues = 0;
+ Datum *queryValues;
+ int32 nQueryValues = 0;
bool *partial_matches = NULL;
Pointer *extra_data = NULL;
+ bool *nullFlags = NULL;
+ int32 searchMode = GIN_SEARCH_MODE_DEFAULT;
/*
- * Assume, that GIN-indexable operators are strict, so nothing could
- * be found
+ * We assume that GIN-indexable operators are strict, so a null query
+ * argument means an unsatisfiable query.
*/
if (skey->sk_flags & SK_ISNULL)
{
@@ -187,48 +303,105 @@ newScanKey(IndexScanDesc scan)
break;
}
- entryValues = (Datum *)
- DatumGetPointer(FunctionCall5(&so->ginstate.extractQueryFn[skey->sk_attno - 1],
- skey->sk_argument,
- PointerGetDatum(&nEntryValues),
- UInt16GetDatum(skey->sk_strategy),
- PointerGetDatum(&partial_matches),
- PointerGetDatum(&extra_data)));
+ /* OK to call the extractQueryFn */
+ queryValues = (Datum *)
+ DatumGetPointer(FunctionCall7Coll(&so->ginstate.extractQueryFn[skey->sk_attno - 1],
+ so->ginstate.supportCollation[skey->sk_attno - 1],
+ skey->sk_argument,
+ PointerGetDatum(&nQueryValues),
+ UInt16GetDatum(skey->sk_strategy),
+ PointerGetDatum(&partial_matches),
+ PointerGetDatum(&extra_data),
+ PointerGetDatum(&nullFlags),
+ PointerGetDatum(&searchMode)));
+
+ /*
+ * If bogus searchMode is returned, treat as GIN_SEARCH_MODE_ALL; note
+ * in particular we don't allow extractQueryFn to select
+ * GIN_SEARCH_MODE_EVERYTHING.
+ */
+ if (searchMode < GIN_SEARCH_MODE_DEFAULT ||
+ searchMode > GIN_SEARCH_MODE_ALL)
+ searchMode = GIN_SEARCH_MODE_ALL;
+
+ /* Non-default modes require the index to have placeholders */
+ if (searchMode != GIN_SEARCH_MODE_DEFAULT)
+ hasNullQuery = true;
- if (nEntryValues < 0)
+ /*
+ * In default mode, no keys means an unsatisfiable query.
+ */
+ if (queryValues == NULL || nQueryValues <= 0)
{
- /*
- * extractQueryFn signals that nothing can match, so we can just
- * set isVoidRes flag. No need to examine any more keys.
- */
- so->isVoidRes = true;
- break;
+ if (searchMode == GIN_SEARCH_MODE_DEFAULT)
+ {
+ so->isVoidRes = true;
+ break;
+ }
+ nQueryValues = 0; /* ensure sane value */
}
- if (entryValues == NULL || nEntryValues == 0)
+ /*
+ * If the extractQueryFn didn't create a nullFlags array, create one,
+ * assuming that everything's non-null. Otherwise, run through the
+ * array and make sure each value is exactly 0 or 1; this ensures
+ * binary compatibility with the GinNullCategory representation. While
+ * at it, detect whether any null keys are present.
+ */
+ if (nullFlags == NULL)
+ nullFlags = (bool *) palloc0(nQueryValues * sizeof(bool));
+ else
{
- /*
- * extractQueryFn signals that everything matches. This would
- * require a full scan, which we can't do, but perhaps there is
- * another scankey that provides a restriction to use. So we keep
- * going and check only at the end.
- */
- continue;
+ int32 j;
+
+ for (j = 0; j < nQueryValues; j++)
+ {
+ if (nullFlags[j])
+ {
+ nullFlags[j] = true; /* not any other nonzero value */
+ hasNullQuery = true;
+ }
+ }
}
+ /* now we can use the nullFlags as category codes */
- fillScanKey(&so->ginstate, &(so->keys[nkeys]),
- skey->sk_attno, skey->sk_argument,
- entryValues, partial_matches, nEntryValues,
- skey->sk_strategy, extra_data);
- nkeys++;
+ ginFillScanKey(so, skey->sk_attno,
+ skey->sk_strategy, searchMode,
+ skey->sk_argument, nQueryValues,
+ queryValues, (GinNullCategory *) nullFlags,
+ partial_matches, extra_data);
}
- if (nkeys == 0 && !so->isVoidRes)
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("GIN indexes do not support whole-index scans")));
+ /*
+ * If there are no regular scan keys, generate an EVERYTHING scankey to
+ * drive a full-index scan.
+ */
+ if (so->nkeys == 0 && !so->isVoidRes)
+ {
+ hasNullQuery = true;
+ ginFillScanKey(so, FirstOffsetNumber,
+ InvalidStrategy, GIN_SEARCH_MODE_EVERYTHING,
+ (Datum) 0, 0,
+ NULL, NULL, NULL, NULL);
+ }
- so->nkeys = nkeys;
+ /*
+ * If the index is version 0, it may be missing null and placeholder
+ * entries, which would render searches for nulls and full-index scans
+ * unreliable. Throw an error if so.
+ */
+ if (hasNullQuery && !so->isVoidRes)
+ {
+ GinStatsData ginStats;
+
+ ginGetStats(scan->indexRelation, &ginStats);
+ if (ginStats.ginVersion < 1)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("old GIN indexes do not support whole-index scans nor searches for nulls"),
+ errhint("To fix this, do REINDEX INDEX \"%s\".",
+ RelationGetRelationName(scan->indexRelation))));
+ }
pgstat_count_index_scan(scan->indexRelation);
}
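/*
 * Editor's note on the (GinNullCategory *) cast above -- the category values
 * are assumed from gin_private.h, which is not shown in this excerpt: the
 * first two GinNullCategory codes are chosen to line up with bool, i.e.
 *
 *     false == 0 == GIN_CAT_NORM_KEY
 *     true  == 1 == GIN_CAT_NULL_KEY
 *
 * so a bool array normalized to contain only 0 and 1 (as done above) can be
 * reinterpreted directly as a GinNullCategory array.
 */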
@@ -238,28 +411,11 @@ ginrescan(PG_FUNCTION_ARGS)
{
IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
ScanKey scankey = (ScanKey) PG_GETARG_POINTER(1);
- GinScanOpaque so;
-
- so = (GinScanOpaque) scan->opaque;
- if (so == NULL)
- {
- /* if called from ginbeginscan */
- so = (GinScanOpaque) palloc(sizeof(GinScanOpaqueData));
- so->tempCtx = AllocSetContextCreate(CurrentMemoryContext,
- "Gin scan temporary context",
- ALLOCSET_DEFAULT_MINSIZE,
- ALLOCSET_DEFAULT_INITSIZE,
- ALLOCSET_DEFAULT_MAXSIZE);
- initGinState(&so->ginstate, scan->indexRelation);
- scan->opaque = so;
- }
- else
- {
- freeScanKeys(so->keys, so->nkeys);
- }
+ /* remaining arguments are ignored */
+ GinScanOpaque so = (GinScanOpaque) scan->opaque;
- so->keys = NULL;
+ freeScanKeys(so);
if (scankey && scan->numberOfKeys > 0)
{
@@ -277,14 +433,11 @@ ginendscan(PG_FUNCTION_ARGS)
IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
GinScanOpaque so = (GinScanOpaque) scan->opaque;
- if (so != NULL)
- {
- freeScanKeys(so->keys, so->nkeys);
+ freeScanKeys(so);
- MemoryContextDelete(so->tempCtx);
+ MemoryContextDelete(so->tempCtx);
- pfree(so);
- }
+ pfree(so);
PG_RETURN_VOID();
}
diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c
index 2a75820e22..ba142bc874 100644
--- a/src/backend/access/gin/ginutil.c
+++ b/src/backend/access/gin/ginutil.c
@@ -4,44 +4,61 @@
* utility routines for the postgres inverted index access method.
*
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/gin/ginutil.c,v 1.23 2010/01/02 16:57:33 momjian Exp $
+ * src/backend/access/gin/ginutil.c
*-------------------------------------------------------------------------
*/
#include "postgres.h"
-#include "access/genam.h"
-#include "access/gin.h"
+
+#include "access/gin_private.h"
#include "access/reloptions.h"
+#include "catalog/pg_collation.h"
#include "catalog/pg_type.h"
+#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
+
+/*
+ * initGinState: fill in an empty GinState struct to describe the index
+ *
+ * Note: assorted subsidiary data is allocated in the CurrentMemoryContext.
+ */
void
initGinState(GinState *state, Relation index)
{
+ TupleDesc origTupdesc = RelationGetDescr(index);
int i;
- state->origTupdesc = index->rd_att;
+ MemSet(state, 0, sizeof(GinState));
- state->oneCol = (index->rd_att->natts == 1) ? true : false;
+ state->index = index;
+ state->oneCol = (origTupdesc->natts == 1) ? true : false;
+ state->origTupdesc = origTupdesc;
- for (i = 0; i < index->rd_att->natts; i++)
+ for (i = 0; i < origTupdesc->natts; i++)
{
- state->tupdesc[i] = CreateTemplateTupleDesc(2, false);
-
- TupleDescInitEntry(state->tupdesc[i], (AttrNumber) 1, NULL,
- INT2OID, -1, 0);
- TupleDescInitEntry(state->tupdesc[i], (AttrNumber) 2, NULL,
- index->rd_att->attrs[i]->atttypid,
- index->rd_att->attrs[i]->atttypmod,
- index->rd_att->attrs[i]->attndims
- );
+ if (state->oneCol)
+ state->tupdesc[i] = state->origTupdesc;
+ else
+ {
+ state->tupdesc[i] = CreateTemplateTupleDesc(2, false);
+
+ TupleDescInitEntry(state->tupdesc[i], (AttrNumber) 1, NULL,
+ INT2OID, -1, 0);
+ TupleDescInitEntry(state->tupdesc[i], (AttrNumber) 2, NULL,
+ origTupdesc->attrs[i]->atttypid,
+ origTupdesc->attrs[i]->atttypmod,
+ origTupdesc->attrs[i]->attndims);
+ TupleDescInitEntryCollation(state->tupdesc[i], (AttrNumber) 2,
+ origTupdesc->attrs[i]->attcollation);
+ }
fmgr_info_copy(&(state->compareFn[i]),
index_getprocinfo(index, i + 1, GIN_COMPARE_PROC),
@@ -64,13 +81,29 @@ initGinState(GinState *state, Relation index)
fmgr_info_copy(&(state->comparePartialFn[i]),
index_getprocinfo(index, i + 1, GIN_COMPARE_PARTIAL_PROC),
CurrentMemoryContext);
-
state->canPartialMatch[i] = true;
}
else
{
state->canPartialMatch[i] = false;
}
+
+ /*
+ * If the index column has a specified collation, we should honor that
+ * while doing comparisons. However, we may have a collatable storage
+ * type for a noncollatable indexed data type (for instance, hstore
+ * uses text index entries). If there's no index collation then
+ * specify default collation in case the support functions need
+ * collation. This is harmless if the support functions don't care
+ * about collation, so we just do it unconditionally. (We could
+ * alternatively call get_typcollation, but that seems like expensive
+ * overkill --- there aren't going to be any cases where a GIN storage
+ * type has a nondefault collation.)
+ */
+ if (OidIsValid(index->rd_indcollation[i]))
+ state->supportCollation[i] = index->rd_indcollation[i];
+ else
+ state->supportCollation[i] = DEFAULT_COLLATION_OID;
}
}
@@ -80,9 +113,14 @@ initGinState(GinState *state, Relation index)
OffsetNumber
gintuple_get_attrnum(GinState *ginstate, IndexTuple tuple)
{
- OffsetNumber colN = FirstOffsetNumber;
+ OffsetNumber colN;
- if (!ginstate->oneCol)
+ if (ginstate->oneCol)
+ {
+ /* column number is not stored explicitly */
+ colN = FirstOffsetNumber;
+ }
+ else
{
Datum res;
bool isnull;
@@ -103,13 +141,14 @@ gintuple_get_attrnum(GinState *ginstate, IndexTuple tuple)
}
/*
- * Extract stored datum from GIN tuple
+ * Extract stored datum (and possible null category) from GIN tuple
*/
Datum
-gin_index_getattr(GinState *ginstate, IndexTuple tuple)
+gintuple_get_key(GinState *ginstate, IndexTuple tuple,
+ GinNullCategory *category)
{
- bool isnull;
Datum res;
+ bool isnull;
if (ginstate->oneCol)
{
@@ -132,7 +171,10 @@ gin_index_getattr(GinState *ginstate, IndexTuple tuple)
&isnull);
}
- Assert(!isnull);
+ if (isnull)
+ *category = GinGetNullCategory(tuple, ginstate);
+ else
+ *category = GIN_CAT_NORM_KEY;
return res;
}
@@ -142,7 +184,6 @@ gin_index_getattr(GinState *ginstate, IndexTuple tuple)
* The returned buffer is already pinned and exclusive-locked
* Caller is responsible for initializing the page by calling GinInitBuffer
*/
-
Buffer
GinNewBuffer(Relation index)
{
@@ -227,100 +268,227 @@ GinInitMetabuffer(Buffer b)
metadata->tailFreeSize = 0;
metadata->nPendingPages = 0;
metadata->nPendingHeapTuples = 0;
+ metadata->nTotalPages = 0;
+ metadata->nEntryPages = 0;
+ metadata->nDataPages = 0;
+ metadata->nEntries = 0;
+ metadata->ginVersion = GIN_CURRENT_VERSION;
}
+/*
+ * Compare two keys of the same index column
+ */
int
-compareEntries(GinState *ginstate, OffsetNumber attnum, Datum a, Datum b)
+ginCompareEntries(GinState *ginstate, OffsetNumber attnum,
+ Datum a, GinNullCategory categorya,
+ Datum b, GinNullCategory categoryb)
{
- return DatumGetInt32(
- FunctionCall2(
- &ginstate->compareFn[attnum - 1],
- a, b
- )
- );
+ /* if not of same null category, sort by that first */
+ if (categorya != categoryb)
+ return (categorya < categoryb) ? -1 : 1;
+
+ /* all null items in same category are equal */
+ if (categorya != GIN_CAT_NORM_KEY)
+ return 0;
+
+ /* both not null, so safe to call the compareFn */
+ return DatumGetInt32(FunctionCall2Coll(&ginstate->compareFn[attnum - 1],
+ ginstate->supportCollation[attnum - 1],
+ a, b));
}
+/*
+ * Compare two keys of possibly different index columns
+ */
int
-compareAttEntries(GinState *ginstate, OffsetNumber attnum_a, Datum a,
- OffsetNumber attnum_b, Datum b)
+ginCompareAttEntries(GinState *ginstate,
+ OffsetNumber attnuma, Datum a, GinNullCategory categorya,
+ OffsetNumber attnumb, Datum b, GinNullCategory categoryb)
{
- if (attnum_a == attnum_b)
- return compareEntries(ginstate, attnum_a, a, b);
+ /* attribute number is the first sort key */
+ if (attnuma != attnumb)
+ return (attnuma < attnumb) ? -1 : 1;
- return (attnum_a < attnum_b) ? -1 : 1;
+ return ginCompareEntries(ginstate, attnuma, a, categorya, b, categoryb);
}
+
+/*
+ * Support for sorting key datums in ginExtractEntries
+ *
+ * Note: we only have to worry about null and not-null keys here;
+ * ginExtractEntries never generates more than one placeholder null,
+ * so it doesn't have to sort those.
+ */
+typedef struct
+{
+ Datum datum;
+ bool isnull;
+} keyEntryData;
+
typedef struct
{
FmgrInfo *cmpDatumFunc;
- bool *needUnique;
-} cmpEntriesData;
+ Oid collation;
+ bool haveDups;
+} cmpEntriesArg;
static int
-cmpEntries(const Datum *a, const Datum *b, cmpEntriesData *arg)
+cmpEntries(const void *a, const void *b, void *arg)
{
- int res = DatumGetInt32(FunctionCall2(arg->cmpDatumFunc,
- *a, *b));
+ const keyEntryData *aa = (const keyEntryData *) a;
+ const keyEntryData *bb = (const keyEntryData *) b;
+ cmpEntriesArg *data = (cmpEntriesArg *) arg;
+ int res;
+ if (aa->isnull)
+ {
+ if (bb->isnull)
+ res = 0; /* NULL "=" NULL */
+ else
+ res = 1; /* NULL ">" not-NULL */
+ }
+ else if (bb->isnull)
+ res = -1; /* not-NULL "<" NULL */
+ else
+ res = DatumGetInt32(FunctionCall2Coll(data->cmpDatumFunc,
+ data->collation,
+ aa->datum, bb->datum));
+
+ /*
+ * Detect if we have any duplicates. If there are equal keys, qsort must
+ * compare them at some point, else it wouldn't know whether one should go
+ * before or after the other.
+ */
if (res == 0)
- *(arg->needUnique) = TRUE;
+ data->haveDups = true;
return res;
}
+
+/*
+ * Extract the index key values from an indexable item
+ *
+ * The resulting key values are sorted, and any duplicates are removed.
+ * This avoids generating redundant index entries.
+ */
Datum *
-extractEntriesS(GinState *ginstate, OffsetNumber attnum, Datum value, int32 *nentries,
- bool *needUnique)
+ginExtractEntries(GinState *ginstate, OffsetNumber attnum,
+ Datum value, bool isNull,
+ int32 *nentries, GinNullCategory **categories)
{
Datum *entries;
-
- entries = (Datum *) DatumGetPointer(FunctionCall2(
- &ginstate->extractValueFn[attnum - 1],
- value,
- PointerGetDatum(nentries)
- ));
-
- if (entries == NULL)
- *nentries = 0;
-
- *needUnique = FALSE;
- if (*nentries > 1)
+ bool *nullFlags;
+ int32 i;
+
+ /*
+ * We don't call the extractValueFn on a null item. Instead generate a
+ * placeholder.
+ */
+ if (isNull)
{
- cmpEntriesData arg;
-
- arg.cmpDatumFunc = &ginstate->compareFn[attnum - 1];
- arg.needUnique = needUnique;
- qsort_arg(entries, *nentries, sizeof(Datum),
- (qsort_arg_comparator) cmpEntries, (void *) &arg);
+ *nentries = 1;
+ entries = (Datum *) palloc(sizeof(Datum));
+ entries[0] = (Datum) 0;
+ *categories = (GinNullCategory *) palloc(sizeof(GinNullCategory));
+ (*categories)[0] = GIN_CAT_NULL_ITEM;
+ return entries;
}
- return entries;
-}
-
-
-Datum *
-extractEntriesSU(GinState *ginstate, OffsetNumber attnum, Datum value, int32 *nentries)
-{
- bool needUnique;
- Datum *entries = extractEntriesS(ginstate, attnum, value, nentries,
- &needUnique);
+ /* OK, call the opclass's extractValueFn */
+ nullFlags = NULL; /* in case extractValue doesn't set it */
+ entries = (Datum *)
+ DatumGetPointer(FunctionCall3Coll(&ginstate->extractValueFn[attnum - 1],
+ ginstate->supportCollation[attnum - 1],
+ value,
+ PointerGetDatum(nentries),
+ PointerGetDatum(&nullFlags)));
+
+ /*
+ * Generate a placeholder if the item contained no keys.
+ */
+ if (entries == NULL || *nentries <= 0)
+ {
+ *nentries = 1;
+ entries = (Datum *) palloc(sizeof(Datum));
+ entries[0] = (Datum) 0;
+ *categories = (GinNullCategory *) palloc(sizeof(GinNullCategory));
+ (*categories)[0] = GIN_CAT_EMPTY_ITEM;
+ return entries;
+ }
- if (needUnique)
+ /*
+ * If the extractValueFn didn't create a nullFlags array, create one,
+ * assuming that everything's non-null. Otherwise, run through the array
+ * and make sure each value is exactly 0 or 1; this ensures binary
+ * compatibility with the GinNullCategory representation.
+ */
+ if (nullFlags == NULL)
+ nullFlags = (bool *) palloc0(*nentries * sizeof(bool));
+ else
+ {
+ for (i = 0; i < *nentries; i++)
+ nullFlags[i] = (nullFlags[i] ? true : false);
+ }
+ /* now we can use the nullFlags as category codes */
+ *categories = (GinNullCategory *) nullFlags;
+
+ /*
+ * If there's more than one key, sort and unique-ify.
+ *
+ * XXX Using qsort here is notationally painful, and the overhead is
+ * pretty bad too. For small numbers of keys it'd likely be better to use
+ * a simple insertion sort.
+ */
+ if (*nentries > 1)
{
- Datum *ptr,
- *res;
+ keyEntryData *keydata;
+ cmpEntriesArg arg;
- ptr = res = entries;
+ keydata = (keyEntryData *) palloc(*nentries * sizeof(keyEntryData));
+ for (i = 0; i < *nentries; i++)
+ {
+ keydata[i].datum = entries[i];
+ keydata[i].isnull = nullFlags[i];
+ }
+
+ arg.cmpDatumFunc = &ginstate->compareFn[attnum - 1];
+ arg.collation = ginstate->supportCollation[attnum - 1];
+ arg.haveDups = false;
+ qsort_arg(keydata, *nentries, sizeof(keyEntryData),
+ cmpEntries, (void *) &arg);
- while (ptr - entries < *nentries)
+ if (arg.haveDups)
+ {
+ /* there are duplicates, must get rid of 'em */
+ int32 j;
+
+ entries[0] = keydata[0].datum;
+ nullFlags[0] = keydata[0].isnull;
+ j = 1;
+ for (i = 1; i < *nentries; i++)
+ {
+ if (cmpEntries(&keydata[i - 1], &keydata[i], &arg) != 0)
+ {
+ entries[j] = keydata[i].datum;
+ nullFlags[j] = keydata[i].isnull;
+ j++;
+ }
+ }
+ *nentries = j;
+ }
+ else
{
- if (compareEntries(ginstate, attnum, *ptr, *res) != 0)
- *(++res) = *ptr++;
- else
- ptr++;
+ /* easy, no duplicates */
+ for (i = 0; i < *nentries; i++)
+ {
+ entries[i] = keydata[i].datum;
+ nullFlags[i] = keydata[i].isnull;
+ }
}
- *nentries = res + 1 - entries;
+ pfree(keydata);
}
return entries;
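/*
 * Editor's sketch -- a hypothetical caller, not code from this patch: the
 * null-aware signature is used roughly like this when indexing one column
 * of a heap value:
 *
 *     Datum           *entries;
 *     GinNullCategory *categories;
 *     int32            nentries;
 *
 *     entries = ginExtractEntries(ginstate, attnum, value, isNull,
 *                                 &nentries, &categories);
 *     for (i = 0; i < nentries; i++)
 *         ... store entries[i] together with categories[i] ...
 *
 * Even a null or key-less item yields exactly one placeholder entry, so
 * callers no longer need a separate path for "nothing to index".
 */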
@@ -354,3 +522,83 @@ ginoptions(PG_FUNCTION_ARGS)
PG_RETURN_BYTEA_P(rdopts);
}
+
+/*
+ * Fetch index's statistical data into *stats
+ *
+ * Note: in the result, nPendingPages can be trusted to be up-to-date,
+ * as can ginVersion; but the other fields are as of the last VACUUM.
+ */
+void
+ginGetStats(Relation index, GinStatsData *stats)
+{
+ Buffer metabuffer;
+ Page metapage;
+ GinMetaPageData *metadata;
+
+ metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO);
+ LockBuffer(metabuffer, GIN_SHARE);
+ metapage = BufferGetPage(metabuffer);
+ metadata = GinPageGetMeta(metapage);
+
+ stats->nPendingPages = metadata->nPendingPages;
+ stats->nTotalPages = metadata->nTotalPages;
+ stats->nEntryPages = metadata->nEntryPages;
+ stats->nDataPages = metadata->nDataPages;
+ stats->nEntries = metadata->nEntries;
+ stats->ginVersion = metadata->ginVersion;
+
+ UnlockReleaseBuffer(metabuffer);
+}
+
+/*
+ * Write the given statistics to the index's metapage
+ *
+ * Note: nPendingPages and ginVersion are *not* copied over
+ */
+void
+ginUpdateStats(Relation index, const GinStatsData *stats)
+{
+ Buffer metabuffer;
+ Page metapage;
+ GinMetaPageData *metadata;
+
+ metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO);
+ LockBuffer(metabuffer, GIN_EXCLUSIVE);
+ metapage = BufferGetPage(metabuffer);
+ metadata = GinPageGetMeta(metapage);
+
+ START_CRIT_SECTION();
+
+ metadata->nTotalPages = stats->nTotalPages;
+ metadata->nEntryPages = stats->nEntryPages;
+ metadata->nDataPages = stats->nDataPages;
+ metadata->nEntries = stats->nEntries;
+
+ MarkBufferDirty(metabuffer);
+
+ if (RelationNeedsWAL(index))
+ {
+ XLogRecPtr recptr;
+ ginxlogUpdateMeta data;
+ XLogRecData rdata;
+
+ data.node = index->rd_node;
+ data.ntuples = 0;
+ data.newRightlink = data.prevTail = InvalidBlockNumber;
+ memcpy(&data.metadata, metadata, sizeof(GinMetaPageData));
+
+ rdata.buffer = InvalidBuffer;
+ rdata.data = (char *) &data;
+ rdata.len = sizeof(ginxlogUpdateMeta);
+ rdata.next = NULL;
+
+ recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_UPDATE_META_PAGE, &rdata);
+ PageSetLSN(metapage, recptr);
+ PageSetTLI(metapage, ThisTimeLineID);
+ }
+
+ UnlockReleaseBuffer(metabuffer);
+
+ END_CRIT_SECTION();
+}
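/*
 * Editor's sketch -- a hypothetical caller, not part of this patch: the new
 * statistics API can be used like this on an open GIN index relation:
 *
 *     GinStatsData stats;
 *
 *     ginGetStats(indexRel, &stats);
 *     elog(DEBUG1, "GIN version %d, %u pending page(s)",
 *          stats.ginVersion, stats.nPendingPages);
 *
 * Only nPendingPages and ginVersion are guaranteed current; the page and
 * entry counts are as of the last VACUUM, per the comment on ginGetStats.
 */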
diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c
index a13b99cdfa..79c54f16b8 100644
--- a/src/backend/access/gin/ginvacuum.c
+++ b/src/backend/access/gin/ginvacuum.c
@@ -4,18 +4,17 @@
* delete & vacuum routines for the postgres GIN
*
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/gin/ginvacuum.c,v 1.33 2010/02/08 04:33:52 tgl Exp $
+ * src/backend/access/gin/ginvacuum.c
*-------------------------------------------------------------------------
*/
#include "postgres.h"
-#include "access/genam.h"
-#include "access/gin.h"
+#include "access/gin_private.h"
#include "catalog/storage.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
@@ -93,7 +92,7 @@ xlogVacuumPage(Relation index, Buffer buffer)
Assert(GinPageIsLeaf(page));
- if (index->rd_istemp)
+ if (!RelationNeedsWAL(index))
return;
data.node = index->rd_node;
@@ -190,7 +189,6 @@ ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
/* saves changes about deleted tuple ... */
if (oldMaxOff != newMaxOff)
{
-
START_CRIT_SECTION();
if (newMaxOff > 0)
@@ -293,7 +291,7 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
Assert(PostingItemGetBlockNumber(tod) == deleteBlkno);
} while (0);
#endif
- PageDeletePostingItem(parentPage, myoff);
+ GinPageDeletePostingItem(parentPage, myoff);
page = BufferGetPage(dBuffer);
@@ -308,7 +306,7 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
MarkBufferDirty(lBuffer);
MarkBufferDirty(dBuffer);
- if (!gvs->index->rd_istemp)
+ if (RelationNeedsWAL(gvs->index))
{
XLogRecPtr recptr;
XLogRecData rdata[4];
@@ -519,7 +517,7 @@ ginVacuumEntryPage(GinVacuumState *gvs, Buffer buffer, BlockNumber *roots, uint3
* store the posting tree root for further processing; we can't
* vacuum it just now, due to the risk of deadlocks with scans/inserts
*/
- roots[*nroot] = GinItemPointerGetBlockNumber(&itup->t_tid);
+ roots[*nroot] = GinGetDownlink(itup);
(*nroot)++;
}
else if (GinGetNPosting(itup) > 0)
@@ -533,8 +531,9 @@ ginVacuumEntryPage(GinVacuumState *gvs, Buffer buffer, BlockNumber *roots, uint3
if (GinGetNPosting(itup) != newN)
{
- Datum value;
OffsetNumber attnum;
+ Datum key;
+ GinNullCategory category;
/*
* Some ItemPointers were deleted, so we should remake our
@@ -562,9 +561,9 @@ ginVacuumEntryPage(GinVacuumState *gvs, Buffer buffer, BlockNumber *roots, uint3
itup = (IndexTuple) PageGetItem(tmppage, PageGetItemId(tmppage, i));
}
- value = gin_index_getattr(&gvs->ginstate, itup);
attnum = gintuple_get_attrnum(&gvs->ginstate, itup);
- itup = GinFormTuple(gvs->index, &gvs->ginstate, attnum, value,
+ key = gintuple_get_key(&gvs->ginstate, itup, &category);
+ itup = GinFormTuple(&gvs->ginstate, attnum, key, category,
GinGetPosting(itup), newN, true);
PageIndexTupleDelete(tmppage, i);
@@ -606,7 +605,7 @@ ginbulkdelete(PG_FUNCTION_ARGS)
/* Yes, so initialize stats to zeroes */
stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
/* and cleanup any pending inserts */
- ginInsertCleanup(index, &gvs.ginstate, true, stats);
+ ginInsertCleanup(&gvs.ginstate, true, stats);
}
/* we'll re-count the tuples each time */
@@ -642,7 +641,7 @@ ginbulkdelete(PG_FUNCTION_ARGS)
Assert(PageGetMaxOffsetNumber(page) >= FirstOffsetNumber);
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, FirstOffsetNumber));
- blkno = GinItemPointerGetBlockNumber(&(itup)->t_tid);
+ blkno = GinGetDownlink(itup);
Assert(blkno != InvalidBlockNumber);
UnlockReleaseBuffer(buffer);
@@ -707,9 +706,8 @@ ginvacuumcleanup(PG_FUNCTION_ARGS)
BlockNumber npages,
blkno;
BlockNumber totFreePages;
- BlockNumber lastBlock = GIN_ROOT_BLKNO,
- lastFilledBlock = GIN_ROOT_BLKNO;
GinState ginstate;
+ GinStatsData idxStat;
/*
* In an autovacuum analyze, we want to clean up pending insertions.
@@ -720,7 +718,7 @@ ginvacuumcleanup(PG_FUNCTION_ARGS)
if (IsAutoVacuumWorkerProcess())
{
initGinState(&ginstate, index);
- ginInsertCleanup(index, &ginstate, true, stats);
+ ginInsertCleanup(&ginstate, true, stats);
}
PG_RETURN_POINTER(stats);
}
@@ -733,9 +731,11 @@ ginvacuumcleanup(PG_FUNCTION_ARGS)
{
stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
initGinState(&ginstate, index);
- ginInsertCleanup(index, &ginstate, true, stats);
+ ginInsertCleanup(&ginstate, true, stats);
}
+ memset(&idxStat, 0, sizeof(idxStat));
+
/*
* XXX we always report the heap tuple count as the number of index
* entries. This is bogus if the index is partial, but it's real hard to
@@ -757,7 +757,7 @@ ginvacuumcleanup(PG_FUNCTION_ARGS)
totFreePages = 0;
- for (blkno = GIN_ROOT_BLKNO + 1; blkno < npages; blkno++)
+ for (blkno = GIN_ROOT_BLKNO; blkno < npages; blkno++)
{
Buffer buffer;
Page page;
@@ -771,15 +771,28 @@ ginvacuumcleanup(PG_FUNCTION_ARGS)
if (GinPageIsDeleted(page))
{
+ Assert(blkno != GIN_ROOT_BLKNO);
RecordFreeIndexPage(index, blkno);
totFreePages++;
}
- else
- lastFilledBlock = blkno;
+ else if (GinPageIsData(page))
+ {
+ idxStat.nDataPages++;
+ }
+ else if (!GinPageIsList(page))
+ {
+ idxStat.nEntryPages++;
+
+ if (GinPageIsLeaf(page))
+ idxStat.nEntries += PageGetMaxOffsetNumber(page);
+ }
UnlockReleaseBuffer(buffer);
}
- lastBlock = npages - 1;
+
+ /* Update the metapage with accurate page and entry counts */
+ idxStat.nTotalPages = npages;
+ ginUpdateStats(info->index, &idxStat);
/* Finally, vacuum the FSM */
IndexFreeSpaceMapVacuum(info->index);
diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c
index cff5bc8bd5..c954bcb12f 100644
--- a/src/backend/access/gin/ginxlog.c
+++ b/src/backend/access/gin/ginxlog.c
@@ -4,16 +4,16 @@
* WAL replay logic for inverted index.
*
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/gin/ginxlog.c,v 1.22 2010/02/09 20:31:24 heikki Exp $
+ * src/backend/access/gin/ginxlog.c
*-------------------------------------------------------------------------
*/
#include "postgres.h"
-#include "access/gin.h"
+#include "access/gin_private.h"
#include "access/xlogutils.h"
#include "storage/bufmgr.h"
#include "utils/memutils.h"
@@ -77,11 +77,13 @@ ginRedoCreateIndex(XLogRecPtr lsn, XLogRecord *record)
MetaBuffer = XLogReadBuffer(*node, GIN_METAPAGE_BLKNO, true);
Assert(BufferIsValid(MetaBuffer));
+ page = (Page) BufferGetPage(MetaBuffer);
+
GinInitMetabuffer(MetaBuffer);
- page = (Page) BufferGetPage(MetaBuffer);
PageSetLSN(page, lsn);
PageSetTLI(page, ThisTimeLineID);
+ MarkBufferDirty(MetaBuffer);
RootBuffer = XLogReadBuffer(*node, GIN_ROOT_BLKNO, true);
Assert(BufferIsValid(RootBuffer));
@@ -91,11 +93,10 @@ ginRedoCreateIndex(XLogRecPtr lsn, XLogRecord *record)
PageSetLSN(page, lsn);
PageSetTLI(page, ThisTimeLineID);
-
- MarkBufferDirty(MetaBuffer);
- UnlockReleaseBuffer(MetaBuffer);
MarkBufferDirty(RootBuffer);
+
UnlockReleaseBuffer(RootBuffer);
+ UnlockReleaseBuffer(MetaBuffer);
}
static void
@@ -128,21 +129,49 @@ ginRedoInsert(XLogRecPtr lsn, XLogRecord *record)
Buffer buffer;
Page page;
+ /* first, forget any incomplete split this insertion completes */
+ if (data->isData)
+ {
+ Assert(data->isDelete == FALSE);
+ if (!data->isLeaf && data->updateBlkno != InvalidBlockNumber)
+ {
+ PostingItem *pitem;
+
+ pitem = (PostingItem *) (XLogRecGetData(record) + sizeof(ginxlogInsert));
+ forgetIncompleteSplit(data->node,
+ PostingItemGetBlockNumber(pitem),
+ data->updateBlkno);
+ }
+
+ }
+ else
+ {
+ if (!data->isLeaf && data->updateBlkno != InvalidBlockNumber)
+ {
+ IndexTuple itup;
+
+ itup = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogInsert));
+ forgetIncompleteSplit(data->node,
+ GinGetDownlink(itup),
+ data->updateBlkno);
+ }
+ }
+
/* nothing else to do if page was backed up */
if (record->xl_info & XLR_BKP_BLOCK_1)
return;
buffer = XLogReadBuffer(data->node, data->blkno, false);
- Assert(BufferIsValid(buffer));
+ if (!BufferIsValid(buffer))
+ return; /* page was deleted, nothing to do */
page = (Page) BufferGetPage(buffer);
- if (data->isData)
+ if (!XLByteLE(lsn, PageGetLSN(page)))
{
- Assert(data->isDelete == FALSE);
- Assert(GinPageIsData(page));
-
- if (!XLByteLE(lsn, PageGetLSN(page)))
+ if (data->isData)
{
+ Assert(GinPageIsData(page));
+
if (data->isLeaf)
{
OffsetNumber i;
@@ -172,30 +201,19 @@ ginRedoInsert(XLogRecPtr lsn, XLogRecord *record)
GinDataPageAddItem(page, pitem, data->offset);
}
}
-
- if (!data->isLeaf && data->updateBlkno != InvalidBlockNumber)
+ else
{
- PostingItem *pitem = (PostingItem *) (XLogRecGetData(record) + sizeof(ginxlogInsert));
-
- forgetIncompleteSplit(data->node, PostingItemGetBlockNumber(pitem), data->updateBlkno);
- }
+ IndexTuple itup;
- }
- else
- {
- IndexTuple itup;
+ Assert(!GinPageIsData(page));
- Assert(!GinPageIsData(page));
-
- if (!XLByteLE(lsn, PageGetLSN(page)))
- {
if (data->updateBlkno != InvalidBlockNumber)
{
/* update link to right page after split */
Assert(!GinPageIsLeaf(page));
Assert(data->offset >= FirstOffsetNumber && data->offset <= PageGetMaxOffsetNumber(page));
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, data->offset));
- ItemPointerSet(&itup->t_tid, data->updateBlkno, InvalidOffsetNumber);
+ GinSetDownlink(itup, data->updateBlkno);
}
if (data->isDelete)
@@ -212,20 +230,12 @@ ginRedoInsert(XLogRecPtr lsn, XLogRecord *record)
data->node.spcNode, data->node.dbNode, data->node.relNode);
}
- if (!data->isLeaf && data->updateBlkno != InvalidBlockNumber)
- {
- itup = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogInsert));
- forgetIncompleteSplit(data->node, GinItemPointerGetBlockNumber(&itup->t_tid), data->updateBlkno);
- }
- }
-
- if (!XLByteLE(lsn, PageGetLSN(page)))
- {
PageSetLSN(page, lsn);
PageSetTLI(page, ThisTimeLineID);
MarkBufferDirty(buffer);
}
+
UnlockReleaseBuffer(buffer);
}
@@ -244,7 +254,7 @@ ginRedoSplit(XLogRecPtr lsn, XLogRecord *record)
if (data->isData)
flags |= GIN_DATA;
- lbuffer = XLogReadBuffer(data->node, data->lblkno, data->isRootSplit);
+ lbuffer = XLogReadBuffer(data->node, data->lblkno, true);
Assert(BufferIsValid(lbuffer));
lpage = (Page) BufferGetPage(lbuffer);
GinInitBuffer(lbuffer, flags);
@@ -260,7 +270,7 @@ ginRedoSplit(XLogRecPtr lsn, XLogRecord *record)
if (data->isData)
{
char *ptr = XLogRecGetData(record) + sizeof(ginxlogSplit);
- Size sizeofitem = GinSizeOfItem(lpage);
+ Size sizeofitem = GinSizeOfDataPageItem(lpage);
OffsetNumber i;
ItemPointer bound;
@@ -321,7 +331,7 @@ ginRedoSplit(XLogRecPtr lsn, XLogRecord *record)
if (data->isRootSplit)
{
- Buffer rootBuf = XLogReadBuffer(data->node, data->rootBlkno, false);
+ Buffer rootBuf = XLogReadBuffer(data->node, data->rootBlkno, true);
Page rootPage = BufferGetPage(rootBuf);
GinInitBuffer(rootBuf, flags & ~GIN_LEAF);
@@ -329,12 +339,12 @@ ginRedoSplit(XLogRecPtr lsn, XLogRecord *record)
if (data->isData)
{
Assert(data->rootBlkno != GIN_ROOT_BLKNO);
- dataFillRoot(NULL, rootBuf, lbuffer, rbuffer);
+ ginDataFillRoot(NULL, rootBuf, lbuffer, rbuffer);
}
else
{
Assert(data->rootBlkno == GIN_ROOT_BLKNO);
- entryFillRoot(NULL, rootBuf, lbuffer, rbuffer);
+ ginEntryFillRoot(NULL, rootBuf, lbuffer, rbuffer);
}
PageSetLSN(rootPage, lsn);
@@ -357,45 +367,50 @@ ginRedoVacuumPage(XLogRecPtr lsn, XLogRecord *record)
Buffer buffer;
Page page;
- /* nothing else to do if page was backed up (and no info to do it with) */
+ /* nothing to do if page was backed up (and no info to do it with) */
if (record->xl_info & XLR_BKP_BLOCK_1)
return;
buffer = XLogReadBuffer(data->node, data->blkno, false);
- Assert(BufferIsValid(buffer));
+ if (!BufferIsValid(buffer))
+ return;
page = (Page) BufferGetPage(buffer);
- if (GinPageIsData(page))
- {
- memcpy(GinDataPageGetData(page), XLogRecGetData(record) + sizeof(ginxlogVacuumPage),
- GinSizeOfItem(page) *data->nitem);
- GinPageGetOpaque(page)->maxoff = data->nitem;
- }
- else
+ if (!XLByteLE(lsn, PageGetLSN(page)))
{
- OffsetNumber i,
- *tod;
- IndexTuple itup = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogVacuumPage));
+ if (GinPageIsData(page))
+ {
+ memcpy(GinDataPageGetData(page),
+ XLogRecGetData(record) + sizeof(ginxlogVacuumPage),
+ data->nitem * GinSizeOfDataPageItem(page));
+ GinPageGetOpaque(page)->maxoff = data->nitem;
+ }
+ else
+ {
+ OffsetNumber i,
+ *tod;
+ IndexTuple itup = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogVacuumPage));
- tod = (OffsetNumber *) palloc(sizeof(OffsetNumber) * PageGetMaxOffsetNumber(page));
- for (i = FirstOffsetNumber; i <= PageGetMaxOffsetNumber(page); i++)
- tod[i - 1] = i;
+ tod = (OffsetNumber *) palloc(sizeof(OffsetNumber) * PageGetMaxOffsetNumber(page));
+ for (i = FirstOffsetNumber; i <= PageGetMaxOffsetNumber(page); i++)
+ tod[i - 1] = i;
- PageIndexMultiDelete(page, tod, PageGetMaxOffsetNumber(page));
+ PageIndexMultiDelete(page, tod, PageGetMaxOffsetNumber(page));
- for (i = 0; i < data->nitem; i++)
- {
- if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
- elog(ERROR, "failed to add item to index page in %u/%u/%u",
- data->node.spcNode, data->node.dbNode, data->node.relNode);
- itup = (IndexTuple) (((char *) itup) + MAXALIGN(IndexTupleSize(itup)));
+ for (i = 0; i < data->nitem; i++)
+ {
+ if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
+ elog(ERROR, "failed to add item to index page in %u/%u/%u",
+ data->node.spcNode, data->node.dbNode, data->node.relNode);
+ itup = (IndexTuple) (((char *) itup) + MAXALIGN(IndexTupleSize(itup)));
+ }
}
- }
- PageSetLSN(page, lsn);
- PageSetTLI(page, ThisTimeLineID);
+ PageSetLSN(page, lsn);
+ PageSetTLI(page, ThisTimeLineID);
+ MarkBufferDirty(buffer);
+ }
- MarkBufferDirty(buffer);
UnlockReleaseBuffer(buffer);
}
@@ -409,38 +424,56 @@ ginRedoDeletePage(XLogRecPtr lsn, XLogRecord *record)
if (!(record->xl_info & XLR_BKP_BLOCK_1))
{
buffer = XLogReadBuffer(data->node, data->blkno, false);
- page = BufferGetPage(buffer);
- Assert(GinPageIsData(page));
- GinPageGetOpaque(page)->flags = GIN_DELETED;
- PageSetLSN(page, lsn);
- PageSetTLI(page, ThisTimeLineID);
- MarkBufferDirty(buffer);
- UnlockReleaseBuffer(buffer);
+ if (BufferIsValid(buffer))
+ {
+ page = BufferGetPage(buffer);
+ if (!XLByteLE(lsn, PageGetLSN(page)))
+ {
+ Assert(GinPageIsData(page));
+ GinPageGetOpaque(page)->flags = GIN_DELETED;
+ PageSetLSN(page, lsn);
+ PageSetTLI(page, ThisTimeLineID);
+ MarkBufferDirty(buffer);
+ }
+ UnlockReleaseBuffer(buffer);
+ }
}
if (!(record->xl_info & XLR_BKP_BLOCK_2))
{
buffer = XLogReadBuffer(data->node, data->parentBlkno, false);
- page = BufferGetPage(buffer);
- Assert(GinPageIsData(page));
- Assert(!GinPageIsLeaf(page));
- PageDeletePostingItem(page, data->parentOffset);
- PageSetLSN(page, lsn);
- PageSetTLI(page, ThisTimeLineID);
- MarkBufferDirty(buffer);
- UnlockReleaseBuffer(buffer);
+ if (BufferIsValid(buffer))
+ {
+ page = BufferGetPage(buffer);
+ if (!XLByteLE(lsn, PageGetLSN(page)))
+ {
+ Assert(GinPageIsData(page));
+ Assert(!GinPageIsLeaf(page));
+ GinPageDeletePostingItem(page, data->parentOffset);
+ PageSetLSN(page, lsn);
+ PageSetTLI(page, ThisTimeLineID);
+ MarkBufferDirty(buffer);
+ }
+ UnlockReleaseBuffer(buffer);
+ }
}
if (!(record->xl_info & XLR_BKP_BLOCK_3) && data->leftBlkno != InvalidBlockNumber)
{
buffer = XLogReadBuffer(data->node, data->leftBlkno, false);
- page = BufferGetPage(buffer);
- Assert(GinPageIsData(page));
- GinPageGetOpaque(page)->rightlink = data->rightLink;
- PageSetLSN(page, lsn);
- PageSetTLI(page, ThisTimeLineID);
- MarkBufferDirty(buffer);
- UnlockReleaseBuffer(buffer);
+ if (BufferIsValid(buffer))
+ {
+ page = BufferGetPage(buffer);
+ if (!XLByteLE(lsn, PageGetLSN(page)))
+ {
+ Assert(GinPageIsData(page));
+ GinPageGetOpaque(page)->rightlink = data->rightLink;
+ PageSetLSN(page, lsn);
+ PageSetTLI(page, ThisTimeLineID);
+ MarkBufferDirty(buffer);
+ }
+ UnlockReleaseBuffer(buffer);
+ }
}
}
@@ -450,8 +483,11 @@ ginRedoUpdateMetapage(XLogRecPtr lsn, XLogRecord *record)
ginxlogUpdateMeta *data = (ginxlogUpdateMeta *) XLogRecGetData(record);
Buffer metabuffer;
Page metapage;
+ Buffer buffer;
metabuffer = XLogReadBuffer(data->node, GIN_METAPAGE_BLKNO, false);
+ if (!BufferIsValid(metabuffer))
+ elog(PANIC, "GIN metapage disappeared");
metapage = BufferGetPage(metabuffer);
if (!XLByteLE(lsn, PageGetLSN(metapage)))
@@ -469,40 +505,43 @@ ginRedoUpdateMetapage(XLogRecPtr lsn, XLogRecord *record)
*/
if (!(record->xl_info & XLR_BKP_BLOCK_1))
{
- Buffer buffer = XLogReadBuffer(data->node, data->metadata.tail, false);
- Page page = BufferGetPage(buffer);
-
- if (!XLByteLE(lsn, PageGetLSN(page)))
+ buffer = XLogReadBuffer(data->node, data->metadata.tail, false);
+ if (BufferIsValid(buffer))
{
- OffsetNumber l,
- off = (PageIsEmpty(page)) ? FirstOffsetNumber :
- OffsetNumberNext(PageGetMaxOffsetNumber(page));
- int i,
- tupsize;
- IndexTuple tuples = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogUpdateMeta));
-
- for (i = 0; i < data->ntuples; i++)
+ Page page = BufferGetPage(buffer);
+
+ if (!XLByteLE(lsn, PageGetLSN(page)))
{
- tupsize = IndexTupleSize(tuples);
+ OffsetNumber l,
+ off = (PageIsEmpty(page)) ? FirstOffsetNumber :
+ OffsetNumberNext(PageGetMaxOffsetNumber(page));
+ int i,
+ tupsize;
+ IndexTuple tuples = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogUpdateMeta));
- l = PageAddItem(page, (Item) tuples, tupsize, off, false, false);
+ for (i = 0; i < data->ntuples; i++)
+ {
+ tupsize = IndexTupleSize(tuples);
- if (l == InvalidOffsetNumber)
- elog(ERROR, "failed to add item to index page");
+ l = PageAddItem(page, (Item) tuples, tupsize, off, false, false);
- tuples = (IndexTuple) (((char *) tuples) + tupsize);
- }
+ if (l == InvalidOffsetNumber)
+ elog(ERROR, "failed to add item to index page");
- /*
- * Increase counter of heap tuples
- */
- GinPageGetOpaque(page)->maxoff++;
+ tuples = (IndexTuple) (((char *) tuples) + tupsize);
+ }
- PageSetLSN(page, lsn);
- PageSetTLI(page, ThisTimeLineID);
- MarkBufferDirty(buffer);
+ /*
+ * Increase counter of heap tuples
+ */
+ GinPageGetOpaque(page)->maxoff++;
+
+ PageSetLSN(page, lsn);
+ PageSetTLI(page, ThisTimeLineID);
+ MarkBufferDirty(buffer);
+ }
+ UnlockReleaseBuffer(buffer);
}
- UnlockReleaseBuffer(buffer);
}
}
else if (data->prevTail != InvalidBlockNumber)
@@ -510,19 +549,21 @@ ginRedoUpdateMetapage(XLogRecPtr lsn, XLogRecord *record)
/*
* New tail
*/
-
- Buffer buffer = XLogReadBuffer(data->node, data->prevTail, false);
- Page page = BufferGetPage(buffer);
-
- if (!XLByteLE(lsn, PageGetLSN(page)))
+ buffer = XLogReadBuffer(data->node, data->prevTail, false);
+ if (BufferIsValid(buffer))
{
- GinPageGetOpaque(page)->rightlink = data->newRightlink;
+ Page page = BufferGetPage(buffer);
- PageSetLSN(page, lsn);
- PageSetTLI(page, ThisTimeLineID);
- MarkBufferDirty(buffer);
+ if (!XLByteLE(lsn, PageGetLSN(page)))
+ {
+ GinPageGetOpaque(page)->rightlink = data->newRightlink;
+
+ PageSetLSN(page, lsn);
+ PageSetTLI(page, ThisTimeLineID);
+ MarkBufferDirty(buffer);
+ }
+ UnlockReleaseBuffer(buffer);
}
- UnlockReleaseBuffer(buffer);
}
UnlockReleaseBuffer(metabuffer);
@@ -544,6 +585,7 @@ ginRedoInsertListPage(XLogRecPtr lsn, XLogRecord *record)
return;
buffer = XLogReadBuffer(data->node, data->blkno, true);
+ Assert(BufferIsValid(buffer));
page = BufferGetPage(buffer);
GinInitBuffer(buffer, GIN_LIST);
@@ -587,6 +629,8 @@ ginRedoDeleteListPages(XLogRecPtr lsn, XLogRecord *record)
int i;
metabuffer = XLogReadBuffer(data->node, GIN_METAPAGE_BLKNO, false);
+ if (!BufferIsValid(metabuffer))
+ elog(PANIC, "GIN metapage disappeared");
metapage = BufferGetPage(metabuffer);
if (!XLByteLE(lsn, PageGetLSN(metapage)))
@@ -600,18 +644,22 @@ ginRedoDeleteListPages(XLogRecPtr lsn, XLogRecord *record)
for (i = 0; i < data->ndeleted; i++)
{
Buffer buffer = XLogReadBuffer(data->node, data->toDelete[i], false);
- Page page = BufferGetPage(buffer);
- if (!XLByteLE(lsn, PageGetLSN(page)))
+ if (BufferIsValid(buffer))
{
- GinPageGetOpaque(page)->flags = GIN_DELETED;
+ Page page = BufferGetPage(buffer);
- PageSetLSN(page, lsn);
- PageSetTLI(page, ThisTimeLineID);
- MarkBufferDirty(buffer);
- }
+ if (!XLByteLE(lsn, PageGetLSN(page)))
+ {
+ GinPageGetOpaque(page)->flags = GIN_DELETED;
- UnlockReleaseBuffer(buffer);
+ PageSetLSN(page, lsn);
+ PageSetTLI(page, ThisTimeLineID);
+ MarkBufferDirty(buffer);
+ }
+
+ UnlockReleaseBuffer(buffer);
+ }
}
UnlockReleaseBuffer(metabuffer);
}
@@ -745,6 +793,7 @@ static void
ginContinueSplit(ginIncompleteSplit *split)
{
GinBtreeData btree;
+ GinState ginstate;
Relation reln;
Buffer buffer;
GinBtreeStack stack;
@@ -755,18 +804,30 @@ ginContinueSplit(ginIncompleteSplit *split)
*/
buffer = XLogReadBuffer(split->node, split->leftBlkno, false);
+ /*
+ * Failure should be impossible here, because we wrote the page earlier.
+ */
+ if (!BufferIsValid(buffer))
+ elog(PANIC, "ginContinueSplit: left block %u not found",
+ split->leftBlkno);
+
reln = CreateFakeRelcacheEntry(split->node);
if (split->rootBlkno == GIN_ROOT_BLKNO)
{
- prepareEntryScan(&btree, reln, InvalidOffsetNumber, (Datum) 0, NULL);
+ MemSet(&ginstate, 0, sizeof(ginstate));
+ ginstate.index = reln;
+
+ ginPrepareEntryScan(&btree,
+ InvalidOffsetNumber, (Datum) 0, GIN_CAT_NULL_KEY,
+ &ginstate);
btree.entry = ginPageGetLinkItup(buffer);
}
else
{
Page page = BufferGetPage(buffer);
- prepareDataScan(&btree, reln);
+ ginPrepareDataScan(&btree, reln);
PostingItemSetBlockNumber(&(btree.pitem), split->leftBlkno);
if (GinPageIsLeaf(page))
@@ -784,8 +845,8 @@ ginContinueSplit(ginIncompleteSplit *split)
stack.off = InvalidOffsetNumber;
stack.parent = NULL;
- findParents(&btree, &stack, split->rootBlkno);
- ginInsertValue(&btree, stack.parent);
+ ginFindParents(&btree, &stack, split->rootBlkno);
+ ginInsertValue(&btree, stack.parent, NULL);
FreeFakeRelcacheEntry(reln);
diff --git a/src/backend/access/gist/Makefile b/src/backend/access/gist/Makefile
index 298e9309f5..f8051a2b45 100644
--- a/src/backend/access/gist/Makefile
+++ b/src/backend/access/gist/Makefile
@@ -4,7 +4,7 @@
# Makefile for access/gist
#
# IDENTIFICATION
-# $PostgreSQL: pgsql/src/backend/access/gist/Makefile,v 1.18 2008/02/19 10:30:06 petere Exp $
+# src/backend/access/gist/Makefile
#
#-------------------------------------------------------------------------
diff --git a/src/backend/access/gist/README b/src/backend/access/gist/README
index 6c90e508bf..2d78dcb0df 100644
--- a/src/backend/access/gist/README
+++ b/src/backend/access/gist/README
@@ -1,4 +1,4 @@
-$PostgreSQL: pgsql/src/backend/access/gist/README,v 1.5 2010/04/14 20:17:26 rhaas Exp $
+src/backend/access/gist/README
GiST Indexing
=============
@@ -20,33 +20,34 @@ The current implementation of GiST supports:
* Variable length keys
* Composite keys (multi-key)
+ * Ordered search (nearest-neighbor search)
* A NULL-safe interface to the GiST core
* Concurrency
* Recovery support via WAL logging
-The support for concurrency implemented in PostgreSQL was developed based on
-the paper "Access Methods for Next-Generation Database Systems" by
+The support for concurrency implemented in PostgreSQL was developed based on
+the paper "Access Methods for Next-Generation Database Systems" by
Marcel Kornaker:
http://www.sai.msu.su/~megera/postgres/gist/papers/concurrency/access-methods-for-next-generation.pdf.gz
The original algorithms were modified in several ways:
-* They should be adapted to PostgreSQL conventions. For example, the SEARCH
- algorithm was considerably changed, because in PostgreSQL function search
- should return one tuple (next), not all tuples at once. Also, it should
+* They had to be adapted to PostgreSQL conventions. For example, the SEARCH
+ algorithm was considerably changed, because in PostgreSQL the search function
+ should return one tuple (next), not all tuples at once. Also, it should
release page locks between calls.
-* Since we added support for variable length keys, it's not possible to
- guarantee enough free space for all keys on pages after splitting. User
- defined function picksplit doesn't have information about size of tuples
+* Since we added support for variable length keys, it's not possible to
+ guarantee enough free space for all keys on pages after splitting. User
+  defined function picksplit doesn't have information about the sizes of tuples
  (each tuple may contain several keys, as in a multicolumn index, while
  picksplit works with only one key) or of pages.
-* We modified original INSERT algorithm for performance reason. In particular,
+* We modified the original INSERT algorithm for performance reasons. In particular,
it is now a single-pass algorithm.
* Since the papers were theoretical, some details were omitted and we
- have to find out ourself how to solve some specific problems.
+  had to figure out for ourselves how to solve some specific problems.
-Because of the above reasons, we have to revised interaction of GiST
+For the above reasons, we have revised the interaction of the GiST
core and the PostgreSQL WAL system. Moreover, we encountered (and solved)
a problem of incomplete insertions when recovering after a crash, which
was not addressed in the paper.
@@ -54,96 +55,127 @@ was not touched in the paper.
Search Algorithm
----------------
-Function gettuple finds a tuple which satisfies the search
-predicate. It store their state and returns next tuple under
-subsequent calls. Stack contains page, its LSN and LSN of parent page
-and currentposition is saved between calls.
+The search code maintains a queue of unvisited items, where an "item" is
+either a heap tuple known to satisfy the search conditions, or an index
+page that is consistent with the search conditions according to inspection
+of its parent page's downlink item. Initially the root page is searched
+to find unvisited items in it. Then we pull items from the queue. A
+heap tuple pointer is just returned immediately; an index page entry
+causes that page to be searched, generating more queue entries.
-gettuple(search-pred)
- if ( firsttime )
- push(stack, [root, 0, 0]) // page, LSN, parentLSN
- currentposition=0
- end
- ptr = top of stack
- while(true)
- latch( ptr->page, S-mode )
- if ( ptr->page->lsn != ptr->lsn )
- ptr->lsn = ptr->page->lsn
- currentposition=0
- if ( ptr->parentlsn < ptr->page->nsn )
- add to stack rightlink
- else
- currentposition++
- end
+The queue is kept ordered with heap tuple items at the front, then
+index page entries, with any newly-added index page entry inserted
+before existing index page entries. This ensures depth-first traversal
+of the index, and in particular causes the first few heap tuples to be
+returned as soon as possible. That is helpful in case there is a LIMIT
+that requires only a few tuples to be produced.
- while(true)
- currentposition = find_first_match( currentposition )
- if ( currentposition is invalid )
- unlatch( ptr->page )
- pop stack
- ptr = top of stack
- if (ptr is NULL)
- return NULL
- break loop
- else if ( ptr->page is leaf )
- unlatch( ptr->page )
- return tuple
- else
- add to stack child page
- end
- currentposition++
- end
- end
+To implement nearest-neighbor search, the queue entries are augmented
+with distance data: heap tuple entries are labeled with exact distance
+from the search argument, while index-page entries must be labeled with
+the minimum distance that any of their children could have. Then,
+queue entries are retrieved in smallest-distance-first order, with
+entries having identical distances managed as stated in the previous
+paragraph.
+
+The search algorithm keeps an index page locked only long enough to scan
+its entries and queue those that satisfy the search conditions. Since
+insertions can occur concurrently with searches, it is possible for an
+index child page to be split between the time we make a queue entry for it
+(while visiting its parent page) and the time we actually reach and scan
+the child page. To avoid missing the entries that were moved to the right
+sibling, we detect whether a split has occurred by comparing the child
+page's NSN to the LSN that the parent had when visited. If it did, the
+sibling page is immediately added to the front of the queue, ensuring that
+its items will be scanned in the same order as if they were still on the
+original child page.
+
+As is usual in Postgres, the search algorithm only guarantees to find index
+entries that existed before the scan started; index entries added during
+the scan might or might not be visited. This is okay as long as all
+searches use MVCC snapshot rules to reject heap tuples newer than the time
+of scan start. In particular, this means that we need not worry about
+cases where a parent page's downlink key is "enlarged" after we look at it.
+Any such enlargement would be to add child items that we aren't interested
+in returning anyway.
Insert Algorithm
----------------
-INSERT guarantees that the GiST tree remains balanced. User defined key method
-Penalty is used for choosing a subtree to insert; method PickSplit is used for
-the node splitting algorithm; method Union is used for propagating changes
+INSERT guarantees that the GiST tree remains balanced. User defined key method
+Penalty is used for choosing a subtree to insert; method PickSplit is used for
+the node splitting algorithm; method Union is used for propagating changes
upward to maintain the tree properties.
-NOTICE: We modified original INSERT algorithm for performance reason. In
-particularly, it is now a single-pass algorithm.
+To insert a tuple, we first have to find a suitable leaf page to insert to.
+The algorithm walks down the tree, starting from the root, along the path
+of smallest Penalty. At each step:
-Function findLeaf is used to identify subtree for insertion. Page, in which
-insertion is proceeded, is locked as well as its parent page. Functions
-findParent and findPath are used to find parent pages, which could be changed
-because of concurrent access. Function pageSplit is recurrent and could split
-page by more than 2 pages, which could be necessary if keys have different
-lengths or more than one key are inserted (in such situation, user defined
-function pickSplit cannot guarantee free space on page).
+1. Has this page been split since we looked at the parent? If so, it's
+possible that we should be inserting to the other half instead, so retreat
+back to the parent.
+2. If this is a leaf node, we've found our target node.
+3. Otherwise use Penalty to pick a new target subtree.
+4. Check the key representing the target subtree. If it doesn't already cover
+the key we're inserting, replace it with the Union of the old downlink key
+and the key being inserted. (Actually, we always call Union, and just skip
+the replacement if the Unioned key is the same as the existing key.)
+5. Replacing the key in step 4 might cause the page to be split. In that case,
+propagate the change upwards and restart the algorithm from the first parent
+that didn't need to be split.
+6. Walk down to the target subtree, and goto 1.
+
+This differs from the insertion algorithm in the original paper. In the
+original paper, you first walk down the tree until you reach a leaf page, and
+then you adjust the downlink in the parent, propagating the adjustment up,
+all the way to the root in the worst case. But we adjust the downlinks to
+cover the new key already when we walk down, so that when we reach the leaf
+page, we don't need to update the parents anymore, except to insert the
+downlinks if we have to split the page. This makes crash recovery simpler:
+after inserting a key to the page, the tree is immediately self-consistent
+without having to update the parents. Even if we split a page and crash before
+inserting the downlink to the parent, the tree is self-consistent because the
+right half of the split is accessible via the rightlink of the left page
+(which replaced the original page).
+
+Note that the algorithm can walk up and down the tree before reaching a leaf
+page, if internal pages need to split while adjusting the downlinks for the
+new key. Eventually, you should reach the bottom, and proceed with the
+insertion of the new tuple.
+
+Once we've found the target page to insert to, we check if there's room
+for the new tuple. If there is, the tuple is inserted, and we're done.
+If it doesn't fit, however, the page needs to be split. Note that it is
+possible that a page needs to be split into more than two pages, if keys have
+different lengths or more than one key is being inserted at a time (which can
+happen when inserting downlinks for a page split that resulted in more than
+two pages at the lower level). After splitting a page, the parent page needs
+to be updated. The downlink for the new page needs to be inserted, and the
+downlink for the old page, which became the left half of the split, needs to
+be updated to only cover those tuples that stayed on the left page. Inserting
+the downlink in the parent can again lead to a page split, recursing up to the
+root page in the worst case.
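+
+For illustration only, this "split, then fix the parent" recursion can be
+sketched as below; the helper names are invented stand-ins, and the real code
+(gistplacetopage and its callers, shown later in this patch) works iteratively,
+also updates the old page's downlink, and treats the root specially:
+
+#include <stdbool.h>
+#include <stddef.h>
+
+struct page;					/* opaque stand-in for an index page */
+
+/* stand-ins for the real operations, invented for this sketch */
+static bool add_tuple(struct page *p, const void *tup) { (void) p; (void) tup; return true; }
+static struct page *split_and_return_new_right(struct page *p) { (void) p; return NULL; }
+static struct page *parent_of(struct page *p) { (void) p; return NULL; }
+static const void *downlink_for(struct page *p) { (void) p; return NULL; }
+
+/*
+ * Insert a tuple; if the page overflows, split it and recurse upward to
+ * insert the new right page's downlink into the parent.
+ */
+static void
+insert_with_split(struct page *page, const void *tuple)
+{
+	if (add_tuple(page, tuple))
+		return;					/* it fit, we're done */
+
+	insert_with_split(parent_of(page),
+					  downlink_for(split_and_return_new_right(page)));
+}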
+
+gistplacetopage is the workhorse function that performs one step of the
+insertion. If the tuple fits, it inserts it to the given page, otherwise
+it splits the page, and constructs the new downlink tuples for the split
+pages. The caller must then call gistplacetopage() on the parent page to
+insert the downlink tuples. The parent page that holds the downlink to
+the child might have migrated as a result of concurrent splits of the
+parent; gistFindCorrectParent() is used to re-find the parent page.
+
+Splitting the root page works slightly differently. At root split,
+gistplacetopage() allocates the new child pages and replaces the old root
+page with the new root containing downlinks to the new children, all in one
+operation.
-findLeaf(new-key)
- push(stack, [root, 0]) //page, LSN
- while(true)
- ptr = top of stack
- latch( ptr->page, S-mode )
- ptr->lsn = ptr->page->lsn
- if ( exists ptr->parent AND ptr->parent->lsn < ptr->page->nsn )
- unlatch( ptr->page )
- pop stack
- else if ( ptr->page is not leaf )
- push( stack, [get_best_child(ptr->page, new-key), 0] )
- unlatch( ptr->page )
- else
- unlatch( ptr->page )
- latch( ptr->page, X-mode )
- if ( ptr->page is not leaf )
- //the only root page can become a non-leaf
- unlatch( ptr->page )
- else if ( ptr->parent->lsn < ptr->page->nsn )
- unlatch( ptr->page )
- pop stack
- else
- return stack
- end
- end
- end
+
+findPath is a subroutine of findParent, used when the correct parent page
+can't be found by following the rightlinks at the parent level:
findPath( stack item )
- push stack, [root, 0, 0] // page, LSN, parent
+ push stack, [root, 0, 0] // page, LSN, parent
while( stack )
ptr = top of stack
latch( ptr->page, S-mode )
@@ -152,7 +184,7 @@ findPath( stack item )
end
for( each tuple on page )
if ( tuple->pagepointer == item->page )
- return stack
+ return stack
else
add to stack at the end [tuple->pagepointer,0, ptr]
end
@@ -160,12 +192,16 @@ findPath( stack item )
unlatch( ptr->page )
pop stack
end
-
+
+
+gistFindCorrectParent is used to re-find the parent of a page during
+insertion. The parent might have migrated to the right since we traversed down
+the tree, because of concurrent page splits.
+
findParent( stack item )
parent = item->parent
- latch( parent->page, X-mode )
if ( parent->page->lsn != parent->lsn )
- while(true)
+ while(true)
search parent tuple on parent->page, if found the return
rightlink = parent->page->rightlink
unlatch( parent->page )
@@ -177,9 +213,13 @@ findParent( stack item )
end
newstack = findPath( item->parent )
replace part of stack to new one
+ latch( parent->page, X-mode )
return findParent( item )
end
+The pageSplit function decides how to distribute keys to the new pages after a
+page split:
+
pageSplit(page, allkeys)
(lkeys, rkeys) = pickSplit( allkeys )
if ( page is root )
@@ -200,40 +240,45 @@ pageSplit(page, allkeys)
return newkeys
-placetopage(page, keysarray)
- if ( no space left on page )
- keysarray = pageSplit(page, [ extract_keys(page), keysarray])
- last page in chain gets old NSN,
- original and others - new NSN equals to LSN
- if ( page is root )
- make new root with keysarray
- end
- else
- put keysarray on page
- if ( length of keysarray > 1 )
- keysarray = [ union(keysarray) ]
- end
- end
-
-insert(new-key)
- stack = findLeaf(new-key)
- keysarray = [new-key]
- ptr = top of stack
- while(true)
- findParent( ptr ) //findParent latches parent page
- keysarray = placetopage(ptr->page, keysarray)
- unlatch( ptr->page )
- pop stack;
- ptr = top of stack
- if (length of keysarray == 1)
- newboundingkey = union(oldboundingkey, keysarray)
- if (newboundingkey == oldboundingkey)
- unlatch ptr->page
- break loop
- end
- end
- end
+
+Concurrency control
+-------------------
+As a rule of thumb, if you need to hold a lock on multiple pages at the
+same time, the locks should be acquired in the following order: child page
+before parent, and left-to-right at the same level. Always acquiring the
+locks in the same order avoids deadlocks.
+
+The search algorithm only looks at and locks one page at a time. Consequently
+there's a race condition between a search and a page split. A page split
+happens in two phases: 1. the page is split; 2. the downlink is inserted into
+the parent. If a search looks at the parent page between those steps, before
+the downlink is inserted, it will still find the new right half by following
+the rightlink on the left half. But it must not follow the rightlink if it saw
+the downlink in the parent, or the page will be visited twice!
+
+A split initially marks the left page with the F_FOLLOW_RIGHT flag. If a scan
+sees that flag set, it knows that the right page is missing the downlink, and
+should be visited too. When the split inserts the downlink into the parent, it
+clears the F_FOLLOW_RIGHT flag in the child, and sets the NSN field in the
+child page header to match the LSN of the insertion on the parent. If the
+F_FOLLOW_RIGHT flag is not set, a scan compares the NSN on the child with the
+LSN it saw in the parent. If the LSN it saw is older than the NSN, the scan
+looked at the parent page before the downlink was inserted, so it should
+follow the rightlink. Otherwise the scan saw the downlink in the parent page,
+and will/did follow that as usual.
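+
+Written out as a C predicate (an illustrative sketch only; the real code
+compares XLogRecPtr values rather than plain integers):
+
+#include <stdbool.h>
+#include <stdint.h>
+
+/*
+ * Must a scan that saw 'parent_lsn' on the parent page also follow the
+ * child's rightlink to find entries moved by a split?
+ */
+static bool
+must_follow_rightlink(bool follow_right_flag, uint64_t child_nsn,
+					  uint64_t parent_lsn)
+{
+	if (follow_right_flag)
+		return true;				/* downlink not inserted into parent yet */
+	return parent_lsn < child_nsn;	/* parent was seen before the downlink */
+}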
+
+A scan can't normally see a page with the F_FOLLOW_RIGHT flag set, because
+a page split keeps the child pages locked until the downlink has been inserted
+to the parent and the flag cleared again. But if a crash happens in the middle
+of a page split, before the downlinks are inserted into the parent, that will
+leave a page with F_FOLLOW_RIGHT in the tree. Scans handle that just fine,
+but we'll eventually want to fix that for performance reasons. And more
+importantly, dealing with pages with missing downlink pointers in the parent
+would complicate the insertion algorithm. So when an insertion sees a page
+with F_FOLLOW_RIGHT set, it immediately tries to bring the split that
+crashed in the middle to completion by adding the downlink in the parent.
+
Authors:
Teodor Sigaev <teodor@sigaev.ru>
- Oleg Bartunov <oleg@sai.msu.su>
+ Oleg Bartunov <oleg@sai.msu.su>
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index cec08c7226..8227bfdb88 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -4,11 +4,11 @@
* interface routines for the postgres GiST index access method.
*
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/gist/gist.c,v 1.158 2010/01/02 16:57:33 momjian Exp $
+ * src/backend/access/gist/gist.c
*
*-------------------------------------------------------------------------
*/
@@ -17,13 +17,12 @@
#include "access/genam.h"
#include "access/gist_private.h"
#include "catalog/index.h"
+#include "catalog/pg_collation.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/indexfsm.h"
#include "utils/memutils.h"
-const XLogRecPtr XLogRecPtrForTemp = {1, 1};
-
/* Working state for gistbuild and its callback */
typedef struct
{
@@ -33,6 +32,12 @@ typedef struct
MemoryContext tmpCtx;
} GISTBuildState;
+/* A List of these is used to represent a split-in-progress. */
+typedef struct
+{
+ Buffer buf; /* the split page "half" */
+ IndexTuple downlink; /* downlink for this half. */
+} GISTPageSplitInfo;
/* non-export function prototypes */
static void gistbuildCallback(Relation index,
@@ -45,8 +50,13 @@ static void gistdoinsert(Relation r,
IndexTuple itup,
Size freespace,
GISTSTATE *GISTstate);
-static void gistfindleaf(GISTInsertState *state,
- GISTSTATE *giststate);
+static void gistfixsplit(GISTInsertState *state, GISTSTATE *giststate);
+static bool gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
+ GISTSTATE *giststate,
+ IndexTuple *tuples, int ntup, OffsetNumber oldoffnum,
+ Buffer leftchild);
+static void gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack,
+ GISTSTATE *giststate, List *splitinfo);
#define ROTATEDIST(d) do { \
@@ -117,7 +127,7 @@ gistbuild(PG_FUNCTION_ARGS)
MarkBufferDirty(buffer);
- if (!index->rd_istemp)
+ if (RelationNeedsWAL(index))
{
XLogRecPtr recptr;
XLogRecData rdata;
@@ -132,7 +142,7 @@ gistbuild(PG_FUNCTION_ARGS)
PageSetTLI(page, ThisTimeLineID);
}
else
- PageSetLSN(page, XLogRecPtrForTemp);
+ PageSetLSN(page, GetXLogRecPtrForTemp());
UnlockReleaseBuffer(buffer);
@@ -210,6 +220,19 @@ gistbuildCallback(Relation index,
}
/*
+ * gistbuildempty() -- build an empty gist index in the initialization fork
+ */
+Datum
+gistbuildempty(PG_FUNCTION_ARGS)
+{
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("unlogged GiST indexes are not supported")));
+
+ PG_RETURN_VOID();
+}
+
+/*
* gistinsert -- wrapper for GiST tuple insertion.
*
* This is the public interface routine for tuple insertion in GiSTs.
@@ -253,41 +276,52 @@ gistinsert(PG_FUNCTION_ARGS)
/*
- * Workhouse routine for doing insertion into a GiST index. Note that
- * this routine assumes it is invoked in a short-lived memory context,
- * so it does not bother releasing palloc'd allocations.
+ * Place tuples from 'itup' to 'buffer'. If 'oldoffnum' is valid, the tuple
+ * at that offset is atomically removed along with inserting the new tuples.
+ * This is used to replace a tuple with a new one.
+ *
+ * If 'leftchildbuf' is valid, we're inserting the downlink for the page
+ * to the right of 'leftchildbuf', or updating the downlink for 'leftchildbuf'.
+ * F_FOLLOW_RIGHT flag on 'leftchildbuf' is cleared and NSN is set.
+ *
+ * If there is not enough room on the page, it is split. All the split
+ * pages are kept pinned and locked and returned in *splitinfo; the caller
+ * is responsible for inserting the downlinks for them. However, if
+ * 'buffer' is the root page and it needs to be split, gistplacetopage()
+ * performs the split as one atomic operation, and *splitinfo is set to NIL.
+ * In that case, we continue to hold the root page locked, and the child
+ * pages are released; note that new tuple(s) are *not* on the root page
+ * but in one of the new child pages.
*/
-static void
-gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
+static bool
+gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
+ Buffer buffer,
+ IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
+ Buffer leftchildbuf,
+ List **splitinfo)
{
- GISTInsertState state;
-
- memset(&state, 0, sizeof(GISTInsertState));
-
- state.itup = (IndexTuple *) palloc(sizeof(IndexTuple));
- state.itup[0] = (IndexTuple) palloc(IndexTupleSize(itup));
- memcpy(state.itup[0], itup, IndexTupleSize(itup));
- state.ituplen = 1;
- state.freespace = freespace;
- state.r = r;
- state.key = itup->t_tid;
- state.needInsertComplete = true;
+ Page page = BufferGetPage(buffer);
+ bool is_leaf = (GistPageIsLeaf(page)) ? true : false;
+ XLogRecPtr recptr;
+ int i;
+ bool is_split;
- state.stack = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
- state.stack->blkno = GIST_ROOT_BLKNO;
+ /*
+ * Refuse to modify a page that's incompletely split. This should not
+ * happen because we finish any incomplete splits while we walk down the
+ * tree. However, it's remotely possible that another concurrent inserter
+ * splits a parent page, and errors out before completing the split. We
+ * will just throw an error in that case, and leave any split we had in
+ * progress unfinished too. The next insert that comes along will clean up
+ * the mess.
+ */
+ if (GistFollowRight(page))
+ elog(ERROR, "concurrent GiST page split was incomplete");
- gistfindleaf(&state, giststate);
- gistmakedeal(&state, giststate);
-}
-
-static bool
-gistplacetopage(GISTInsertState *state, GISTSTATE *giststate)
-{
- bool is_splitted = false;
- bool is_leaf = (GistPageIsLeaf(state->stack->page)) ? true : false;
+ *splitinfo = NIL;
/*
- * if (!is_leaf) remove old key: This node's key has been modified, either
+ * if isupdate, remove old key: This node's key has been modified, either
* because a child split occurred or because we needed to adjust our key
* for an insert in a child node. Therefore, remove the old version of
* this node's key.
@@ -295,77 +329,136 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate)
* for WAL replay, in the non-split case we handle this by setting up a
* one-element todelete array; in the split case, it's handled implicitly
* because the tuple vector passed to gistSplit won't include this tuple.
- *
- * XXX: If we want to change fillfactors between node and leaf, fillfactor
- * = (is_leaf ? state->leaf_fillfactor : state->node_fillfactor)
*/
- if (gistnospace(state->stack->page, state->itup, state->ituplen,
- is_leaf ? InvalidOffsetNumber : state->stack->childoffnum,
- state->freespace))
+ is_split = gistnospace(page, itup, ntup, oldoffnum, state->freespace);
+ if (is_split)
{
/* no space for insertion */
IndexTuple *itvec;
int tlen;
SplitedPageLayout *dist = NULL,
*ptr;
- BlockNumber rrlink = InvalidBlockNumber;
- GistNSN oldnsn;
+ BlockNumber oldrlink = InvalidBlockNumber;
+ GistNSN oldnsn = {0, 0};
+ SplitedPageLayout rootpg;
+ BlockNumber blkno = BufferGetBlockNumber(buffer);
+ bool is_rootsplit;
- is_splitted = true;
+ is_rootsplit = (blkno == GIST_ROOT_BLKNO);
/*
- * Form index tuples vector to split: remove old tuple if t's needed
- * and add new tuples to vector
+ * Form index tuples vector to split. If we're replacing an old tuple,
+ * remove the old version from the vector.
*/
- itvec = gistextractpage(state->stack->page, &tlen);
- if (!is_leaf)
+ itvec = gistextractpage(page, &tlen);
+ if (OffsetNumberIsValid(oldoffnum))
{
/* on inner page we should remove old tuple */
- int pos = state->stack->childoffnum - FirstOffsetNumber;
+ int pos = oldoffnum - FirstOffsetNumber;
tlen--;
if (pos != tlen)
memmove(itvec + pos, itvec + pos + 1, sizeof(IndexTuple) * (tlen - pos));
}
- itvec = gistjoinvector(itvec, &tlen, state->itup, state->ituplen);
- dist = gistSplit(state->r, state->stack->page, itvec, tlen, giststate);
+ itvec = gistjoinvector(itvec, &tlen, itup, ntup);
+ dist = gistSplit(state->r, page, itvec, tlen, giststate);
- state->itup = (IndexTuple *) palloc(sizeof(IndexTuple) * tlen);
- state->ituplen = 0;
-
- if (state->stack->blkno != GIST_ROOT_BLKNO)
+ /*
+ * Set up pages to work with. Allocate new buffers for all but the
+ * leftmost page. The original page becomes the new leftmost page, and
+ * is just replaced with the new contents.
+ *
+		 * For a root-split, allocate new buffers for all child pages; the
+		 * original page is overwritten with a new root page containing
+		 * downlinks to the new child pages.
+ */
+ ptr = dist;
+ if (!is_rootsplit)
{
- /*
- * if non-root split then we should not allocate new buffer, but
- * we must create temporary page to operate
- */
- dist->buffer = state->stack->buffer;
- dist->page = PageGetTempPageCopySpecial(BufferGetPage(dist->buffer));
+ /* save old rightlink and NSN */
+ oldrlink = GistPageGetOpaque(page)->rightlink;
+ oldnsn = GistPageGetOpaque(page)->nsn;
+
+ dist->buffer = buffer;
+ dist->block.blkno = BufferGetBlockNumber(buffer);
+ dist->page = PageGetTempPageCopySpecial(BufferGetPage(buffer));
/* clean all flags except F_LEAF */
GistPageGetOpaque(dist->page)->flags = (is_leaf) ? F_LEAF : 0;
+
+ ptr = ptr->next;
+ }
+ for (; ptr; ptr = ptr->next)
+ {
+ /* Allocate new page */
+ ptr->buffer = gistNewBuffer(state->r);
+ GISTInitBuffer(ptr->buffer, (is_leaf) ? F_LEAF : 0);
+ ptr->page = BufferGetPage(ptr->buffer);
+ ptr->block.blkno = BufferGetBlockNumber(ptr->buffer);
}
- /* make new pages and fills them */
+ /*
+		 * Now that we know which blocks the new pages go to, set up downlink
+ * tuples to point to them.
+ */
for (ptr = dist; ptr; ptr = ptr->next)
{
+ ItemPointerSetBlockNumber(&(ptr->itup->t_tid), ptr->block.blkno);
+ GistTupleSetValid(ptr->itup);
+ }
+
+ /*
+ * If this is a root split, we construct the new root page with the
+ * downlinks here directly, instead of requiring the caller to insert
+ * them. Add the new root page to the list along with the child pages.
+ */
+ if (is_rootsplit)
+ {
+ IndexTuple *downlinks;
+ int ndownlinks = 0;
int i;
- char *data;
- /* get new page */
- if (ptr->buffer == InvalidBuffer)
+ rootpg.buffer = buffer;
+ rootpg.page = PageGetTempPageCopySpecial(BufferGetPage(rootpg.buffer));
+ GistPageGetOpaque(rootpg.page)->flags = 0;
+
+ /* Prepare a vector of all the downlinks */
+ for (ptr = dist; ptr; ptr = ptr->next)
+ ndownlinks++;
+ downlinks = palloc(sizeof(IndexTuple) * ndownlinks);
+ for (i = 0, ptr = dist; ptr; ptr = ptr->next)
+ downlinks[i++] = ptr->itup;
+
+ rootpg.block.blkno = GIST_ROOT_BLKNO;
+ rootpg.block.num = ndownlinks;
+ rootpg.list = gistfillitupvec(downlinks, ndownlinks,
+ &(rootpg.lenlist));
+ rootpg.itup = NULL;
+
+ rootpg.next = dist;
+ dist = &rootpg;
+ }
+ else
+ {
+ /* Prepare split-info to be returned to caller */
+ for (ptr = dist; ptr; ptr = ptr->next)
{
- ptr->buffer = gistNewBuffer(state->r);
- GISTInitBuffer(ptr->buffer, (is_leaf) ? F_LEAF : 0);
- ptr->page = BufferGetPage(ptr->buffer);
+ GISTPageSplitInfo *si = palloc(sizeof(GISTPageSplitInfo));
+
+ si->buf = ptr->buffer;
+ si->downlink = ptr->itup;
+ *splitinfo = lappend(*splitinfo, si);
}
- ptr->block.blkno = BufferGetBlockNumber(ptr->buffer);
+ }
+
+ /*
+ * Fill all pages. All the pages are new, ie. freshly allocated empty
+ * pages, or a temporary copy of the old page.
+ */
+ for (ptr = dist; ptr; ptr = ptr->next)
+ {
+ char *data = (char *) (ptr->list);
- /*
- * fill page, we can do it because all these pages are new (ie not
- * linked in tree or masked by temp page
- */
- data = (char *) (ptr->list);
for (i = 0; i < ptr->block.num; i++)
{
if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber)
@@ -373,276 +466,388 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate)
data += IndexTupleSize((IndexTuple) data);
}
- /* set up ItemPointer and remember it for parent */
- ItemPointerSetBlockNumber(&(ptr->itup->t_tid), ptr->block.blkno);
- state->itup[state->ituplen] = ptr->itup;
- state->ituplen++;
- }
+ /* Set up rightlinks */
+ if (ptr->next && ptr->block.blkno != GIST_ROOT_BLKNO)
+ GistPageGetOpaque(ptr->page)->rightlink =
+ ptr->next->block.blkno;
+ else
+ GistPageGetOpaque(ptr->page)->rightlink = oldrlink;
- /* saves old rightlink */
- if (state->stack->blkno != GIST_ROOT_BLKNO)
- rrlink = GistPageGetOpaque(dist->page)->rightlink;
+ if (ptr->next && !is_rootsplit)
+ GistMarkFollowRight(ptr->page);
+ else
+ GistClearFollowRight(ptr->page);
+
+ /*
+ * Copy the NSN of the original page to all pages. The
+ * F_FOLLOW_RIGHT flags ensure that scans will follow the
+ * rightlinks until the downlinks are inserted.
+ */
+ GistPageGetOpaque(ptr->page)->nsn = oldnsn;
+ }
START_CRIT_SECTION();
/*
- * must mark buffers dirty before XLogInsert, even though we'll still
- * be changing their opaque fields below. set up right links.
+ * Must mark buffers dirty before XLogInsert, even though we'll still
+ * be changing their opaque fields below.
*/
for (ptr = dist; ptr; ptr = ptr->next)
- {
MarkBufferDirty(ptr->buffer);
- GistPageGetOpaque(ptr->page)->rightlink = (ptr->next) ?
- ptr->next->block.blkno : rrlink;
- }
-
- /* restore splitted non-root page */
- if (state->stack->blkno != GIST_ROOT_BLKNO)
- {
- PageRestoreTempPage(dist->page, BufferGetPage(dist->buffer));
- dist->page = BufferGetPage(dist->buffer);
- }
-
- if (!state->r->rd_istemp)
- {
- XLogRecPtr recptr;
- XLogRecData *rdata;
-
- rdata = formSplitRdata(state->r->rd_node, state->stack->blkno,
- is_leaf, &(state->key), dist);
+ if (BufferIsValid(leftchildbuf))
+ MarkBufferDirty(leftchildbuf);
- recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_SPLIT, rdata);
+ /*
+ * The first page in the chain was a temporary working copy meant to
+ * replace the old page. Copy it over the old page.
+ */
+ PageRestoreTempPage(dist->page, BufferGetPage(dist->buffer));
+ dist->page = BufferGetPage(dist->buffer);
- for (ptr = dist; ptr; ptr = ptr->next)
- {
- PageSetLSN(ptr->page, recptr);
- PageSetTLI(ptr->page, ThisTimeLineID);
- }
- }
+ /* Write the WAL record */
+ if (RelationNeedsWAL(state->r))
+ recptr = gistXLogSplit(state->r->rd_node, blkno, is_leaf,
+ dist, oldrlink, oldnsn, leftchildbuf);
else
- {
- for (ptr = dist; ptr; ptr = ptr->next)
- {
- PageSetLSN(ptr->page, XLogRecPtrForTemp);
- }
- }
-
- /* set up NSN */
- oldnsn = GistPageGetOpaque(dist->page)->nsn;
- if (state->stack->blkno == GIST_ROOT_BLKNO)
- /* if root split we should put initial value */
- oldnsn = PageGetLSN(dist->page);
+ recptr = GetXLogRecPtrForTemp();
for (ptr = dist; ptr; ptr = ptr->next)
{
- /* only for last set oldnsn */
- GistPageGetOpaque(ptr->page)->nsn = (ptr->next) ?
- PageGetLSN(ptr->page) : oldnsn;
+ PageSetLSN(ptr->page, recptr);
+ PageSetTLI(ptr->page, ThisTimeLineID);
}
/*
- * release buffers, if it was a root split then release all buffers
- * because we create all buffers
+ * Return the new child buffers to the caller.
+ *
+ * If this was a root split, we've already inserted the downlink
+ * pointers, in the form of a new root page. Therefore we can release
+ * all the new buffers, and keep just the root page locked.
*/
- ptr = (state->stack->blkno == GIST_ROOT_BLKNO) ? dist : dist->next;
- for (; ptr; ptr = ptr->next)
- UnlockReleaseBuffer(ptr->buffer);
-
- if (state->stack->blkno == GIST_ROOT_BLKNO)
+ if (is_rootsplit)
{
- gistnewroot(state->r, state->stack->buffer, state->itup, state->ituplen, &(state->key));
- state->needInsertComplete = false;
+ for (ptr = dist->next; ptr; ptr = ptr->next)
+ UnlockReleaseBuffer(ptr->buffer);
}
-
- END_CRIT_SECTION();
}
else
{
- /* enough space */
+ /*
+ * Enough space. We also get here if ntuples==0.
+ */
START_CRIT_SECTION();
- if (!is_leaf)
- PageIndexTupleDelete(state->stack->page, state->stack->childoffnum);
- gistfillbuffer(state->stack->page, state->itup, state->ituplen, InvalidOffsetNumber);
+ if (OffsetNumberIsValid(oldoffnum))
+ PageIndexTupleDelete(page, oldoffnum);
+ gistfillbuffer(page, itup, ntup, InvalidOffsetNumber);
- MarkBufferDirty(state->stack->buffer);
+ MarkBufferDirty(buffer);
- if (!state->r->rd_istemp)
+ if (BufferIsValid(leftchildbuf))
+ MarkBufferDirty(leftchildbuf);
+
+ if (RelationNeedsWAL(state->r))
{
- OffsetNumber noffs = 0,
- offs[1];
- XLogRecPtr recptr;
- XLogRecData *rdata;
+ OffsetNumber ndeloffs = 0,
+ deloffs[1];
- if (!is_leaf)
+ if (OffsetNumberIsValid(oldoffnum))
{
- /* only on inner page we should delete previous version */
- offs[0] = state->stack->childoffnum;
- noffs = 1;
+ deloffs[0] = oldoffnum;
+ ndeloffs = 1;
}
- rdata = formUpdateRdata(state->r->rd_node, state->stack->buffer,
- offs, noffs,
- state->itup, state->ituplen,
- &(state->key));
+ recptr = gistXLogUpdate(state->r->rd_node, buffer,
+ deloffs, ndeloffs, itup, ntup,
+ leftchildbuf);
- recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE, rdata);
- PageSetLSN(state->stack->page, recptr);
- PageSetTLI(state->stack->page, ThisTimeLineID);
+ PageSetLSN(page, recptr);
+ PageSetTLI(page, ThisTimeLineID);
}
else
- PageSetLSN(state->stack->page, XLogRecPtrForTemp);
-
- if (state->stack->blkno == GIST_ROOT_BLKNO)
- state->needInsertComplete = false;
+ {
+ recptr = GetXLogRecPtrForTemp();
+ PageSetLSN(page, recptr);
+ }
- END_CRIT_SECTION();
+ *splitinfo = NIL;
+ }
- if (state->ituplen > 1)
- { /* previous is_splitted==true */
+ /*
+ * If we inserted the downlink for a child page, set NSN and clear
+ * F_FOLLOW_RIGHT flag on the left child, so that concurrent scans know to
+ * follow the rightlink if and only if they looked at the parent page
+ * before we inserted the downlink.
+ *
+ * Note that we do this *after* writing the WAL record. That means that
+ * the possible full page image in the WAL record does not include these
+ * changes, and they must be replayed even if the page is restored from
+ * the full page image. There's a chicken-and-egg problem: if we updated
+ * the child pages first, we wouldn't know the recptr of the WAL record
+ * we're about to write.
+ */
+ if (BufferIsValid(leftchildbuf))
+ {
+ Page leftpg = BufferGetPage(leftchildbuf);
- /*
- * child was splited, so we must form union for insertion in
- * parent
- */
- IndexTuple newtup = gistunion(state->r, state->itup, state->ituplen, giststate);
+ GistPageGetOpaque(leftpg)->nsn = recptr;
+ GistClearFollowRight(leftpg);
- ItemPointerSetBlockNumber(&(newtup->t_tid), state->stack->blkno);
- state->itup[0] = newtup;
- state->ituplen = 1;
- }
- else if (is_leaf)
- {
- /*
- * itup[0] store key to adjust parent, we set it to valid to
- * correct check by GistTupleIsInvalid macro in gistgetadjusted()
- */
- ItemPointerSetBlockNumber(&(state->itup[0]->t_tid), state->stack->blkno);
- GistTupleSetValid(state->itup[0]);
- }
+ PageSetLSN(leftpg, recptr);
+ PageSetTLI(leftpg, ThisTimeLineID);
}
- return is_splitted;
+
+ END_CRIT_SECTION();
+
+ return is_split;
}
/*
- * returns stack of pages, all pages in stack are pinned, and
- * leaf is X-locked
+ * Workhorse routine for doing insertion into a GiST index. Note that
+ * this routine assumes it is invoked in a short-lived memory context,
+ * so it does not bother releasing palloc'd allocations.
*/
-
static void
-gistfindleaf(GISTInsertState *state, GISTSTATE *giststate)
+gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
{
ItemId iid;
IndexTuple idxtuple;
- GISTPageOpaque opaque;
+ GISTInsertStack firststack;
+ GISTInsertStack *stack;
+ GISTInsertState state;
+ bool xlocked = false;
+
+ memset(&state, 0, sizeof(GISTInsertState));
+ state.freespace = freespace;
+ state.r = r;
+
+ /* Start from the root */
+ firststack.blkno = GIST_ROOT_BLKNO;
+ firststack.lsn.xrecoff = 0;
+ firststack.parent = NULL;
+ state.stack = stack = &firststack;
/*
- * walk down, We don't lock page for a long time, but so we should be
- * ready to recheck path in a bad case... We remember, that page->lsn
- * should never be invalid.
+ * Walk down along the path of smallest penalty, updating the parent
+ * pointers with the key we're inserting as we go. If we crash in the
+ * middle, the tree is consistent, although the possible parent updates
+ * were a waste.
*/
for (;;)
{
- if (XLogRecPtrIsInvalid(state->stack->lsn))
- state->stack->buffer = ReadBuffer(state->r, state->stack->blkno);
- LockBuffer(state->stack->buffer, GIST_SHARE);
- gistcheckpage(state->r, state->stack->buffer);
+ if (XLogRecPtrIsInvalid(stack->lsn))
+ stack->buffer = ReadBuffer(state.r, stack->blkno);
+
+ /*
+ * Be optimistic and grab shared lock first. Swap it for an exclusive
+ * lock later if we need to update the page.
+ */
+ if (!xlocked)
+ {
+ LockBuffer(stack->buffer, GIST_SHARE);
+ gistcheckpage(state.r, stack->buffer);
+ }
- state->stack->page = (Page) BufferGetPage(state->stack->buffer);
- opaque = GistPageGetOpaque(state->stack->page);
+ stack->page = (Page) BufferGetPage(stack->buffer);
+ stack->lsn = PageGetLSN(stack->page);
+ Assert(!RelationNeedsWAL(state.r) || !XLogRecPtrIsInvalid(stack->lsn));
- state->stack->lsn = PageGetLSN(state->stack->page);
- Assert(state->r->rd_istemp || !XLogRecPtrIsInvalid(state->stack->lsn));
+ /*
+ * If this page was split but the downlink was never inserted to the
+ * parent because the inserting backend crashed before doing that, fix
+ * that now.
+ */
+ if (GistFollowRight(stack->page))
+ {
+ if (!xlocked)
+ {
+ LockBuffer(stack->buffer, GIST_UNLOCK);
+ LockBuffer(stack->buffer, GIST_EXCLUSIVE);
+ xlocked = true;
+ /* someone might've completed the split when we unlocked */
+ if (!GistFollowRight(stack->page))
+ continue;
+ }
+ gistfixsplit(&state, giststate);
- if (state->stack->blkno != GIST_ROOT_BLKNO &&
- XLByteLT(state->stack->parent->lsn, opaque->nsn))
+ UnlockReleaseBuffer(stack->buffer);
+ xlocked = false;
+ state.stack = stack = stack->parent;
+ continue;
+ }
+
+ if (stack->blkno != GIST_ROOT_BLKNO &&
+ XLByteLT(stack->parent->lsn,
+ GistPageGetOpaque(stack->page)->nsn))
{
/*
- * caused split non-root page is detected, go up to parent to
- * choose best child
+ * Concurrent split detected. There's no guarantee that the
+ * downlink for this page is consistent with the tuple we're
+ * inserting anymore, so go back to parent and rechoose the best
+ * child.
*/
- UnlockReleaseBuffer(state->stack->buffer);
- state->stack = state->stack->parent;
+ UnlockReleaseBuffer(stack->buffer);
+ xlocked = false;
+ state.stack = stack = stack->parent;
continue;
}
- if (!GistPageIsLeaf(state->stack->page))
+ if (!GistPageIsLeaf(stack->page))
{
/*
- * This is an internal page, so continue to walk down the tree. We
- * find the child node that has the minimum insertion penalty and
- * recursively invoke ourselves to modify that node. Once the
- * recursive call returns, we may need to adjust the parent node
- * for two reasons: the child node split, or the key in this node
- * needs to be adjusted for the newly inserted key below us.
+ * This is an internal page so continue to walk down the tree.
+ * Find the child node that has the minimum insertion penalty.
*/
- GISTInsertStack *item = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
-
- state->stack->childoffnum = gistchoose(state->r, state->stack->page, state->itup[0], giststate);
+ BlockNumber childblkno;
+ IndexTuple newtup;
+ GISTInsertStack *item;
- iid = PageGetItemId(state->stack->page, state->stack->childoffnum);
- idxtuple = (IndexTuple) PageGetItem(state->stack->page, iid);
- item->blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
- LockBuffer(state->stack->buffer, GIST_UNLOCK);
+ stack->childoffnum = gistchoose(state.r, stack->page, itup, giststate);
+ iid = PageGetItemId(stack->page, stack->childoffnum);
+ idxtuple = (IndexTuple) PageGetItem(stack->page, iid);
+ childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
- item->parent = state->stack;
- item->child = NULL;
- if (state->stack)
- state->stack->child = item;
- state->stack = item;
- }
- else
- {
- /* be carefull, during unlock/lock page may be changed... */
- LockBuffer(state->stack->buffer, GIST_UNLOCK);
- LockBuffer(state->stack->buffer, GIST_EXCLUSIVE);
- state->stack->page = (Page) BufferGetPage(state->stack->buffer);
- opaque = GistPageGetOpaque(state->stack->page);
+ /*
+ * Check that it's not a leftover invalid tuple from pre-9.1
+ */
+ if (GistTupleIsInvalid(idxtuple))
+ ereport(ERROR,
+ (errmsg("index \"%s\" contains an inner tuple marked as invalid",
+ RelationGetRelationName(r)),
+ errdetail("This is caused by an incomplete page split at crash recovery before upgrading to 9.1."),
+ errhint("Please REINDEX it.")));
- if (state->stack->blkno == GIST_ROOT_BLKNO)
+ /*
+ * Check that the key representing the target child node is
+ * consistent with the key we're inserting. Update it if it's not.
+ */
+ newtup = gistgetadjusted(state.r, idxtuple, itup, giststate);
+ if (newtup)
{
/*
- * the only page can become inner instead of leaf is a root
- * page, so for root we should recheck it
+ * Swap shared lock for an exclusive one. Beware, the page may
+ * change while we unlock/lock the page...
*/
- if (!GistPageIsLeaf(state->stack->page))
+ if (!xlocked)
{
- /*
- * very rarely situation: during unlock/lock index with
- * number of pages = 1 was increased
- */
- LockBuffer(state->stack->buffer, GIST_UNLOCK);
- continue;
+ LockBuffer(stack->buffer, GIST_UNLOCK);
+ LockBuffer(stack->buffer, GIST_EXCLUSIVE);
+ xlocked = true;
+ stack->page = (Page) BufferGetPage(stack->buffer);
+
+ if (!XLByteEQ(PageGetLSN(stack->page), stack->lsn))
+ {
+ /* the page was changed while we unlocked it, retry */
+ continue;
+ }
}
/*
- * we don't need to check root split, because checking
- * leaf/inner is enough to recognize split for root
+ * Update the tuple.
+ *
+ * We still hold the lock after gistinserttuples(), but it
+ * might have to split the page to make the updated tuple fit.
+ * In that case the updated tuple might migrate to the other
+ * half of the split, so we have to go back to the parent and
+ * descend back to the half that's a better fit for the new
+ * tuple.
*/
-
+ if (gistinserttuples(&state, stack, giststate, &newtup, 1,
+ stack->childoffnum, InvalidBuffer))
+ {
+ /*
+ * If this was a root split, the root page continues to be
+ * the parent and the updated tuple went to one of the
+ * child pages, so we just need to retry from the root
+ * page.
+ */
+ if (stack->blkno != GIST_ROOT_BLKNO)
+ {
+ UnlockReleaseBuffer(stack->buffer);
+ xlocked = false;
+ state.stack = stack = stack->parent;
+ }
+ continue;
+ }
}
- else if (XLByteLT(state->stack->parent->lsn, opaque->nsn))
+ LockBuffer(stack->buffer, GIST_UNLOCK);
+ xlocked = false;
+
+ /* descend to the chosen child */
+ item = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
+ item->blkno = childblkno;
+ item->parent = stack;
+ state.stack = stack = item;
+ }
+ else
+ {
+ /*
+ * Leaf page. Insert the new key. We've already updated all the
+ * parents on the way down, but we might have to split the page if
+			 * it doesn't fit. gistinserttuples() will take care of that.
+ */
+
+ /*
+ * Swap shared lock for an exclusive one. Be careful, the page may
+ * change while we unlock/lock the page...
+ */
+ if (!xlocked)
{
- /*
- * detecting split during unlock/lock, so we should find
- * better child on parent
- */
+ LockBuffer(stack->buffer, GIST_UNLOCK);
+ LockBuffer(stack->buffer, GIST_EXCLUSIVE);
+ xlocked = true;
+ stack->page = (Page) BufferGetPage(stack->buffer);
+ stack->lsn = PageGetLSN(stack->page);
- /* forget buffer */
- UnlockReleaseBuffer(state->stack->buffer);
+ if (stack->blkno == GIST_ROOT_BLKNO)
+ {
+ /*
+ * the only page that can become inner instead of leaf is
+ * the root page, so for root we should recheck it
+ */
+ if (!GistPageIsLeaf(stack->page))
+ {
+ /*
+ * very rare situation: during unlock/lock index with
+ * number of pages = 1 was increased
+ */
+ LockBuffer(stack->buffer, GIST_UNLOCK);
+ xlocked = false;
+ continue;
+ }
- state->stack = state->stack->parent;
- continue;
+ /*
+ * we don't need to check root split, because checking
+ * leaf/inner is enough to recognize split for root
+ */
+ }
+ else if (GistFollowRight(stack->page) ||
+ XLByteLT(stack->parent->lsn,
+ GistPageGetOpaque(stack->page)->nsn))
+ {
+ /*
+ * The page was split while we momentarily unlocked the
+ * page. Go back to parent.
+ */
+ UnlockReleaseBuffer(stack->buffer);
+ xlocked = false;
+ state.stack = stack = stack->parent;
+ continue;
+ }
}
- state->stack->lsn = PageGetLSN(state->stack->page);
+ /* now state.stack->(page, buffer and blkno) points to leaf page */
+
+ gistinserttuples(&state, stack, giststate, &itup, 1,
+ InvalidOffsetNumber, InvalidBuffer);
+ LockBuffer(stack->buffer, GIST_UNLOCK);
- /* ok we found a leaf page and it X-locked */
+ /* Release any pins we might still hold before exiting */
+ for (; stack; stack = stack->parent)
+ ReleaseBuffer(stack->buffer);
break;
}
}
-
- /* now state->stack->(page, buffer and blkno) points to leaf page */
}
/*
@@ -650,7 +855,7 @@ gistfindleaf(GISTInsertState *state, GISTSTATE *giststate)
*
* returns from the beginning of closest parent;
*
- * To prevent deadlocks, this should lock only one page simultaneously.
+ * To prevent deadlocks, this should lock only one page at a time.
*/
GISTInsertStack *
gistFindPath(Relation r, BlockNumber child)
@@ -685,6 +890,13 @@ gistFindPath(Relation r, BlockNumber child)
top->lsn = PageGetLSN(page);
+ /*
+ * If F_FOLLOW_RIGHT is set, the page to the right doesn't have a
+ * downlink. This should not normally happen..
+ */
+ if (GistFollowRight(page))
+ elog(ERROR, "concurrent GiST page split was incomplete");
+
if (top->parent && XLByteLT(top->parent->lsn, GistPageGetOpaque(page)->nsn) &&
GistPageGetOpaque(page)->rightlink != InvalidBlockNumber /* sanity check */ )
{
@@ -713,8 +925,6 @@ gistFindPath(Relation r, BlockNumber child)
ptr = top;
while (ptr->parent)
{
- /* set child link */
- ptr->parent->child = ptr;
/* move childoffnum.. */
if (ptr == top)
{
@@ -756,17 +966,16 @@ gistFindPath(Relation r, BlockNumber child)
return NULL;
}
-
/*
- * Returns X-locked parent of stack page
+ * Updates the stack so that child->parent is the correct parent of the
+ * child. child->parent must be exclusively locked on entry, and will
+ * remain so at exit, but it might not be the same page anymore.
*/
-
static void
gistFindCorrectParent(Relation r, GISTInsertStack *child)
{
GISTInsertStack *parent = child->parent;
- LockBuffer(parent->buffer, GIST_EXCLUSIVE);
gistcheckpage(r, parent->buffer);
parent->page = (Page) BufferGetPage(parent->buffer);
@@ -838,83 +1047,232 @@ gistFindCorrectParent(Relation r, GISTInsertStack *child)
/* install new chain of parents to stack */
child->parent = parent;
- parent->child = child;
/* make recursive call to normal processing */
+ LockBuffer(child->parent->buffer, GIST_EXCLUSIVE);
gistFindCorrectParent(r, child);
}
return;
}
-void
-gistmakedeal(GISTInsertState *state, GISTSTATE *giststate)
+/*
+ * Form a downlink pointer for the page in 'buf'.
+ */
+static IndexTuple
+gistformdownlink(Relation rel, Buffer buf, GISTSTATE *giststate,
+ GISTInsertStack *stack)
{
- int is_splitted;
- ItemId iid;
- IndexTuple oldtup,
- newtup;
+ Page page = BufferGetPage(buf);
+ OffsetNumber maxoff;
+ OffsetNumber offset;
+ IndexTuple downlink = NULL;
- /* walk up */
- while (true)
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (offset = FirstOffsetNumber; offset <= maxoff; offset = OffsetNumberNext(offset))
{
- /*
- * After this call: 1. if child page was splited, then itup contains
- * keys for each page 2. if child page wasn't splited, then itup
- * contains additional for adjustment of current key
- */
+ IndexTuple ituple = (IndexTuple)
+ PageGetItem(page, PageGetItemId(page, offset));
- if (state->stack->parent)
+ if (downlink == NULL)
+ downlink = CopyIndexTuple(ituple);
+ else
{
- /*
- * X-lock parent page before proceed child, gistFindCorrectParent
- * should find and lock it
- */
- gistFindCorrectParent(state->r, state->stack);
+ IndexTuple newdownlink;
+
+ newdownlink = gistgetadjusted(rel, downlink, ituple,
+ giststate);
+ if (newdownlink)
+ downlink = newdownlink;
}
- is_splitted = gistplacetopage(state, giststate);
+ }
+
+ /*
+ * If the page is completely empty, we can't form a meaningful downlink
+ * for it. But we have to insert a downlink for the page. Any key will do,
+	 * as long as it's consistent with the parent page's downlink, so that we
+ * can legally insert it to the parent. A minimal one that matches as few
+ * scans as possible would be best, to keep scans from doing useless work,
+ * but we don't know how to construct that. So we just use the downlink of
+	 * the original page that was split; that's as far from optimal as it can
+	 * get, but it will do.
+ */
+ if (!downlink)
+ {
+ ItemId iid;
- /* parent locked above, so release child buffer */
- UnlockReleaseBuffer(state->stack->buffer);
+ LockBuffer(stack->parent->buffer, GIST_EXCLUSIVE);
+ gistFindCorrectParent(rel, stack);
+ iid = PageGetItemId(stack->parent->page, stack->parent->childoffnum);
+ downlink = (IndexTuple) PageGetItem(stack->parent->page, iid);
+ downlink = CopyIndexTuple(downlink);
+ LockBuffer(stack->parent->buffer, GIST_UNLOCK);
+ }
- /* pop parent page from stack */
- state->stack = state->stack->parent;
+ ItemPointerSetBlockNumber(&(downlink->t_tid), BufferGetBlockNumber(buf));
+ GistTupleSetValid(downlink);
- /* stack is void */
- if (!state->stack)
- break;
+ return downlink;
+}
- /*
- * child did not split, so we can check is it needed to update parent
- * tuple
- */
- if (!is_splitted)
- {
- /* parent's tuple */
- iid = PageGetItemId(state->stack->page, state->stack->childoffnum);
- oldtup = (IndexTuple) PageGetItem(state->stack->page, iid);
- newtup = gistgetadjusted(state->r, oldtup, state->itup[0], giststate);
-
- if (!newtup)
- { /* not need to update key */
- LockBuffer(state->stack->buffer, GIST_UNLOCK);
- break;
- }
- state->itup[0] = newtup;
+/*
+ * Complete the incomplete split of state->stack->page.
+ */
+static void
+gistfixsplit(GISTInsertState *state, GISTSTATE *giststate)
+{
+ GISTInsertStack *stack = state->stack;
+ Buffer buf;
+ Page page;
+ List *splitinfo = NIL;
+
+ elog(LOG, "fixing incomplete split in index \"%s\", block %u",
+ RelationGetRelationName(state->r), stack->blkno);
+
+ Assert(GistFollowRight(stack->page));
+ Assert(OffsetNumberIsValid(stack->parent->childoffnum));
+
+ buf = stack->buffer;
+
+ /*
+ * Read the chain of split pages, following the rightlinks. Construct a
+ * downlink tuple for each page.
+ */
+ for (;;)
+ {
+ GISTPageSplitInfo *si = palloc(sizeof(GISTPageSplitInfo));
+ IndexTuple downlink;
+
+ page = BufferGetPage(buf);
+
+ /* Form the new downlink tuples to insert to parent */
+ downlink = gistformdownlink(state->r, buf, giststate, stack);
+
+ si->buf = buf;
+ si->downlink = downlink;
+
+ splitinfo = lappend(splitinfo, si);
+
+ if (GistFollowRight(page))
+ {
+ /* lock next page */
+ buf = ReadBuffer(state->r, GistPageGetOpaque(page)->rightlink);
+ LockBuffer(buf, GIST_EXCLUSIVE);
}
- } /* while */
+ else
+ break;
+ }
+
+ /* Insert the downlinks */
+ gistfinishsplit(state, stack, giststate, splitinfo);
+}
+
+/*
+ * Insert tuples to stack->buffer. If 'oldoffnum' is valid, the new tuples
+ * replace an old tuple at oldoffnum. The caller must hold an exclusive lock
+ * on the page.
+ *
+ * If leftchild is valid, we're inserting/updating the downlink for the
+ * page to the right of leftchild. We clear the F_FOLLOW_RIGHT flag and
+ * update NSN on leftchild, atomically with the insertion of the downlink.
+ *
+ * Returns 'true' if the page had to be split. On return, we will continue
+ * to hold an exclusive lock on state->stack->buffer, but if we had to split
+ * the page, it might not contain the tuple we just inserted/updated.
+ */
+static bool
+gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
+ GISTSTATE *giststate,
+ IndexTuple *tuples, int ntup, OffsetNumber oldoffnum,
+ Buffer leftchild)
+{
+ List *splitinfo;
+ bool is_split;
+
+ is_split = gistplacetopage(state, giststate, stack->buffer,
+ tuples, ntup, oldoffnum,
+ leftchild,
+ &splitinfo);
+ if (splitinfo)
+ gistfinishsplit(state, stack, giststate, splitinfo);
+
+ return is_split;
+}
+
+/*
+ * Finish an incomplete split by inserting/updating the downlinks in
+ * parent page. 'splitinfo' contains all the child pages, exclusively-locked,
+ * involved in the split, from left-to-right.
+ */
+static void
+gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack,
+ GISTSTATE *giststate, List *splitinfo)
+{
+ ListCell *lc;
+ List *reversed;
+ GISTPageSplitInfo *right;
+ GISTPageSplitInfo *left;
+ IndexTuple tuples[2];
+
+ /* A split always contains at least two halves */
+ Assert(list_length(splitinfo) >= 2);
+
+ /*
+ * We need to insert downlinks for each new page, and update the downlink
+ * for the original (leftmost) page in the split. Begin at the rightmost
+	 * page, inserting one downlink at a time until there are only two pages
+ * left. Finally insert the downlink for the last new page and update the
+ * downlink for the original page as one operation.
+ */
+
+ /* for convenience, create a copy of the list in reverse order */
+ reversed = NIL;
+ foreach(lc, splitinfo)
+ {
+ reversed = lcons(lfirst(lc), reversed);
+ }
- /* release all parent buffers */
- while (state->stack)
+ LockBuffer(stack->parent->buffer, GIST_EXCLUSIVE);
+ gistFindCorrectParent(state->r, stack);
+
+ while (list_length(reversed) > 2)
{
- ReleaseBuffer(state->stack->buffer);
- state->stack = state->stack->parent;
+ right = (GISTPageSplitInfo *) linitial(reversed);
+ left = (GISTPageSplitInfo *) lsecond(reversed);
+
+ if (gistinserttuples(state, stack->parent, giststate,
+ &right->downlink, 1,
+ InvalidOffsetNumber,
+ left->buf))
+ {
+ /*
+ * If the parent page was split, need to relocate the original
+ * parent pointer.
+ */
+ gistFindCorrectParent(state->r, stack);
+ }
+ UnlockReleaseBuffer(right->buf);
+ reversed = list_delete_first(reversed);
}
- /* say to xlog that insert is completed */
- if (state->needInsertComplete && !state->r->rd_istemp)
- gistxlogInsertCompletion(state->r->rd_node, &(state->key), 1);
+ right = (GISTPageSplitInfo *) linitial(reversed);
+ left = (GISTPageSplitInfo *) lsecond(reversed);
+
+ /*
+ * Finally insert downlink for the remaining right page and update the
+ * downlink for the original page to not contain the tuples that were
+ * moved to the new pages.
+ */
+ tuples[0] = left->downlink;
+ tuples[1] = right->downlink;
+ gistinserttuples(state, stack->parent, giststate,
+ tuples, 2,
+ stack->parent->childoffnum,
+ left->buf);
+ LockBuffer(stack->parent->buffer, GIST_UNLOCK);
+ UnlockReleaseBuffer(right->buf);
+ Assert(left->buf == stack->buffer);
}
/*
@@ -965,8 +1323,7 @@ gistSplit(Relation r,
ROTATEDIST(res);
res->block.num = v.splitVector.spl_nright;
res->list = gistfillitupvec(rvectup, v.splitVector.spl_nright, &(res->lenlist));
- res->itup = (v.spl_rightvalid) ? gistFormTuple(giststate, r, v.spl_rattr, v.spl_risnull, false)
- : gist_form_invalid_tuple(GIST_ROOT_BLKNO);
+ res->itup = gistFormTuple(giststate, r, v.spl_rattr, v.spl_risnull, false);
}
if (!gistfitpage(lvectup, v.splitVector.spl_nleft))
@@ -988,51 +1345,16 @@ gistSplit(Relation r,
ROTATEDIST(res);
res->block.num = v.splitVector.spl_nleft;
res->list = gistfillitupvec(lvectup, v.splitVector.spl_nleft, &(res->lenlist));
- res->itup = (v.spl_leftvalid) ? gistFormTuple(giststate, r, v.spl_lattr, v.spl_lisnull, false)
- : gist_form_invalid_tuple(GIST_ROOT_BLKNO);
+ res->itup = gistFormTuple(giststate, r, v.spl_lattr, v.spl_lisnull, false);
}
return res;
}
/*
- * buffer must be pinned and locked by caller
+ * Fill a GISTSTATE with information about the index
*/
void
-gistnewroot(Relation r, Buffer buffer, IndexTuple *itup, int len, ItemPointer key)
-{
- Page page;
-
- Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
- page = BufferGetPage(buffer);
-
- START_CRIT_SECTION();
-
- GISTInitBuffer(buffer, 0);
- gistfillbuffer(page, itup, len, FirstOffsetNumber);
-
- MarkBufferDirty(buffer);
-
- if (!r->rd_istemp)
- {
- XLogRecPtr recptr;
- XLogRecData *rdata;
-
- rdata = formUpdateRdata(r->rd_node, buffer,
- NULL, 0,
- itup, len, key);
-
- recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_NEW_ROOT, rdata);
- PageSetLSN(page, recptr);
- PageSetTLI(page, ThisTimeLineID);
- }
- else
- PageSetLSN(page, XLogRecPtrForTemp);
-
- END_CRIT_SECTION();
-}
-
-void
initGISTstate(GISTSTATE *giststate, Relation index)
{
int i;
@@ -1066,6 +1388,29 @@ initGISTstate(GISTSTATE *giststate, Relation index)
fmgr_info_copy(&(giststate->equalFn[i]),
index_getprocinfo(index, i + 1, GIST_EQUAL_PROC),
CurrentMemoryContext);
+ /* opclasses are not required to provide a Distance method */
+ if (OidIsValid(index_getprocid(index, i + 1, GIST_DISTANCE_PROC)))
+ fmgr_info_copy(&(giststate->distanceFn[i]),
+ index_getprocinfo(index, i + 1, GIST_DISTANCE_PROC),
+ CurrentMemoryContext);
+ else
+ giststate->distanceFn[i].fn_oid = InvalidOid;
+
+ /*
+ * If the index column has a specified collation, we should honor that
+ * while doing comparisons. However, we may have a collatable storage
+ * type for a noncollatable indexed data type. If there's no index
+ * collation then specify default collation in case the support
+ * functions need collation. This is harmless if the support
+ * functions don't care about collation, so we just do it
+ * unconditionally. (We could alternatively call get_typcollation,
+ * but that seems like expensive overkill --- there aren't going to be
+ * any cases where a GiST storage type has a nondefault collation.)
+ */
+ if (OidIsValid(index->rd_indcollation[i]))
+ giststate->supportCollation[i] = index->rd_indcollation[i];
+ else
+ giststate->supportCollation[i] = DEFAULT_COLLATION_OID;
}
}
diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c
index 216910307a..1aba686844 100644
--- a/src/backend/access/gist/gistget.c
+++ b/src/backend/access/gist/gistget.c
@@ -4,11 +4,11 @@
* fetch tuples from a GiST scan.
*
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/gist/gistget.c,v 1.85 2010/02/26 02:00:33 momjian Exp $
+ * src/backend/access/gist/gistget.c
*
*-------------------------------------------------------------------------
*/
@@ -20,504 +20,568 @@
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
+#include "utils/builtins.h"
#include "utils/memutils.h"
-static OffsetNumber gistfindnext(IndexScanDesc scan, OffsetNumber n);
-static int64 gistnext(IndexScanDesc scan, TIDBitmap *tbm);
-static bool gistindex_keytest(IndexTuple tuple, IndexScanDesc scan,
- OffsetNumber offset);
-
-static void
-killtuple(Relation r, GISTScanOpaque so, ItemPointer iptr)
+/*
+ * gistindex_keytest() -- does this index tuple satisfy the scan key(s)?
+ *
+ * The index tuple might represent either a heap tuple or a lower index page,
+ * depending on whether the containing page is a leaf page or not.
+ *
+ * On success return for a heap tuple, *recheck_p is set to indicate
+ * whether recheck is needed. We recheck if any of the consistent() functions
+ * request it. recheck is not interesting when examining a non-leaf entry,
+ * since we must visit the lower index page if there's any doubt.
+ *
+ * If we are doing an ordered scan, so->distances[] is filled with distance
+ * data from the distance() functions before returning success.
+ *
+ * We must decompress the key in the IndexTuple before passing it to the
+ * sk_funcs (which actually are the opclass Consistent or Distance methods).
+ *
+ * Note that this function is always invoked in a short-lived memory context,
+ * so we don't need to worry about cleaning up allocated memory, either here
+ * or in the implementation of any Consistent or Distance methods.
+ */
+static bool
+gistindex_keytest(IndexScanDesc scan,
+ IndexTuple tuple,
+ Page page,
+ OffsetNumber offset,
+ bool *recheck_p)
{
- Page p;
- OffsetNumber offset;
+ GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
+ GISTSTATE *giststate = so->giststate;
+ ScanKey key = scan->keyData;
+ int keySize = scan->numberOfKeys;
+ double *distance_p;
+ Relation r = scan->indexRelation;
- LockBuffer(so->curbuf, GIST_SHARE);
- gistcheckpage(r, so->curbuf);
- p = (Page) BufferGetPage(so->curbuf);
+ *recheck_p = false;
- if (XLByteEQ(so->stack->lsn, PageGetLSN(p)))
+ /*
+ * If it's a leftover invalid tuple from pre-9.1, treat it as a match with
+ * minimum possible distances. This means we'll always follow it to the
+ * referenced page.
+ */
+ if (GistTupleIsInvalid(tuple))
{
- /* page unchanged, so all is simple */
- offset = ItemPointerGetOffsetNumber(iptr);
- ItemIdMarkDead(PageGetItemId(p, offset));
- SetBufferCommitInfoNeedsSave(so->curbuf);
+ int i;
+
+ if (GistPageIsLeaf(page)) /* shouldn't happen */
+ elog(ERROR, "invalid GiST tuple found on leaf page");
+ for (i = 0; i < scan->numberOfOrderBys; i++)
+ so->distances[i] = -get_float8_infinity();
+ return true;
}
- else
+
+ /* Check whether it matches according to the Consistent functions */
+ while (keySize > 0)
{
- OffsetNumber maxoff = PageGetMaxOffsetNumber(p);
+ Datum datum;
+ bool isNull;
- for (offset = FirstOffsetNumber; offset <= maxoff; offset = OffsetNumberNext(offset))
- {
- IndexTuple ituple = (IndexTuple) PageGetItem(p, PageGetItemId(p, offset));
+ datum = index_getattr(tuple,
+ key->sk_attno,
+ giststate->tupdesc,
+ &isNull);
- if (ItemPointerEquals(&(ituple->t_tid), iptr))
+ if (key->sk_flags & SK_ISNULL)
+ {
+ /*
+			 * On a non-leaf page we can't conclude that the child has no NULL
+			 * values, because of the GiST assumption that union(VAL, NULL) is
+			 * VAL. But if the key on a non-leaf page IS NULL, then all of its
+			 * children are NULL.
+ */
+ if (key->sk_flags & SK_SEARCHNULL)
{
- /* found */
- ItemIdMarkDead(PageGetItemId(p, offset));
- SetBufferCommitInfoNeedsSave(so->curbuf);
- break;
+ if (GistPageIsLeaf(page) && !isNull)
+ return false;
+ }
+ else
+ {
+ Assert(key->sk_flags & SK_SEARCHNOTNULL);
+ if (isNull)
+ return false;
}
}
- }
+ else if (isNull)
+ {
+ return false;
+ }
+ else
+ {
+ Datum test;
+ bool recheck;
+ GISTENTRY de;
- LockBuffer(so->curbuf, GIST_UNLOCK);
-}
+ gistdentryinit(giststate, key->sk_attno - 1, &de,
+ datum, r, page, offset,
+ FALSE, isNull);
-/*
- * gistgettuple() -- Get the next tuple in the scan
- */
-Datum
-gistgettuple(PG_FUNCTION_ARGS)
-{
- IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
- ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
- GISTScanOpaque so;
- bool res;
+ /*
+ * Call the Consistent function to evaluate the test. The
+ * arguments are the index datum (as a GISTENTRY*), the comparison
+ * datum, the comparison operator's strategy number and subtype
+ * from pg_amop, and the recheck flag.
+ *
+ * (Presently there's no need to pass the subtype since it'll
+ * always be zero, but might as well pass it for possible future
+ * use.)
+ *
+ * We initialize the recheck flag to true (the safest assumption)
+ * in case the Consistent function forgets to set it.
+ */
+ recheck = true;
- so = (GISTScanOpaque) scan->opaque;
+ test = FunctionCall5Coll(&key->sk_func,
+ key->sk_collation,
+ PointerGetDatum(&de),
+ key->sk_argument,
+ Int32GetDatum(key->sk_strategy),
+ ObjectIdGetDatum(key->sk_subtype),
+ PointerGetDatum(&recheck));
- if (dir != ForwardScanDirection)
- elog(ERROR, "GiST doesn't support other scan directions than forward");
+ if (!DatumGetBool(test))
+ return false;
+ *recheck_p |= recheck;
+ }
- /*
- * If we have produced an index tuple in the past and the executor has
- * informed us we need to mark it as "killed", do so now.
- */
- if (scan->kill_prior_tuple && ItemPointerIsValid(&(so->curpos)))
- killtuple(scan->indexRelation, so, &(so->curpos));
+ key++;
+ keySize--;
+ }
- /*
- * Get the next tuple that matches the search key.
- */
- res = (gistnext(scan, NULL) > 0);
+ /* OK, it passes --- now let's compute the distances */
+ key = scan->orderByData;
+ distance_p = so->distances;
+ keySize = scan->numberOfOrderBys;
+ while (keySize > 0)
+ {
+ Datum datum;
+ bool isNull;
- PG_RETURN_BOOL(res);
-}
+ datum = index_getattr(tuple,
+ key->sk_attno,
+ giststate->tupdesc,
+ &isNull);
-Datum
-gistgetbitmap(PG_FUNCTION_ARGS)
-{
- IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
- TIDBitmap *tbm = (TIDBitmap *) PG_GETARG_POINTER(1);
- int64 ntids;
+ if ((key->sk_flags & SK_ISNULL) || isNull)
+ {
+ /* Assume distance computes as null and sorts to the end */
+ *distance_p = get_float8_infinity();
+ }
+ else
+ {
+ Datum dist;
+ GISTENTRY de;
- ntids = gistnext(scan, tbm);
+ gistdentryinit(giststate, key->sk_attno - 1, &de,
+ datum, r, page, offset,
+ FALSE, isNull);
- PG_RETURN_INT64(ntids);
+ /*
+ * Call the Distance function to evaluate the distance. The
+ * arguments are the index datum (as a GISTENTRY*), the comparison
+ * datum, and the ordering operator's strategy number and subtype
+ * from pg_amop.
+ *
+ * (Presently there's no need to pass the subtype since it'll
+ * always be zero, but might as well pass it for possible future
+ * use.)
+ *
+ * Note that Distance functions don't get a recheck argument. We
+ * can't tolerate lossy distance calculations on leaf tuples;
+ * there is no opportunity to re-sort the tuples afterwards.
+ */
+ dist = FunctionCall4Coll(&key->sk_func,
+ key->sk_collation,
+ PointerGetDatum(&de),
+ key->sk_argument,
+ Int32GetDatum(key->sk_strategy),
+ ObjectIdGetDatum(key->sk_subtype));
+
+ *distance_p = DatumGetFloat8(dist);
+ }
+
+ key++;
+ distance_p++;
+ keySize--;
+ }
+
+ return true;
}
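For reference, a minimal sketch of an opclass Consistent function as seen from the caller's side above -- the five-argument convention (decompressed GISTENTRY, query datum, strategy number, subtype, recheck pointer) invoked through FunctionCall5Coll. The function name and the plain box-overlap test are illustrative assumptions only, not part of this patch:

#include "postgres.h"
#include "fmgr.h"
#include "access/gist.h"
#include "access/skey.h"
#include "utils/geo_decls.h"

PG_FUNCTION_INFO_V1(sketch_box_consistent);

Datum
sketch_box_consistent(PG_FUNCTION_ARGS)
{
	GISTENTRY  *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
	BOX		   *query = PG_GETARG_BOX_P(1);
	StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
	/* Oid subtype = PG_GETARG_OID(3);  currently always zero */
	bool	   *recheck = (bool *) PG_GETARG_POINTER(4);
	BOX		   *key = DatumGetBoxP(entry->key);
	bool		retval;

	/* This sketch's test is exact; a lossy opclass would set this to true */
	*recheck = false;

	/* For this sketch, treat every strategy as simple overlap */
	(void) strategy;
	retval = (key->low.x <= query->high.x && key->high.x >= query->low.x &&
			  key->low.y <= query->high.y && key->high.y >= query->low.y);

	PG_RETURN_BOOL(retval);
}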
/*
- * Fetch tuple(s) that match the search key; this can be invoked
- * either to fetch the first such tuple or subsequent matching tuples.
+ * Scan all items on the GiST index page identified by *pageItem, and insert
+ * them into the queue (or directly to output areas)
+ *
+ * scan: index scan we are executing
+ * pageItem: search queue item identifying an index page to scan
+ * myDistances: distances array associated with pageItem, or NULL at the root
+ * tbm: if not NULL, gistgetbitmap's output bitmap
+ * ntids: if not NULL, gistgetbitmap's output tuple counter
*
- * This function is used by both gistgettuple and gistgetbitmap. When
- * invoked from gistgettuple, tbm is null and the next matching tuple
- * is returned in scan->xs_ctup.t_self. When invoked from getbitmap,
- * tbm is non-null and all matching tuples are added to tbm before
- * returning. In both cases, the function result is the number of
- * returned tuples.
+ * If tbm/ntids aren't NULL, we are doing an amgetbitmap scan, and heap
+ * tuples should be reported directly into the bitmap. If they are NULL,
+ * we're doing a plain or ordered indexscan. For a plain indexscan, heap
+ * tuple TIDs are returned into so->pageData[]. For an ordered indexscan,
+ * heap tuple TIDs are pushed into individual search queue items.
*
- * If scan specifies to skip killed tuples, continue looping until we find a
- * non-killed tuple that matches the search key.
+ * If we detect that the index page has split since we saw its downlink
+ * in the parent, we push its new right sibling onto the queue so the
+ * sibling will be processed next.
*/
-static int64
-gistnext(IndexScanDesc scan, TIDBitmap *tbm)
+static void
+gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances,
+ TIDBitmap *tbm, int64 *ntids)
{
- Page p;
- OffsetNumber n;
- GISTScanOpaque so;
- GISTSearchStack *stk;
- IndexTuple it;
+ GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
+ Buffer buffer;
+ Page page;
GISTPageOpaque opaque;
- int64 ntids = 0;
+ OffsetNumber maxoff;
+ OffsetNumber i;
+ GISTSearchTreeItem *tmpItem = so->tmpTreeItem;
+ bool isNew;
+ MemoryContext oldcxt;
- so = (GISTScanOpaque) scan->opaque;
+ Assert(!GISTSearchItemIsHeap(*pageItem));
- if (so->qual_ok == false)
- return 0;
+ buffer = ReadBuffer(scan->indexRelation, pageItem->blkno);
+ LockBuffer(buffer, GIST_SHARE);
+ gistcheckpage(scan->indexRelation, buffer);
+ page = BufferGetPage(buffer);
+ opaque = GistPageGetOpaque(page);
- if (so->curbuf == InvalidBuffer)
+ /*
+ * Check if we need to follow the rightlink. We need to follow it if the
+ * page was concurrently split since we visited the parent (in which case
+ * parentlsn < nsn), or if the system crashed after a page split but
+ * before the downlink was inserted into the parent.
+ */
+ if (!XLogRecPtrIsInvalid(pageItem->data.parentlsn) &&
+ (GistFollowRight(page) ||
+ XLByteLT(pageItem->data.parentlsn, opaque->nsn)) &&
+ opaque->rightlink != InvalidBlockNumber /* sanity check */ )
{
- if (ItemPointerIsValid(&so->curpos) == false)
- {
- /* Being asked to fetch the first entry, so start at the root */
- Assert(so->curbuf == InvalidBuffer);
- Assert(so->stack == NULL);
+ /* There was a page split, follow right link to add pages */
+ GISTSearchItem *item;
- so->curbuf = ReadBuffer(scan->indexRelation, GIST_ROOT_BLKNO);
+ /* This can't happen when starting at the root */
+ Assert(myDistances != NULL);
- stk = so->stack = (GISTSearchStack *) palloc0(sizeof(GISTSearchStack));
+ oldcxt = MemoryContextSwitchTo(so->queueCxt);
- stk->next = NULL;
- stk->block = GIST_ROOT_BLKNO;
+ /* Create new GISTSearchItem for the right sibling index page */
+ item = palloc(sizeof(GISTSearchItem));
+ item->next = NULL;
+ item->blkno = opaque->rightlink;
+ item->data.parentlsn = pageItem->data.parentlsn;
- pgstat_count_index_scan(scan->indexRelation);
- }
- else
- {
- /* scan is finished */
- return 0;
- }
+ /* Insert it into the queue using same distances as for this page */
+ tmpItem->head = item;
+ tmpItem->lastHeap = NULL;
+ memcpy(tmpItem->distances, myDistances,
+ sizeof(double) * scan->numberOfOrderBys);
+
+ (void) rb_insert(so->queue, (RBNode *) tmpItem, &isNew);
+
+ MemoryContextSwitchTo(oldcxt);
}
+ so->nPageData = so->curPageData = 0;
+
/*
- * check stored pointers from last visit
+ * check all tuples on page
*/
- if (so->nPageData > 0)
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
{
+ IndexTuple it = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
+ bool match;
+ bool recheck;
+
/*
- * gistgetmulti never should go here
+ * Must call gistindex_keytest in tempCxt, and clean up any leftover
+ * junk afterward.
*/
- Assert(tbm == NULL);
+ oldcxt = MemoryContextSwitchTo(so->tempCxt);
- if (so->curPageData < so->nPageData)
- {
- scan->xs_ctup.t_self = so->pageData[so->curPageData].heapPtr;
- scan->xs_recheck = so->pageData[so->curPageData].recheck;
+ match = gistindex_keytest(scan, it, page, i, &recheck);
- ItemPointerSet(&so->curpos,
- BufferGetBlockNumber(so->curbuf),
- so->pageData[so->curPageData].pageOffset);
+ MemoryContextSwitchTo(oldcxt);
+ MemoryContextReset(so->tempCxt);
- so->curPageData++;
+ /* Ignore tuple if it doesn't match */
+ if (!match)
+ continue;
- return 1;
+ if (tbm && GistPageIsLeaf(page))
+ {
+ /*
+ * getbitmap scan, so just push heap tuple TIDs into the bitmap
+ * without worrying about ordering
+ */
+ tbm_add_tuples(tbm, &it->t_tid, 1, recheck);
+ (*ntids)++;
+ }
+ else if (scan->numberOfOrderBys == 0 && GistPageIsLeaf(page))
+ {
+ /*
+ * Non-ordered scan, so report heap tuples in so->pageData[]
+ */
+ so->pageData[so->nPageData].heapPtr = it->t_tid;
+ so->pageData[so->nPageData].recheck = recheck;
+ so->nPageData++;
}
else
{
/*
- * Go to the next page
+ * Must push item into search queue. We get here for any lower
+ * index page, and also for heap tuples if doing an ordered
+ * search.
*/
- stk = so->stack->next;
- pfree(so->stack);
- so->stack = stk;
+ GISTSearchItem *item;
- /* If we're out of stack entries, we're done */
- if (so->stack == NULL)
+ oldcxt = MemoryContextSwitchTo(so->queueCxt);
+
+ /* Create new GISTSearchItem for this item */
+ item = palloc(sizeof(GISTSearchItem));
+ item->next = NULL;
+
+ if (GistPageIsLeaf(page))
+ {
+ /* Creating heap-tuple GISTSearchItem */
+ item->blkno = InvalidBlockNumber;
+ item->data.heap.heapPtr = it->t_tid;
+ item->data.heap.recheck = recheck;
+ }
+ else
{
- ReleaseBuffer(so->curbuf);
- so->curbuf = InvalidBuffer;
- return 0;
+ /* Creating index-page GISTSearchItem */
+ item->blkno = ItemPointerGetBlockNumber(&it->t_tid);
+ /* lsn of current page is lsn of parent page for child */
+ item->data.parentlsn = PageGetLSN(page);
}
- so->curbuf = ReleaseAndReadBuffer(so->curbuf,
- scan->indexRelation,
- stk->block);
+ /* Insert it into the queue using new distance data */
+ tmpItem->head = item;
+ tmpItem->lastHeap = GISTSearchItemIsHeap(*item) ? item : NULL;
+ memcpy(tmpItem->distances, so->distances,
+ sizeof(double) * scan->numberOfOrderBys);
+
+ (void) rb_insert(so->queue, (RBNode *) tmpItem, &isNew);
+
+ MemoryContextSwitchTo(oldcxt);
}
}
+ UnlockReleaseBuffer(buffer);
+}
+
+/*
+ * Extract next item (in order) from search queue
+ *
+ * Returns a GISTSearchItem or NULL. Caller must pfree item when done with it.
+ *
+ * NOTE: on successful return, so->curTreeItem is the GISTSearchTreeItem that
+ * contained the result item. Callers can use so->curTreeItem->distances as
+ * the distances value for the item.
+ */
+static GISTSearchItem *
+getNextGISTSearchItem(GISTScanOpaque so)
+{
for (;;)
{
- CHECK_FOR_INTERRUPTS();
+ GISTSearchItem *item;
- /* First of all, we need lock buffer */
- Assert(so->curbuf != InvalidBuffer);
- LockBuffer(so->curbuf, GIST_SHARE);
- gistcheckpage(scan->indexRelation, so->curbuf);
- p = BufferGetPage(so->curbuf);
- opaque = GistPageGetOpaque(p);
-
- /* remember lsn to identify page changed for tuple's killing */
- so->stack->lsn = PageGetLSN(p);
-
- /* check page split, occured since visit to parent */
- if (!XLogRecPtrIsInvalid(so->stack->parentlsn) &&
- XLByteLT(so->stack->parentlsn, opaque->nsn) &&
- opaque->rightlink != InvalidBlockNumber /* sanity check */ &&
- (so->stack->next == NULL || so->stack->next->block != opaque->rightlink) /* check if already
- added */ )
+ /* Update curTreeItem if we don't have one */
+ if (so->curTreeItem == NULL)
{
- /* detect page split, follow right link to add pages */
-
- stk = (GISTSearchStack *) palloc(sizeof(GISTSearchStack));
- stk->next = so->stack->next;
- stk->block = opaque->rightlink;
- stk->parentlsn = so->stack->parentlsn;
- memset(&(stk->lsn), 0, sizeof(GistNSN));
- so->stack->next = stk;
+ so->curTreeItem = (GISTSearchTreeItem *) rb_leftmost(so->queue);
+ /* Done when tree is empty */
+ if (so->curTreeItem == NULL)
+ break;
}
- /* if page is empty, then just skip it */
- if (PageIsEmpty(p))
+ item = so->curTreeItem->head;
+ if (item != NULL)
{
- LockBuffer(so->curbuf, GIST_UNLOCK);
- stk = so->stack->next;
- pfree(so->stack);
- so->stack = stk;
-
- if (so->stack == NULL)
- {
- ReleaseBuffer(so->curbuf);
- so->curbuf = InvalidBuffer;
- return ntids;
- }
-
- so->curbuf = ReleaseAndReadBuffer(so->curbuf, scan->indexRelation,
- stk->block);
- continue;
+ /* Delink item from chain */
+ so->curTreeItem->head = item->next;
+ if (item == so->curTreeItem->lastHeap)
+ so->curTreeItem->lastHeap = NULL;
+ /* Return item; caller is responsible to pfree it */
+ return item;
}
- n = FirstOffsetNumber;
-
- /* wonderful, we can look at page */
- so->nPageData = so->curPageData = 0;
-
- for (;;)
- {
- n = gistfindnext(scan, n);
-
- if (!OffsetNumberIsValid(n))
- {
- /*
- * If we was called from gistgettuple and current buffer
- * contains something matched then make a recursive call - it
- * will return ItemPointer from so->pageData. But we save
- * buffer pinned to support tuple's killing
- */
- if (!tbm && so->nPageData > 0)
- {
- LockBuffer(so->curbuf, GIST_UNLOCK);
- return gistnext(scan, NULL);
- }
+ /* curTreeItem is exhausted, so remove it from rbtree */
+ rb_delete(so->queue, (RBNode *) so->curTreeItem);
+ so->curTreeItem = NULL;
+ }
- /*
- * We ran out of matching index entries on the current page,
- * so pop the top stack entry and use it to continue the
- * search.
- */
- LockBuffer(so->curbuf, GIST_UNLOCK);
- stk = so->stack->next;
- pfree(so->stack);
- so->stack = stk;
-
- /* If we're out of stack entries, we're done */
-
- if (so->stack == NULL)
- {
- ReleaseBuffer(so->curbuf);
- so->curbuf = InvalidBuffer;
- return ntids;
- }
-
- so->curbuf = ReleaseAndReadBuffer(so->curbuf,
- scan->indexRelation,
- stk->block);
- /* XXX go up */
- break;
- }
+ return NULL;
+}
- if (GistPageIsLeaf(p))
- {
- /*
- * We've found a matching index entry in a leaf page, so
- * return success. Note that we keep "curbuf" pinned so that
- * we can efficiently resume the index scan later.
- */
+/*
+ * Fetch next heap tuple in an ordered search
+ */
+static bool
+getNextNearest(IndexScanDesc scan)
+{
+ GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
+ bool res = false;
- if (!(scan->ignore_killed_tuples &&
- ItemIdIsDead(PageGetItemId(p, n))))
- {
- it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
- ntids++;
- if (tbm != NULL)
- tbm_add_tuples(tbm, &it->t_tid, 1, scan->xs_recheck);
- else
- {
- so->pageData[so->nPageData].heapPtr = it->t_tid;
- so->pageData[so->nPageData].pageOffset = n;
- so->pageData[so->nPageData].recheck = scan->xs_recheck;
- so->nPageData++;
- }
- }
- }
- else
- {
- /*
- * We've found an entry in an internal node whose key is
- * consistent with the search key, so push it to stack
- */
- stk = (GISTSearchStack *) palloc(sizeof(GISTSearchStack));
+ do
+ {
+ GISTSearchItem *item = getNextGISTSearchItem(so);
- it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
- stk->block = ItemPointerGetBlockNumber(&(it->t_tid));
- memset(&(stk->lsn), 0, sizeof(GistNSN));
- stk->parentlsn = so->stack->lsn;
+ if (!item)
+ break;
- stk->next = so->stack->next;
- so->stack->next = stk;
- }
+ if (GISTSearchItemIsHeap(*item))
+ {
+ /* found a heap item at currently minimal distance */
+ scan->xs_ctup.t_self = item->data.heap.heapPtr;
+ scan->xs_recheck = item->data.heap.recheck;
+ res = true;
+ }
+ else
+ {
+ /* visit an index page, extract its items into queue */
+ CHECK_FOR_INTERRUPTS();
- n = OffsetNumberNext(n);
+ gistScanPage(scan, item, so->curTreeItem->distances, NULL, NULL);
}
- }
- return ntids;
+ pfree(item);
+ } while (!res);
+
+ return res;
}
/*
- * gistindex_keytest() -- does this index tuple satisfy the scan key(s)?
- *
- * On success return for a leaf tuple, scan->xs_recheck is set to indicate
- * whether recheck is needed. We recheck if any of the consistent() functions
- * request it.
- *
- * We must decompress the key in the IndexTuple before passing it to the
- * sk_func (and we have previously overwritten the sk_func to use the
- * user-defined Consistent method, so we actually are invoking that).
- *
- * Note that this function is always invoked in a short-lived memory context,
- * so we don't need to worry about cleaning up allocated memory, either here
- * or in the implementation of any Consistent methods.
+ * gistgettuple() -- Get the next tuple in the scan
*/
-static bool
-gistindex_keytest(IndexTuple tuple,
- IndexScanDesc scan,
- OffsetNumber offset)
+Datum
+gistgettuple(PG_FUNCTION_ARGS)
{
- int keySize = scan->numberOfKeys;
- ScanKey key = scan->keyData;
- Relation r = scan->indexRelation;
- GISTScanOpaque so;
- Page p;
- GISTSTATE *giststate;
-
- so = (GISTScanOpaque) scan->opaque;
- giststate = so->giststate;
- p = BufferGetPage(so->curbuf);
+ IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
+ ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
+ GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
- scan->xs_recheck = false;
+ if (dir != ForwardScanDirection)
+ elog(ERROR, "GiST only supports forward scan direction");
- /*
- * Tuple doesn't restore after crash recovery because of incomplete insert
- */
- if (!GistPageIsLeaf(p) && GistTupleIsInvalid(tuple))
- return true;
+ if (!so->qual_ok)
+ PG_RETURN_BOOL(false);
- while (keySize > 0)
+ if (so->firstCall)
{
- Datum datum;
- bool isNull;
- Datum test;
- bool recheck;
- GISTENTRY de;
+ /* Begin the scan by processing the root page */
+ GISTSearchItem fakeItem;
- datum = index_getattr(tuple,
- key->sk_attno,
- giststate->tupdesc,
- &isNull);
+ pgstat_count_index_scan(scan->indexRelation);
- if (key->sk_flags & SK_ISNULL)
+ so->firstCall = false;
+ so->curTreeItem = NULL;
+ so->curPageData = so->nPageData = 0;
+
+ fakeItem.blkno = GIST_ROOT_BLKNO;
+ memset(&fakeItem.data.parentlsn, 0, sizeof(GistNSN));
+ gistScanPage(scan, &fakeItem, NULL, NULL, NULL);
+ }
+
+ if (scan->numberOfOrderBys > 0)
+ {
+ /* Must fetch tuples in strict distance order */
+ PG_RETURN_BOOL(getNextNearest(scan));
+ }
+ else
+ {
+ /* Fetch tuples index-page-at-a-time */
+ for (;;)
{
- /*
- * On non-leaf page we can't conclude that child hasn't NULL
- * values because of assumption in GiST: union (VAL, NULL) is VAL.
- * But if on non-leaf page key IS NULL, then all children are
- * NULL.
- */
- if (key->sk_flags & SK_SEARCHNULL)
+ if (so->curPageData < so->nPageData)
{
- if (GistPageIsLeaf(p) && !isNull)
- return false;
+ /* continuing to return tuples from a leaf page */
+ scan->xs_ctup.t_self = so->pageData[so->curPageData].heapPtr;
+ scan->xs_recheck = so->pageData[so->curPageData].recheck;
+ so->curPageData++;
+ PG_RETURN_BOOL(true);
}
- else
+
+ /* find and process the next index page */
+ do
{
- Assert(key->sk_flags & SK_SEARCHNOTNULL);
- if (isNull)
- return false;
- }
- }
- else if (isNull)
- {
- return false;
- }
- else
- {
- gistdentryinit(giststate, key->sk_attno - 1, &de,
- datum, r, p, offset,
- FALSE, isNull);
+ GISTSearchItem *item = getNextGISTSearchItem(so);
- /*
- * Call the Consistent function to evaluate the test. The
- * arguments are the index datum (as a GISTENTRY*), the comparison
- * datum, the comparison operator's strategy number and subtype
- * from pg_amop, and the recheck flag.
- *
- * (Presently there's no need to pass the subtype since it'll
- * always be zero, but might as well pass it for possible future
- * use.)
- *
- * We initialize the recheck flag to true (the safest assumption)
- * in case the Consistent function forgets to set it.
- */
- recheck = true;
+ if (!item)
+ PG_RETURN_BOOL(false);
- test = FunctionCall5(&key->sk_func,
- PointerGetDatum(&de),
- key->sk_argument,
- Int32GetDatum(key->sk_strategy),
- ObjectIdGetDatum(key->sk_subtype),
- PointerGetDatum(&recheck));
+ CHECK_FOR_INTERRUPTS();
- if (!DatumGetBool(test))
- return false;
- scan->xs_recheck |= recheck;
- }
+ /*
+ * While scanning a leaf page, ItemPointers of matching heap
+ * tuples are stored in so->pageData. If there are any on
+ * this page, we fall out of the inner "do" and loop around to
+ * return them.
+ */
+ gistScanPage(scan, item, so->curTreeItem->distances, NULL, NULL);
- keySize--;
- key++;
+ pfree(item);
+ } while (so->nPageData == 0);
+ }
}
- return true;
+ PG_RETURN_BOOL(false); /* keep compiler quiet */
}
/*
- * Return the offset of the first index entry that is consistent with
- * the search key after offset 'n' in the current page. If there are
- * no more consistent entries, return InvalidOffsetNumber.
- * On success, scan->xs_recheck is set correctly, too.
- * Page should be locked....
+ * gistgetbitmap() -- Get a bitmap of all heap tuple locations
*/
-static OffsetNumber
-gistfindnext(IndexScanDesc scan, OffsetNumber n)
+Datum
+gistgetbitmap(PG_FUNCTION_ARGS)
{
- OffsetNumber maxoff;
- IndexTuple it;
- GISTScanOpaque so;
- MemoryContext oldcxt;
- Page p;
+ IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
+ TIDBitmap *tbm = (TIDBitmap *) PG_GETARG_POINTER(1);
+ GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
+ int64 ntids = 0;
+ GISTSearchItem fakeItem;
- so = (GISTScanOpaque) scan->opaque;
- p = BufferGetPage(so->curbuf);
- maxoff = PageGetMaxOffsetNumber(p);
+ if (!so->qual_ok)
+ PG_RETURN_INT64(0);
+
+ pgstat_count_index_scan(scan->indexRelation);
+
+ /* Begin the scan by processing the root page */
+ so->curTreeItem = NULL;
+ so->curPageData = so->nPageData = 0;
+
+ fakeItem.blkno = GIST_ROOT_BLKNO;
+ memset(&fakeItem.data.parentlsn, 0, sizeof(GistNSN));
+ gistScanPage(scan, &fakeItem, NULL, tbm, &ntids);
/*
- * Make sure we're in a short-lived memory context when we invoke a
- * user-supplied GiST method in gistindex_keytest(), so we don't leak
- * memory
+ * While scanning a leaf page, ItemPointers of matching heap tuples will
+ * be stored directly into tbm, so we don't need to deal with them here.
*/
- oldcxt = MemoryContextSwitchTo(so->tempCxt);
-
- while (n >= FirstOffsetNumber && n <= maxoff)
+ for (;;)
{
- it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
- if (gistindex_keytest(it, scan, n))
+ GISTSearchItem *item = getNextGISTSearchItem(so);
+
+ if (!item)
break;
- n = OffsetNumberNext(n);
- }
+ CHECK_FOR_INTERRUPTS();
- MemoryContextSwitchTo(oldcxt);
- MemoryContextReset(so->tempCxt);
+ gistScanPage(scan, item, so->curTreeItem->distances, tbm, &ntids);
- /*
- * If we found a matching entry, return its offset; otherwise return
- * InvalidOffsetNumber to inform the caller to go to the next page.
- */
- if (n >= FirstOffsetNumber && n <= maxoff)
- return n;
- else
- return InvalidOffsetNumber;
+ pfree(item);
+ }
+
+ PG_RETURN_INT64(ntids);
}
diff --git a/src/backend/access/gist/gistproc.c b/src/backend/access/gist/gistproc.c
index cb34b26113..43c4b1251b 100644
--- a/src/backend/access/gist/gistproc.c
+++ b/src/backend/access/gist/gistproc.c
@@ -6,11 +6,11 @@
* This gives R-tree behavior, with Guttman's poly-time split algorithm.
*
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/gist/gistproc.c,v 1.21 2010/02/26 02:00:33 momjian Exp $
+ * src/backend/access/gist/gistproc.c
*
*-------------------------------------------------------------------------
*/
@@ -380,12 +380,12 @@ gist_box_picksplit(PG_FUNCTION_ARGS)
for (i = OffsetNumberNext(FirstOffsetNumber); i <= maxoff; i = OffsetNumberNext(i))
{
cur = DatumGetBoxP(entryvec->vector[i].key);
- if (allisequal == true && (
- pageunion.high.x != cur->high.x ||
- pageunion.high.y != cur->high.y ||
- pageunion.low.x != cur->low.x ||
- pageunion.low.y != cur->low.y
- ))
+ if (allisequal && (
+ pageunion.high.x != cur->high.x ||
+ pageunion.high.y != cur->high.y ||
+ pageunion.low.x != cur->low.x ||
+ pageunion.low.y != cur->low.y
+ ))
allisequal = false;
adjustBox(&pageunion, cur);
@@ -904,6 +904,76 @@ gist_point_compress(PG_FUNCTION_ARGS)
PG_RETURN_POINTER(entry);
}
+#define point_point_distance(p1,p2) \
+ DatumGetFloat8(DirectFunctionCall2(point_distance, \
+ PointPGetDatum(p1), PointPGetDatum(p2)))
+
+static double
+computeDistance(bool isLeaf, BOX *box, Point *point)
+{
+ double result = 0.0;
+
+ if (isLeaf)
+ {
+ /* simple point to point distance */
+ result = point_point_distance(point, &box->low);
+ }
+ else if (point->x <= box->high.x && point->x >= box->low.x &&
+ point->y <= box->high.y && point->y >= box->low.y)
+ {
+ /* point inside the box */
+ result = 0.0;
+ }
+ else if (point->x <= box->high.x && point->x >= box->low.x)
+ {
+ /* point is over or below box */
+ Assert(box->low.y <= box->high.y);
+ if (point->y > box->high.y)
+ result = point->y - box->high.y;
+ else if (point->y < box->low.y)
+ result = box->low.y - point->y;
+ else
+ elog(ERROR, "inconsistent point values");
+ }
+ else if (point->y <= box->high.y && point->y >= box->low.y)
+ {
+ /* point is to left or right of box */
+ Assert(box->low.x <= box->high.x);
+ if (point->x > box->high.x)
+ result = point->x - box->high.x;
+ else if (point->x < box->low.x)
+ result = box->low.x - point->x;
+ else
+ elog(ERROR, "inconsistent point values");
+ }
+ else
+ {
+ /* closest point will be a vertex */
+ Point p;
+ double subresult;
+
+ result = point_point_distance(point, &box->low);
+
+ subresult = point_point_distance(point, &box->high);
+ if (result > subresult)
+ result = subresult;
+
+ p.x = box->low.x;
+ p.y = box->high.y;
+ subresult = point_point_distance(point, &p);
+ if (result > subresult)
+ result = subresult;
+
+ p.x = box->high.x;
+ p.y = box->low.y;
+ subresult = point_point_distance(point, &p);
+ if (result > subresult)
+ result = subresult;
+ }
+
+ return result;
+}
+
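The case analysis in computeDistance() above can also be written as a standalone point-to-rectangle distance helper. This is a hedged sketch in plain C with no PostgreSQL types (the names are invented); a degenerate box collapses to the leaf point-to-point case:

#include <math.h>
#include <stdio.h>

static double
point_box_distance(double px, double py,
				   double lox, double loy, double hix, double hiy)
{
	double		dx = 0.0,
				dy = 0.0;

	/* horizontal gap, if the point lies left or right of the box */
	if (px < lox)
		dx = lox - px;
	else if (px > hix)
		dx = px - hix;

	/* vertical gap, if the point lies below or above the box */
	if (py < loy)
		dy = loy - py;
	else if (py > hiy)
		dy = py - hiy;

	/* zero inside the box, an axis gap beside it, a corner distance otherwise */
	return sqrt(dx * dx + dy * dy);
}

int
main(void)
{
	/* point (3,4) vs. unit box: nearest corner is (1,1), distance sqrt(13) */
	printf("%f\n", point_box_distance(3.0, 4.0, 0.0, 0.0, 1.0, 1.0));
	return 0;
}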
static bool
gist_point_consistent_internal(StrategyNumber strategy,
bool isLeaf, BOX *key, Point *query)
@@ -954,8 +1024,8 @@ gist_point_consistent(PG_FUNCTION_ARGS)
{
GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
- bool result;
bool *recheck = (bool *) PG_GETARG_POINTER(4);
+ bool result;
StrategyNumber strategyGroup = strategy / GeoStrategyNumberOffset;
switch (strategyGroup)
@@ -1034,9 +1104,32 @@ gist_point_consistent(PG_FUNCTION_ARGS)
}
break;
default:
- result = false; /* silence compiler warning */
elog(ERROR, "unknown strategy number: %d", strategy);
+ result = false; /* keep compiler quiet */
}
PG_RETURN_BOOL(result);
}
+
+Datum
+gist_point_distance(PG_FUNCTION_ARGS)
+{
+ GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
+ StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
+ double distance;
+ StrategyNumber strategyGroup = strategy / GeoStrategyNumberOffset;
+
+ switch (strategyGroup)
+ {
+ case PointStrategyNumberGroup:
+ distance = computeDistance(GIST_LEAF(entry),
+ DatumGetBoxP(entry->key),
+ PG_GETARG_POINT_P(1));
+ break;
+ default:
+ elog(ERROR, "unknown strategy number: %d", strategy);
+ distance = 0.0; /* keep compiler quiet */
+ }
+
+ PG_RETURN_FLOAT8(distance);
+}
diff --git a/src/backend/access/gist/gistscan.c b/src/backend/access/gist/gistscan.c
index a53d8cd087..5662a3a4aa 100644
--- a/src/backend/access/gist/gistscan.c
+++ b/src/backend/access/gist/gistscan.c
@@ -4,11 +4,11 @@
* routines to manage scans on GiST index relations
*
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/gist/gistscan.c,v 1.79 2010/02/26 02:00:33 momjian Exp $
+ * src/backend/access/gist/gistscan.c
*
*-------------------------------------------------------------------------
*/
@@ -20,18 +20,112 @@
#include "access/relscan.h"
#include "storage/bufmgr.h"
#include "utils/memutils.h"
+#include "utils/rel.h"
-static void gistfreestack(GISTSearchStack *s);
+
+/*
+ * RBTree support functions for the GISTSearchTreeItem queue
+ */
+
+static int
+GISTSearchTreeItemComparator(const RBNode *a, const RBNode *b, void *arg)
+{
+ const GISTSearchTreeItem *sa = (const GISTSearchTreeItem *) a;
+ const GISTSearchTreeItem *sb = (const GISTSearchTreeItem *) b;
+ IndexScanDesc scan = (IndexScanDesc) arg;
+ int i;
+
+ /* Order according to distance comparison */
+ for (i = 0; i < scan->numberOfOrderBys; i++)
+ {
+ if (sa->distances[i] != sb->distances[i])
+ return (sa->distances[i] > sb->distances[i]) ? 1 : -1;
+ }
+
+ return 0;
+}
+
+static void
+GISTSearchTreeItemCombiner(RBNode *existing, const RBNode *newrb, void *arg)
+{
+ GISTSearchTreeItem *scurrent = (GISTSearchTreeItem *) existing;
+ const GISTSearchTreeItem *snew = (const GISTSearchTreeItem *) newrb;
+ GISTSearchItem *newitem = snew->head;
+
+ /* snew should have just one item in its chain */
+ Assert(newitem && newitem->next == NULL);
+
+ /*
+ * If new item is heap tuple, it goes to front of chain; otherwise insert
+ * it before the first index-page item, so that index pages are visited in
+ * LIFO order, ensuring depth-first search of index pages. See comments
+ * in gist_private.h.
+ */
+ if (GISTSearchItemIsHeap(*newitem))
+ {
+ newitem->next = scurrent->head;
+ scurrent->head = newitem;
+ if (scurrent->lastHeap == NULL)
+ scurrent->lastHeap = newitem;
+ }
+ else if (scurrent->lastHeap == NULL)
+ {
+ newitem->next = scurrent->head;
+ scurrent->head = newitem;
+ }
+ else
+ {
+ newitem->next = scurrent->lastHeap->next;
+ scurrent->lastHeap->next = newitem;
+ }
+}
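To make the combiner's ordering policy concrete, here is a hedged standalone simulation in plain C (the Item type is invented, no PostgreSQL headers): for items at equal distance, heap tuples end up at the front of the chain and index pages land after the last heap item in LIFO order:

#include <stdbool.h>
#include <stdio.h>

typedef struct Item
{
	const char *name;
	bool		isHeap;
	struct Item *next;
} Item;

static void
combine(Item **head, Item **lastHeap, Item *newitem)
{
	if (newitem->isHeap)
	{
		/* heap tuple goes to the front of the chain */
		newitem->next = *head;
		*head = newitem;
		if (*lastHeap == NULL)
			*lastHeap = newitem;
	}
	else if (*lastHeap == NULL)
	{
		/* no heap items yet: index page also goes to the front (LIFO) */
		newitem->next = *head;
		*head = newitem;
	}
	else
	{
		/* index page goes right after the last heap item (LIFO among pages) */
		newitem->next = (*lastHeap)->next;
		(*lastHeap)->next = newitem;
	}
}

int
main(void)
{
	Item		h1 = {"H1", true, NULL};
	Item		p1 = {"P1", false, NULL};
	Item		h2 = {"H2", true, NULL};
	Item		p2 = {"P2", false, NULL};
	Item	   *head = &h1;
	Item	   *lastHeap = &h1;
	Item	   *i;

	combine(&head, &lastHeap, &p1);
	combine(&head, &lastHeap, &h2);
	combine(&head, &lastHeap, &p2);

	for (i = head; i != NULL; i = i->next)
		printf("%s ", i->name);		/* prints: H2 H1 P2 P1 */
	printf("\n");
	return 0;
}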
+
+static RBNode *
+GISTSearchTreeItemAllocator(void *arg)
+{
+ IndexScanDesc scan = (IndexScanDesc) arg;
+
+ return palloc(GSTIHDRSZ + sizeof(double) * scan->numberOfOrderBys);
+}
+
+static void
+GISTSearchTreeItemDeleter(RBNode *rb, void *arg)
+{
+ pfree(rb);
+}
+
+
+/*
+ * Index AM API functions for scanning GiST indexes
+ */
Datum
gistbeginscan(PG_FUNCTION_ARGS)
{
Relation r = (Relation) PG_GETARG_POINTER(0);
int nkeys = PG_GETARG_INT32(1);
- ScanKey key = (ScanKey) PG_GETARG_POINTER(2);
+ int norderbys = PG_GETARG_INT32(2);
IndexScanDesc scan;
+ GISTScanOpaque so;
- scan = RelationGetIndexScan(r, nkeys, key);
+ scan = RelationGetIndexScan(r, nkeys, norderbys);
+
+ /* initialize opaque data */
+ so = (GISTScanOpaque) palloc0(sizeof(GISTScanOpaqueData));
+ so->queueCxt = AllocSetContextCreate(CurrentMemoryContext,
+ "GiST queue context",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+ so->tempCxt = createTempGistContext();
+ so->giststate = (GISTSTATE *) palloc(sizeof(GISTSTATE));
+ initGISTstate(so->giststate, scan->indexRelation);
+ /* workspaces with size dependent on numberOfOrderBys: */
+ so->tmpTreeItem = palloc(GSTIHDRSZ + sizeof(double) * scan->numberOfOrderBys);
+ so->distances = palloc(sizeof(double) * scan->numberOfOrderBys);
+ so->qual_ok = true; /* in case there are zero keys */
+
+ scan->opaque = so;
PG_RETURN_POINTER(scan);
}
@@ -41,42 +135,28 @@ gistrescan(PG_FUNCTION_ARGS)
{
IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
ScanKey key = (ScanKey) PG_GETARG_POINTER(1);
- GISTScanOpaque so;
+ ScanKey orderbys = (ScanKey) PG_GETARG_POINTER(3);
+
+ /* nkeys and norderbys arguments are ignored */
+ GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
int i;
+ MemoryContext oldCxt;
- so = (GISTScanOpaque) scan->opaque;
- if (so != NULL)
- {
- /* rescan an existing indexscan --- reset state */
- gistfreestack(so->stack);
- so->stack = NULL;
- /* drop pins on buffers -- no locks held */
- if (BufferIsValid(so->curbuf))
- {
- ReleaseBuffer(so->curbuf);
- so->curbuf = InvalidBuffer;
- }
- }
- else
- {
- /* initialize opaque data */
- so = (GISTScanOpaque) palloc(sizeof(GISTScanOpaqueData));
- so->stack = NULL;
- so->tempCxt = createTempGistContext();
- so->curbuf = InvalidBuffer;
- so->giststate = (GISTSTATE *) palloc(sizeof(GISTSTATE));
- initGISTstate(so->giststate, scan->indexRelation);
-
- scan->opaque = so;
- }
+ /* rescan an existing indexscan --- reset state */
+ MemoryContextReset(so->queueCxt);
+ so->curTreeItem = NULL;
- /*
- * Clear all the pointers.
- */
- ItemPointerSetInvalid(&so->curpos);
- so->nPageData = so->curPageData = 0;
+ /* create new, empty RBTree for search queue */
+ oldCxt = MemoryContextSwitchTo(so->queueCxt);
+ so->queue = rb_create(GSTIHDRSZ + sizeof(double) * scan->numberOfOrderBys,
+ GISTSearchTreeItemComparator,
+ GISTSearchTreeItemCombiner,
+ GISTSearchTreeItemAllocator,
+ GISTSearchTreeItemDeleter,
+ scan);
+ MemoryContextSwitchTo(oldCxt);
- so->qual_ok = true;
+ so->firstCall = true;
/* Update scan key, if a new one is given */
if (key && scan->numberOfKeys > 0)
@@ -85,8 +165,8 @@ gistrescan(PG_FUNCTION_ARGS)
scan->numberOfKeys * sizeof(ScanKeyData));
/*
- * Modify the scan key so that all the Consistent method is called for
- * all comparisons. The original operator is passed to the Consistent
+ * Modify the scan key so that the Consistent method is called for all
+ * comparisons. The original operator is passed to the Consistent
* function in the form of its strategy number, which is available
* from the sk_strategy field, and its subtype from the sk_subtype
* field.
@@ -95,9 +175,11 @@ gistrescan(PG_FUNCTION_ARGS)
* SK_SEARCHNULL/SK_SEARCHNOTNULL then nothing can be found (ie, we
* assume all indexable operators are strict).
*/
+ so->qual_ok = true;
+
for (i = 0; i < scan->numberOfKeys; i++)
{
- ScanKey skey = &(scan->keyData[i]);
+ ScanKey skey = scan->keyData + i;
skey->sk_func = so->giststate->consistentFn[skey->sk_attno - 1];
@@ -109,6 +191,33 @@ gistrescan(PG_FUNCTION_ARGS)
}
}
+ /* Update order-by key, if a new one is given */
+ if (orderbys && scan->numberOfOrderBys > 0)
+ {
+ memmove(scan->orderByData, orderbys,
+ scan->numberOfOrderBys * sizeof(ScanKeyData));
+
+ /*
+ * Modify the order-by key so that the Distance method is called for
+ * all comparisons. The original operator is passed to the Distance
+ * function in the form of its strategy number, which is available
+ * from the sk_strategy field, and its subtype from the sk_subtype
+ * field.
+ */
+ for (i = 0; i < scan->numberOfOrderBys; i++)
+ {
+ ScanKey skey = scan->orderByData + i;
+
+ skey->sk_func = so->giststate->distanceFn[skey->sk_attno - 1];
+
+ /* Check we actually have a distance function ... */
+ if (!OidIsValid(skey->sk_func.fn_oid))
+ elog(ERROR, "missing support function %d for attribute %d of index \"%s\"",
+ GIST_DISTANCE_PROC, skey->sk_attno,
+ RelationGetRelationName(scan->indexRelation));
+ }
+ }
+
PG_RETURN_VOID();
}
@@ -130,33 +239,14 @@ Datum
gistendscan(PG_FUNCTION_ARGS)
{
IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
- GISTScanOpaque so;
-
- so = (GISTScanOpaque) scan->opaque;
+ GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
- if (so != NULL)
- {
- gistfreestack(so->stack);
- if (so->giststate != NULL)
- freeGISTstate(so->giststate);
- /* drop pins on buffers -- we aren't holding any locks */
- if (BufferIsValid(so->curbuf))
- ReleaseBuffer(so->curbuf);
- MemoryContextDelete(so->tempCxt);
- pfree(scan->opaque);
- }
+ freeGISTstate(so->giststate);
+ MemoryContextDelete(so->queueCxt);
+ MemoryContextDelete(so->tempCxt);
+ pfree(so->tmpTreeItem);
+ pfree(so->distances);
+ pfree(so);
PG_RETURN_VOID();
}
-
-static void
-gistfreestack(GISTSearchStack *s)
-{
- while (s != NULL)
- {
- GISTSearchStack *p = s->next;
-
- pfree(s);
- s = p;
- }
-}
diff --git a/src/backend/access/gist/gistsplit.c b/src/backend/access/gist/gistsplit.c
index 5700e530fe..bd846cecca 100644
--- a/src/backend/access/gist/gistsplit.c
+++ b/src/backend/access/gist/gistsplit.c
@@ -4,11 +4,11 @@
* Split page algorithm
*
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/gist/gistsplit.c,v 1.12 2010/01/02 16:57:34 momjian Exp $
+ * src/backend/access/gist/gistsplit.c
*
*-------------------------------------------------------------------------
*/
@@ -325,16 +325,18 @@ genericPickSplit(GISTSTATE *giststate, GistEntryVector *entryvec, GIST_SPLITVEC
evec->n = v->spl_nleft;
memcpy(evec->vector, entryvec->vector + FirstOffsetNumber,
sizeof(GISTENTRY) * evec->n);
- v->spl_ldatum = FunctionCall2(&giststate->unionFn[attno],
- PointerGetDatum(evec),
- PointerGetDatum(&nbytes));
+ v->spl_ldatum = FunctionCall2Coll(&giststate->unionFn[attno],
+ giststate->supportCollation[attno],
+ PointerGetDatum(evec),
+ PointerGetDatum(&nbytes));
evec->n = v->spl_nright;
memcpy(evec->vector, entryvec->vector + FirstOffsetNumber + v->spl_nleft,
sizeof(GISTENTRY) * evec->n);
- v->spl_rdatum = FunctionCall2(&giststate->unionFn[attno],
- PointerGetDatum(evec),
- PointerGetDatum(&nbytes));
+ v->spl_rdatum = FunctionCall2Coll(&giststate->unionFn[attno],
+ giststate->supportCollation[attno],
+ PointerGetDatum(evec),
+ PointerGetDatum(&nbytes));
}
/*
@@ -361,9 +363,10 @@ gistUserPicksplit(Relation r, GistEntryVector *entryvec, int attno, GistSplitVec
sv->spl_ldatum = v->spl_lattr[attno];
sv->spl_rdatum = v->spl_rattr[attno];
- FunctionCall2(&giststate->picksplitFn[attno],
- PointerGetDatum(entryvec),
- PointerGetDatum(sv));
+ FunctionCall2Coll(&giststate->picksplitFn[attno],
+ giststate->supportCollation[attno],
+ PointerGetDatum(entryvec),
+ PointerGetDatum(sv));
if (sv->spl_nleft == 0 || sv->spl_nright == 0)
{
@@ -500,58 +503,6 @@ gistSplitHalf(GIST_SPLITVEC *v, int len)
}
/*
- * if it was invalid tuple then we need special processing.
- * We move all invalid tuples on right page.
- *
- * if there is no place on left page, gistSplit will be called one more
- * time for left page.
- *
- * Normally, we never exec this code, but after crash replay it's possible
- * to get 'invalid' tuples (probability is low enough)
- */
-static void
-gistSplitByInvalid(GISTSTATE *giststate, GistSplitVector *v, IndexTuple *itup, int len)
-{
- int i;
- static OffsetNumber offInvTuples[MaxOffsetNumber];
- int nOffInvTuples = 0;
-
- for (i = 1; i <= len; i++)
- if (GistTupleIsInvalid(itup[i - 1]))
- offInvTuples[nOffInvTuples++] = i;
-
- if (nOffInvTuples == len)
- {
- /* corner case, all tuples are invalid */
- v->spl_rightvalid = v->spl_leftvalid = false;
- gistSplitHalf(&v->splitVector, len);
- }
- else
- {
- GistSplitUnion gsvp;
-
- v->splitVector.spl_right = offInvTuples;
- v->splitVector.spl_nright = nOffInvTuples;
- v->spl_rightvalid = false;
-
- v->splitVector.spl_left = (OffsetNumber *) palloc(len * sizeof(OffsetNumber));
- v->splitVector.spl_nleft = 0;
- for (i = 1; i <= len; i++)
- if (!GistTupleIsInvalid(itup[i - 1]))
- v->splitVector.spl_left[v->splitVector.spl_nleft++] = i;
- v->spl_leftvalid = true;
-
- gsvp.equiv = NULL;
- gsvp.attr = v->spl_lattr;
- gsvp.len = v->splitVector.spl_nleft;
- gsvp.entries = v->splitVector.spl_left;
- gsvp.isnull = v->spl_lisnull;
-
- gistunionsubkeyvec(giststate, itup, &gsvp, 0);
- }
-}
-
-/*
 * tries to split the page by the attno key; in case of null
 * values, move them to a separate page.
*/
@@ -568,12 +519,6 @@ gistSplitByKey(Relation r, Page page, IndexTuple *itup, int len, GISTSTATE *gist
Datum datum;
bool IsNull;
- if (!GistPageIsLeaf(page) && GistTupleIsInvalid(itup[i - 1]))
- {
- gistSplitByInvalid(giststate, v, itup, len);
- return;
- }
-
datum = index_getattr(itup[i - 1], attno + 1, giststate->tupdesc, &IsNull);
gistdentryinit(giststate, attno, &(entryvec->vector[i]),
datum, r, page, i,
@@ -582,8 +527,6 @@ gistSplitByKey(Relation r, Page page, IndexTuple *itup, int len, GISTSTATE *gist
offNullTuples[nOffNullTuples++] = i;
}
- v->spl_leftvalid = v->spl_rightvalid = true;
-
if (nOffNullTuples == len)
{
/*
diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c
index 03c5773d4d..1754a10369 100644
--- a/src/backend/access/gist/gistutil.c
+++ b/src/backend/access/gist/gistutil.c
@@ -4,15 +4,17 @@
* utilities routines for the postgres GiST index access method.
*
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/gist/gistutil.c,v 1.35 2010/01/02 16:57:34 momjian Exp $
+ * src/backend/access/gist/gistutil.c
*-------------------------------------------------------------------------
*/
#include "postgres.h"
+#include <math.h>
+
#include "access/gist_private.h"
#include "access/reloptions.h"
#include "storage/freespace.h"
@@ -152,7 +154,7 @@ gistfillitupvec(IndexTuple *vec, int veclen, int *memlen)
* invalid tuple. Resulting Datums aren't compressed.
*/
-bool
+void
gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len, int startkey,
Datum *attr, bool *isnull)
{
@@ -180,10 +182,6 @@ gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len, int startke
Datum datum;
bool IsNull;
- if (GistTupleIsInvalid(itvec[j]))
- return FALSE; /* signals that union with invalid tuple =>
- * result is invalid */
-
datum = index_getattr(itvec[j], i + 1, giststate->tupdesc, &IsNull);
if (IsNull)
continue;
@@ -211,15 +209,14 @@ gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len, int startke
}
/* Make union and store in attr array */
- attr[i] = FunctionCall2(&giststate->unionFn[i],
- PointerGetDatum(evec),
- PointerGetDatum(&attrsize));
+ attr[i] = FunctionCall2Coll(&giststate->unionFn[i],
+ giststate->supportCollation[i],
+ PointerGetDatum(evec),
+ PointerGetDatum(&attrsize));
isnull[i] = FALSE;
}
}
-
- return TRUE;
}
/*
@@ -231,8 +228,7 @@ gistunion(Relation r, IndexTuple *itvec, int len, GISTSTATE *giststate)
{
memset(isnullS, TRUE, sizeof(bool) * giststate->tupdesc->natts);
- if (!gistMakeUnionItVec(giststate, itvec, len, 0, attrS, isnullS))
- return gist_form_invalid_tuple(InvalidBlockNumber);
+ gistMakeUnionItVec(giststate, itvec, len, 0, attrS, isnullS);
return gistFormTuple(giststate, r, attrS, isnullS, false);
}
@@ -278,9 +274,10 @@ gistMakeUnionKey(GISTSTATE *giststate, int attno,
}
*dstisnull = FALSE;
- *dst = FunctionCall2(&giststate->unionFn[attno],
- PointerGetDatum(evec),
- PointerGetDatum(&dstsize));
+ *dst = FunctionCall2Coll(&giststate->unionFn[attno],
+ giststate->supportCollation[attno],
+ PointerGetDatum(evec),
+ PointerGetDatum(&dstsize));
}
}
@@ -289,9 +286,10 @@ gistKeyIsEQ(GISTSTATE *giststate, int attno, Datum a, Datum b)
{
bool result;
- FunctionCall3(&giststate->equalFn[attno],
- a, b,
- PointerGetDatum(&result));
+ FunctionCall3Coll(&giststate->equalFn[attno],
+ giststate->supportCollation[attno],
+ a, b,
+ PointerGetDatum(&result));
return result;
}
@@ -328,9 +326,6 @@ gistgetadjusted(Relation r, IndexTuple oldtup, IndexTuple addtup, GISTSTATE *gis
IndexTuple newtup = NULL;
int i;
- if (GistTupleIsInvalid(oldtup) || GistTupleIsInvalid(addtup))
- return gist_form_invalid_tuple(ItemPointerGetBlockNumber(&(oldtup->t_tid)));
-
gistDeCompressAtt(giststate, r, oldtup, NULL,
(OffsetNumber) 0, oldentries, oldisnull);
@@ -401,14 +396,6 @@ gistchoose(Relation r, Page p, IndexTuple it, /* it has compressed entry */
int j;
IndexTuple itup = (IndexTuple) PageGetItem(p, PageGetItemId(p, i));
- if (!GistPageIsLeaf(p) && GistTupleIsInvalid(itup))
- {
- ereport(LOG,
- (errmsg("index \"%s\" needs VACUUM or REINDEX to finish crash recovery",
- RelationGetRelationName(r))));
- continue;
- }
-
sum_grow = 0;
for (j = 0; j < r->rd_att->natts; j++)
{
@@ -460,8 +447,9 @@ gistdentryinit(GISTSTATE *giststate, int nkey, GISTENTRY *e,
gistentryinit(*e, k, r, pg, o, l);
dep = (GISTENTRY *)
- DatumGetPointer(FunctionCall1(&giststate->decompressFn[nkey],
- PointerGetDatum(e)));
+ DatumGetPointer(FunctionCall1Coll(&giststate->decompressFn[nkey],
+ giststate->supportCollation[nkey],
+ PointerGetDatum(e)));
/* decompressFn may just return the given pointer */
if (dep != e)
gistentryinit(*e, dep->key, dep->rel, dep->page, dep->offset,
@@ -486,8 +474,9 @@ gistcentryinit(GISTSTATE *giststate, int nkey,
gistentryinit(*e, k, r, pg, o, l);
cep = (GISTENTRY *)
- DatumGetPointer(FunctionCall1(&giststate->compressFn[nkey],
- PointerGetDatum(e)));
+ DatumGetPointer(FunctionCall1Coll(&giststate->compressFn[nkey],
+ giststate->supportCollation[nkey],
+ PointerGetDatum(e)));
/* compressFn may just return the given pointer */
if (cep != e)
gistentryinit(*e, cep->key, cep->rel, cep->page, cep->offset,
@@ -521,7 +510,12 @@ gistFormTuple(GISTSTATE *giststate, Relation r,
}
res = index_form_tuple(giststate->tupdesc, compatt, isnull);
- GistTupleSetValid(res);
+
+ /*
+ * The offset number on tuples on internal pages is unused. For historical
+ * reasons, it is set 0xffff.
+ */
+ ItemPointerSetOffsetNumber(&(res->t_tid), 0xffff);
return res;
}
@@ -532,16 +526,23 @@ gistpenalty(GISTSTATE *giststate, int attno,
{
float penalty = 0.0;
- if (giststate->penaltyFn[attno].fn_strict == FALSE || (isNullOrig == FALSE && isNullAdd == FALSE))
- FunctionCall3(&giststate->penaltyFn[attno],
- PointerGetDatum(orig),
- PointerGetDatum(add),
- PointerGetDatum(&penalty));
+ if (giststate->penaltyFn[attno].fn_strict == FALSE ||
+ (isNullOrig == FALSE && isNullAdd == FALSE))
+ {
+ FunctionCall3Coll(&giststate->penaltyFn[attno],
+ giststate->supportCollation[attno],
+ PointerGetDatum(orig),
+ PointerGetDatum(add),
+ PointerGetDatum(&penalty));
+ /* disallow negative or NaN penalty */
+ if (isnan(penalty) || penalty < 0.0)
+ penalty = 0.0;
+ }
else if (isNullOrig && isNullAdd)
penalty = 0.0;
else
- penalty = 1e10; /* try to prevent to mix null and non-null
- * value */
+ penalty = 1e10; /* try to prevent mixing null and non-null
+ * values */
return penalty;
}
@@ -677,3 +678,24 @@ gistoptions(PG_FUNCTION_ARGS)
PG_RETURN_BYTEA_P(result);
PG_RETURN_NULL();
}
+
+/*
+ * Temporary GiST indexes are not WAL-logged, but we need LSNs to detect
+ * concurrent page splits anyway. GetXLogRecPtrForTemp() provides a fake
+ * sequence of LSNs for that purpose. Each call generates an LSN that is
+ * greater than any previous value returned by this function in the same
+ * session.
+ */
+XLogRecPtr
+GetXLogRecPtrForTemp(void)
+{
+ static XLogRecPtr counter = {0, 1};
+
+ counter.xrecoff++;
+ if (counter.xrecoff == 0)
+ {
+ counter.xlogid++;
+ counter.xrecoff++;
+ }
+ return counter;
+}
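The fake sequence stays strictly increasing because xrecoff == 0 reads as the invalid LSN and is skipped on wraparound. A hedged standalone illustration in plain C (FakeLSN is an invented stand-in for the two-part XLogRecPtr of this era):

#include <stdint.h>
#include <stdio.h>

typedef struct
{
	uint32_t	xlogid;
	uint32_t	xrecoff;
} FakeLSN;

static FakeLSN
next_fake_lsn(FakeLSN prev)
{
	prev.xrecoff++;
	if (prev.xrecoff == 0)
	{
		/* wrapped around: bump the high word and skip the invalid value 0 */
		prev.xlogid++;
		prev.xrecoff++;
	}
	return prev;
}

int
main(void)
{
	FakeLSN		lsn = {0, UINT32_MAX - 1};
	int			i;

	for (i = 0; i < 3; i++)
	{
		lsn = next_fake_lsn(lsn);
		/* prints {0, 4294967295}, {1, 1}, {1, 2} */
		printf("{%u, %u}\n", (unsigned) lsn.xlogid, (unsigned) lsn.xrecoff);
	}
	return 0;
}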
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index abd3d99956..33e6f34154 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -4,11 +4,11 @@
* vacuuming routines for the postgres GiST index access method.
*
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/gist/gistvacuum.c,v 1.48 2010/02/08 05:17:31 tgl Exp $
+ * src/backend/access/gist/gistvacuum.c
*
*-------------------------------------------------------------------------
*/
@@ -26,13 +26,6 @@
#include "utils/memutils.h"
-typedef struct GistBulkDeleteResult
-{
- IndexBulkDeleteResult std; /* common state */
- bool needReindex;
-} GistBulkDeleteResult;
-
-
/*
* VACUUM cleanup: update FSM
*/
@@ -40,13 +33,11 @@ Datum
gistvacuumcleanup(PG_FUNCTION_ARGS)
{
IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
- GistBulkDeleteResult *stats = (GistBulkDeleteResult *) PG_GETARG_POINTER(1);
+ IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
Relation rel = info->index;
BlockNumber npages,
blkno;
BlockNumber totFreePages;
- BlockNumber lastBlock = GIST_ROOT_BLKNO,
- lastFilledBlock = GIST_ROOT_BLKNO;
bool needLock;
/* No-op in ANALYZE ONLY mode */
@@ -56,10 +47,10 @@ gistvacuumcleanup(PG_FUNCTION_ARGS)
/* Set up all-zero stats if gistbulkdelete wasn't called */
if (stats == NULL)
{
- stats = (GistBulkDeleteResult *) palloc0(sizeof(GistBulkDeleteResult));
+ stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
/* use heap's tuple count */
- stats->std.num_index_tuples = info->num_heap_tuples;
- stats->std.estimated_count = info->estimated_count;
+ stats->num_index_tuples = info->num_heap_tuples;
+ stats->estimated_count = info->estimated_count;
/*
* XXX the above is wrong if index is partial. Would it be OK to just
@@ -67,11 +58,6 @@ gistvacuumcleanup(PG_FUNCTION_ARGS)
*/
}
- if (stats->needReindex)
- ereport(NOTICE,
- (errmsg("index \"%s\" needs VACUUM FULL or REINDEX to finish crash recovery",
- RelationGetRelationName(rel))));
-
/*
* Need lock unless it's local to this backend.
*/
@@ -102,20 +88,17 @@ gistvacuumcleanup(PG_FUNCTION_ARGS)
totFreePages++;
RecordFreeIndexPage(rel, blkno);
}
- else
- lastFilledBlock = blkno;
UnlockReleaseBuffer(buffer);
}
- lastBlock = npages - 1;
/* Finally, vacuum the FSM */
IndexFreeSpaceMapVacuum(info->index);
/* return statistics */
- stats->std.pages_free = totFreePages;
+ stats->pages_free = totFreePages;
if (needLock)
LockRelationForExtension(rel, ExclusiveLock);
- stats->std.num_pages = RelationGetNumberOfBlocks(rel);
+ stats->num_pages = RelationGetNumberOfBlocks(rel);
if (needLock)
UnlockRelationForExtension(rel, ExclusiveLock);
@@ -135,7 +118,7 @@ pushStackIfSplited(Page page, GistBDItem *stack)
GISTPageOpaque opaque = GistPageGetOpaque(page);
if (stack->blkno != GIST_ROOT_BLKNO && !XLogRecPtrIsInvalid(stack->parentlsn) &&
- XLByteLT(stack->parentlsn, opaque->nsn) &&
+ (GistFollowRight(page) || XLByteLT(stack->parentlsn, opaque->nsn)) &&
opaque->rightlink != InvalidBlockNumber /* sanity check */ )
{
/* split page detected, install right link to the stack */
@@ -162,7 +145,7 @@ Datum
gistbulkdelete(PG_FUNCTION_ARGS)
{
IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
- GistBulkDeleteResult *stats = (GistBulkDeleteResult *) PG_GETARG_POINTER(1);
+ IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2);
void *callback_state = (void *) PG_GETARG_POINTER(3);
Relation rel = info->index;
@@ -171,10 +154,10 @@ gistbulkdelete(PG_FUNCTION_ARGS)
/* first time through? */
if (stats == NULL)
- stats = (GistBulkDeleteResult *) palloc0(sizeof(GistBulkDeleteResult));
+ stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
/* we'll re-count the tuples each time */
- stats->std.estimated_count = false;
- stats->std.num_index_tuples = 0;
+ stats->estimated_count = false;
+ stats->num_index_tuples = 0;
stack = (GistBDItem *) palloc0(sizeof(GistBDItem));
stack->blkno = GIST_ROOT_BLKNO;
@@ -232,10 +215,10 @@ gistbulkdelete(PG_FUNCTION_ARGS)
{
todelete[ntodelete] = i - ntodelete;
ntodelete++;
- stats->std.tuples_removed += 1;
+ stats->tuples_removed += 1;
}
else
- stats->std.num_index_tuples += 1;
+ stats->num_index_tuples += 1;
}
if (ntodelete)
@@ -248,27 +231,18 @@ gistbulkdelete(PG_FUNCTION_ARGS)
PageIndexTupleDelete(page, todelete[i]);
GistMarkTuplesDeleted(page);
- if (!rel->rd_istemp)
+ if (RelationNeedsWAL(rel))
{
- XLogRecData *rdata;
XLogRecPtr recptr;
- gistxlogPageUpdate *xlinfo;
- rdata = formUpdateRdata(rel->rd_node, buffer,
+ recptr = gistXLogUpdate(rel->rd_node, buffer,
todelete, ntodelete,
- NULL, 0,
- NULL);
- xlinfo = (gistxlogPageUpdate *) rdata->next->data;
-
- recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE, rdata);
+ NULL, 0, InvalidBuffer);
PageSetLSN(page, recptr);
PageSetTLI(page, ThisTimeLineID);
-
- pfree(xlinfo);
- pfree(rdata);
}
else
- PageSetLSN(page, XLogRecPtrForTemp);
+ PageSetLSN(page, GetXLogRecPtrForTemp());
END_CRIT_SECTION();
}
@@ -293,7 +267,11 @@ gistbulkdelete(PG_FUNCTION_ARGS)
stack->next = ptr;
if (GistTupleIsInvalid(idxtuple))
- stats->needReindex = true;
+ ereport(LOG,
+ (errmsg("index \"%s\" contains an inner tuple marked as invalid",
+ RelationGetRelationName(rel)),
+ errdetail("This is caused by an incomplete page split at crash recovery before upgrading to 9.1."),
+ errhint("Please REINDEX it.")));
}
}
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index 7f5dd990c8..02c4ec3a6f 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -4,11 +4,11 @@
* WAL replay logic for GiST.
*
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/gist/gistxlog.c,v 1.35 2010/01/02 16:57:34 momjian Exp $
+ * src/backend/access/gist/gistxlog.c
*-------------------------------------------------------------------------
*/
#include "postgres.h"
@@ -20,15 +20,6 @@
#include "utils/memutils.h"
#include "utils/rel.h"
-
-typedef struct
-{
- gistxlogPageUpdate *data;
- int len;
- IndexTuple *itup;
- OffsetNumber *todelete;
-} PageUpdateRecord;
-
typedef struct
{
gistxlogPage *header;
@@ -41,144 +32,37 @@ typedef struct
NewPage *page;
} PageSplitRecord;
-/* track for incomplete inserts, idea was taken from nbtxlog.c */
-
-typedef struct gistIncompleteInsert
-{
- RelFileNode node;
- BlockNumber origblkno; /* for splits */
- ItemPointerData key;
- int lenblk;
- BlockNumber *blkno;
- XLogRecPtr lsn;
- BlockNumber *path;
- int pathlen;
-} gistIncompleteInsert;
-
-
static MemoryContext opCtx; /* working memory for operations */
-static MemoryContext insertCtx; /* holds incomplete_inserts list */
-static List *incomplete_inserts;
-
-
-#define ItemPointerEQ(a, b) \
- ( ItemPointerGetOffsetNumber(a) == ItemPointerGetOffsetNumber(b) && \
- ItemPointerGetBlockNumber (a) == ItemPointerGetBlockNumber(b) )
-
+/*
+ * Replay the clearing of F_FOLLOW_RIGHT flag.
+ */
static void
-pushIncompleteInsert(RelFileNode node, XLogRecPtr lsn, ItemPointerData key,
- BlockNumber *blkno, int lenblk,
- PageSplitRecord *xlinfo /* to extract blkno info */ )
+gistRedoClearFollowRight(RelFileNode node, XLogRecPtr lsn,
+ BlockNumber leftblkno)
{
- MemoryContext oldCxt;
- gistIncompleteInsert *ninsert;
+ Buffer buffer;
- if (!ItemPointerIsValid(&key))
+ buffer = XLogReadBuffer(node, leftblkno, false);
+ if (BufferIsValid(buffer))
+ {
+ Page page = (Page) BufferGetPage(buffer);
/*
- * if key is null then we should not store insertion as incomplete,
- * because it's a vacuum operation..
+ * Note that we still update the page even if page LSN is equal to the
+ * LSN of this record, because the updated NSN is not included in the
+ * full page image.
*/
- return;
-
- oldCxt = MemoryContextSwitchTo(insertCtx);
- ninsert = (gistIncompleteInsert *) palloc(sizeof(gistIncompleteInsert));
-
- ninsert->node = node;
- ninsert->key = key;
- ninsert->lsn = lsn;
-
- if (lenblk && blkno)
- {
- ninsert->lenblk = lenblk;
- ninsert->blkno = (BlockNumber *) palloc(sizeof(BlockNumber) * ninsert->lenblk);
- memcpy(ninsert->blkno, blkno, sizeof(BlockNumber) * ninsert->lenblk);
- ninsert->origblkno = *blkno;
- }
- else
- {
- int i;
-
- Assert(xlinfo);
- ninsert->lenblk = xlinfo->data->npage;
- ninsert->blkno = (BlockNumber *) palloc(sizeof(BlockNumber) * ninsert->lenblk);
- for (i = 0; i < ninsert->lenblk; i++)
- ninsert->blkno[i] = xlinfo->page[i].header->blkno;
- ninsert->origblkno = xlinfo->data->origblkno;
- }
- Assert(ninsert->lenblk > 0);
-
- /*
- * Stick the new incomplete insert onto the front of the list, not the
- * back. This is so that gist_xlog_cleanup will process incompletions in
- * last-in-first-out order.
- */
- incomplete_inserts = lcons(ninsert, incomplete_inserts);
-
- MemoryContextSwitchTo(oldCxt);
-}
-
-static void
-forgetIncompleteInsert(RelFileNode node, ItemPointerData key)
-{
- ListCell *l;
-
- if (!ItemPointerIsValid(&key))
- return;
-
- if (incomplete_inserts == NIL)
- return;
-
- foreach(l, incomplete_inserts)
- {
- gistIncompleteInsert *insert = (gistIncompleteInsert *) lfirst(l);
-
- if (RelFileNodeEquals(node, insert->node) && ItemPointerEQ(&(insert->key), &(key)))
+ if (!XLByteLT(lsn, PageGetLSN(page)))
{
- /* found */
- incomplete_inserts = list_delete_ptr(incomplete_inserts, insert);
- pfree(insert->blkno);
- pfree(insert);
- break;
- }
- }
-}
+ GistPageGetOpaque(page)->nsn = lsn;
+ GistClearFollowRight(page);
-static void
-decodePageUpdateRecord(PageUpdateRecord *decoded, XLogRecord *record)
-{
- char *begin = XLogRecGetData(record),
- *ptr;
- int i = 0,
- addpath = 0;
-
- decoded->data = (gistxlogPageUpdate *) begin;
-
- if (decoded->data->ntodelete)
- {
- decoded->todelete = (OffsetNumber *) (begin + sizeof(gistxlogPageUpdate) + addpath);
- addpath = MAXALIGN(sizeof(OffsetNumber) * decoded->data->ntodelete);
- }
- else
- decoded->todelete = NULL;
-
- decoded->len = 0;
- ptr = begin + sizeof(gistxlogPageUpdate) + addpath;
- while (ptr - begin < record->xl_len)
- {
- decoded->len++;
- ptr += IndexTupleSize((IndexTuple) ptr);
- }
-
- decoded->itup = (IndexTuple *) palloc(sizeof(IndexTuple) * decoded->len);
-
- ptr = begin + sizeof(gistxlogPageUpdate) + addpath;
- while (ptr - begin < record->xl_len)
- {
- decoded->itup[i] = (IndexTuple) ptr;
- ptr += IndexTupleSize(decoded->itup[i]);
- i++;
+ PageSetLSN(page, lsn);
+ PageSetTLI(page, ThisTimeLineID);
+ MarkBufferDirty(buffer);
+ }
+ UnlockReleaseBuffer(buffer);
}
}
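For contrast with the relaxed test above (the change is applied even when the page LSN equals the record LSN, because a full page image does not capture the updated NSN), a minimal sketch of the conventional redo interlock, using only macros already seen in this file, applies a change only when the page is strictly older than the record; buffer and page names here mirror the code above and are illustrative:

    if (XLByteLT(PageGetLSN(page), lsn))
    {
        /* page predates this WAL record: apply the logged change here */
        PageSetLSN(page, lsn);
        PageSetTLI(page, ThisTimeLineID);
        MarkBufferDirty(buffer);
    }
    UnlockReleaseBuffer(buffer);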
@@ -186,29 +70,22 @@ decodePageUpdateRecord(PageUpdateRecord *decoded, XLogRecord *record)
* redo any page update (except page split)
*/
static void
-gistRedoPageUpdateRecord(XLogRecPtr lsn, XLogRecord *record, bool isnewroot)
+gistRedoPageUpdateRecord(XLogRecPtr lsn, XLogRecord *record)
{
- gistxlogPageUpdate *xldata = (gistxlogPageUpdate *) XLogRecGetData(record);
- PageUpdateRecord xlrec;
+ char *begin = XLogRecGetData(record);
+ gistxlogPageUpdate *xldata = (gistxlogPageUpdate *) begin;
Buffer buffer;
Page page;
+ char *data;
- /* we must fix incomplete_inserts list even if XLR_BKP_BLOCK_1 is set */
- forgetIncompleteInsert(xldata->node, xldata->key);
+ if (BlockNumberIsValid(xldata->leftchild))
+ gistRedoClearFollowRight(xldata->node, lsn, xldata->leftchild);
- if (!isnewroot && xldata->blkno != GIST_ROOT_BLKNO)
- /* operation with root always finalizes insertion */
- pushIncompleteInsert(xldata->node, lsn, xldata->key,
- &(xldata->blkno), 1,
- NULL);
-
- /* nothing else to do if page was backed up (and no info to do it with) */
+ /* nothing more to do if page was backed up (and no info to do it with) */
if (record->xl_info & XLR_BKP_BLOCK_1)
return;
- decodePageUpdateRecord(&xlrec, record);
-
- buffer = XLogReadBuffer(xlrec.data->node, xlrec.data->blkno, false);
+ buffer = XLogReadBuffer(xldata->node, xldata->blkno, false);
if (!BufferIsValid(buffer))
return;
page = (Page) BufferGetPage(buffer);
@@ -219,28 +96,52 @@ gistRedoPageUpdateRecord(XLogRecPtr lsn, XLogRecord *record, bool isnewroot)
return;
}
- if (isnewroot)
- GISTInitBuffer(buffer, 0);
- else if (xlrec.data->ntodelete)
+ data = begin + sizeof(gistxlogPageUpdate);
+
+ /* Delete old tuples */
+ if (xldata->ntodelete > 0)
{
int i;
+ OffsetNumber *todelete = (OffsetNumber *) data;
+
+ data += sizeof(OffsetNumber) * xldata->ntodelete;
- for (i = 0; i < xlrec.data->ntodelete; i++)
- PageIndexTupleDelete(page, xlrec.todelete[i]);
+ for (i = 0; i < xldata->ntodelete; i++)
+ PageIndexTupleDelete(page, todelete[i]);
if (GistPageIsLeaf(page))
GistMarkTuplesDeleted(page);
}
/* add tuples */
- if (xlrec.len > 0)
- gistfillbuffer(page, xlrec.itup, xlrec.len, InvalidOffsetNumber);
+ if (data - begin < record->xl_len)
+ {
+ OffsetNumber off = (PageIsEmpty(page)) ? FirstOffsetNumber :
+ OffsetNumberNext(PageGetMaxOffsetNumber(page));
- /*
- * special case: leafpage, nothing to insert, nothing to delete, then
- * vacuum marks page
- */
- if (GistPageIsLeaf(page) && xlrec.len == 0 && xlrec.data->ntodelete == 0)
- GistClearTuplesDeleted(page);
+ while (data - begin < record->xl_len)
+ {
+ IndexTuple itup = (IndexTuple) data;
+ Size sz = IndexTupleSize(itup);
+ OffsetNumber l;
+
+ data += sz;
+
+ l = PageAddItem(page, (Item) itup, sz, off, false, false);
+ if (l == InvalidOffsetNumber)
+ elog(ERROR, "failed to add item to GiST index page, size %d bytes",
+ (int) sz);
+ off++;
+ }
+ }
+ else
+ {
+ /*
+ * special case: leafpage, nothing to insert, nothing to delete, then
+ * vacuum marks page
+ */
+ if (GistPageIsLeaf(page) && xldata->ntodelete == 0)
+ GistClearTuplesDeleted(page);
+ }
if (!GistPageIsLeaf(page) && PageGetMaxOffsetNumber(page) == InvalidOffsetNumber && xldata->blkno == GIST_ROOT_BLKNO)
@@ -315,41 +216,67 @@ decodePageSplitRecord(PageSplitRecord *decoded, XLogRecord *record)
static void
gistRedoPageSplitRecord(XLogRecPtr lsn, XLogRecord *record)
{
+ gistxlogPageSplit *xldata = (gistxlogPageSplit *) XLogRecGetData(record);
PageSplitRecord xlrec;
Buffer buffer;
Page page;
int i;
- int flags;
+ bool isrootsplit = false;
+ if (BlockNumberIsValid(xldata->leftchild))
+ gistRedoClearFollowRight(xldata->node, lsn, xldata->leftchild);
decodePageSplitRecord(&xlrec, record);
- flags = xlrec.data->origleaf ? F_LEAF : 0;
/* loop around all pages */
for (i = 0; i < xlrec.data->npage; i++)
{
NewPage *newpage = xlrec.page + i;
+ int flags;
+
+ if (newpage->header->blkno == GIST_ROOT_BLKNO)
+ {
+ Assert(i == 0);
+ isrootsplit = true;
+ }
buffer = XLogReadBuffer(xlrec.data->node, newpage->header->blkno, true);
Assert(BufferIsValid(buffer));
page = (Page) BufferGetPage(buffer);
/* ok, clear buffer */
+ if (xlrec.data->origleaf && newpage->header->blkno != GIST_ROOT_BLKNO)
+ flags = F_LEAF;
+ else
+ flags = 0;
GISTInitBuffer(buffer, flags);
/* and fill it */
gistfillbuffer(page, newpage->itup, newpage->header->num, FirstOffsetNumber);
+ if (newpage->header->blkno == GIST_ROOT_BLKNO)
+ {
+ GistPageGetOpaque(page)->rightlink = InvalidBlockNumber;
+ GistPageGetOpaque(page)->nsn = xldata->orignsn;
+ GistClearFollowRight(page);
+ }
+ else
+ {
+ if (i < xlrec.data->npage - 1)
+ GistPageGetOpaque(page)->rightlink = xlrec.page[i + 1].header->blkno;
+ else
+ GistPageGetOpaque(page)->rightlink = xldata->origrlink;
+ GistPageGetOpaque(page)->nsn = xldata->orignsn;
+ if (i < xlrec.data->npage - 1 && !isrootsplit)
+ GistMarkFollowRight(page);
+ else
+ GistClearFollowRight(page);
+ }
+
PageSetLSN(page, lsn);
PageSetTLI(page, ThisTimeLineID);
MarkBufferDirty(buffer);
UnlockReleaseBuffer(buffer);
}
-
- forgetIncompleteInsert(xlrec.data->node, xlrec.data->key);
-
- pushIncompleteInsert(xlrec.data->node, lsn, xlrec.data->key,
- NULL, 0,
- &xlrec);
}
static void
@@ -372,24 +299,6 @@ gistRedoCreateIndex(XLogRecPtr lsn, XLogRecord *record)
UnlockReleaseBuffer(buffer);
}
-static void
-gistRedoCompleteInsert(XLogRecPtr lsn, XLogRecord *record)
-{
- char *begin = XLogRecGetData(record),
- *ptr;
- gistxlogInsertComplete *xlrec;
-
- xlrec = (gistxlogInsertComplete *) begin;
-
- ptr = begin + sizeof(gistxlogInsertComplete);
- while (ptr - begin < record->xl_len)
- {
- Assert(record->xl_len - (ptr - begin) >= sizeof(ItemPointerData));
- forgetIncompleteInsert(xlrec->node, *((ItemPointerData *) ptr));
- ptr += sizeof(ItemPointerData);
- }
-}
-
void
gist_redo(XLogRecPtr lsn, XLogRecord *record)
{
@@ -397,34 +306,27 @@ gist_redo(XLogRecPtr lsn, XLogRecord *record)
MemoryContext oldCxt;
/*
- * GIST indexes do not require any conflict processing. NB: If we ever
+ * GiST indexes do not require any conflict processing. NB: If we ever
 * implement a similar optimization to the one we have in b-tree, and
 * remove killed tuples outside VACUUM, we'll need to handle that here.
*/
-
RestoreBkpBlocks(lsn, record, false);
oldCxt = MemoryContextSwitchTo(opCtx);
switch (info)
{
case XLOG_GIST_PAGE_UPDATE:
- gistRedoPageUpdateRecord(lsn, record, false);
+ gistRedoPageUpdateRecord(lsn, record);
break;
case XLOG_GIST_PAGE_DELETE:
gistRedoPageDeleteRecord(lsn, record);
break;
- case XLOG_GIST_NEW_ROOT:
- gistRedoPageUpdateRecord(lsn, record, true);
- break;
case XLOG_GIST_PAGE_SPLIT:
gistRedoPageSplitRecord(lsn, record);
break;
case XLOG_GIST_CREATE_INDEX:
gistRedoCreateIndex(lsn, record);
break;
- case XLOG_GIST_INSERT_COMPLETE:
- gistRedoCompleteInsert(lsn, record);
- break;
default:
elog(PANIC, "gist_redo: unknown op code %u", info);
}
@@ -434,20 +336,16 @@ gist_redo(XLogRecPtr lsn, XLogRecord *record)
}
static void
-out_target(StringInfo buf, RelFileNode node, ItemPointerData key)
+out_target(StringInfo buf, RelFileNode node)
{
appendStringInfo(buf, "rel %u/%u/%u",
node.spcNode, node.dbNode, node.relNode);
- if (ItemPointerIsValid(&key))
- appendStringInfo(buf, "; tid %u/%u",
- ItemPointerGetBlockNumber(&key),
- ItemPointerGetOffsetNumber(&key));
}
static void
out_gistxlogPageUpdate(StringInfo buf, gistxlogPageUpdate *xlrec)
{
- out_target(buf, xlrec->node, xlrec->key);
+ out_target(buf, xlrec->node);
appendStringInfo(buf, "; block number %u", xlrec->blkno);
}
@@ -463,7 +361,7 @@ static void
out_gistxlogPageSplit(StringInfo buf, gistxlogPageSplit *xlrec)
{
appendStringInfo(buf, "page_split: ");
- out_target(buf, xlrec->node, xlrec->key);
+ out_target(buf, xlrec->node);
appendStringInfo(buf, "; block number %u splits to %d pages",
xlrec->origblkno, xlrec->npage);
}
@@ -482,10 +380,6 @@ gist_desc(StringInfo buf, uint8 xl_info, char *rec)
case XLOG_GIST_PAGE_DELETE:
out_gistxlogPageDelete(buf, (gistxlogPageDelete *) rec);
break;
- case XLOG_GIST_NEW_ROOT:
- appendStringInfo(buf, "new_root: ");
- out_target(buf, ((gistxlogPageUpdate *) rec)->node, ((gistxlogPageUpdate *) rec)->key);
- break;
case XLOG_GIST_PAGE_SPLIT:
out_gistxlogPageSplit(buf, (gistxlogPageSplit *) rec);
break;
@@ -495,415 +389,102 @@ gist_desc(StringInfo buf, uint8 xl_info, char *rec)
((RelFileNode *) rec)->dbNode,
((RelFileNode *) rec)->relNode);
break;
- case XLOG_GIST_INSERT_COMPLETE:
- appendStringInfo(buf, "complete_insert: rel %u/%u/%u",
- ((gistxlogInsertComplete *) rec)->node.spcNode,
- ((gistxlogInsertComplete *) rec)->node.dbNode,
- ((gistxlogInsertComplete *) rec)->node.relNode);
- break;
default:
appendStringInfo(buf, "unknown gist op code %u", info);
break;
}
}
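As an illustration of the describe output these routines now produce (the relfilenode and block numbers below are invented, not taken from any real record), a page-split record renders along these lines:

    page_split: rel 1663/16384/24576; block number 42 splits to 3 pages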
-IndexTuple
-gist_form_invalid_tuple(BlockNumber blkno)
-{
- /*
- * we don't alloc space for null's bitmap, this is invalid tuple, be
- * carefull in read and write code
- */
- Size size = IndexInfoFindDataOffset(0);
- IndexTuple tuple = (IndexTuple) palloc0(size);
-
- tuple->t_info |= size;
-
- ItemPointerSetBlockNumber(&(tuple->t_tid), blkno);
- GistTupleSetInvalid(tuple);
-
- return tuple;
-}
-
-
-static void
-gistxlogFindPath(Relation index, gistIncompleteInsert *insert)
-{
- GISTInsertStack *top;
-
- insert->pathlen = 0;
- insert->path = NULL;
-
- if ((top = gistFindPath(index, insert->origblkno)) != NULL)
- {
- int i;
- GISTInsertStack *ptr;
-
- for (ptr = top; ptr; ptr = ptr->parent)
- insert->pathlen++;
-
- insert->path = (BlockNumber *) palloc(sizeof(BlockNumber) * insert->pathlen);
-
- i = 0;
- for (ptr = top; ptr; ptr = ptr->parent)
- insert->path[i++] = ptr->blkno;
- }
- else
- elog(ERROR, "lost parent for block %u", insert->origblkno);
-}
-
-static SplitedPageLayout *
-gistMakePageLayout(Buffer *buffers, int nbuffers)
-{
- SplitedPageLayout *res = NULL,
- *resptr;
-
- while (nbuffers-- > 0)
- {
- Page page = BufferGetPage(buffers[nbuffers]);
- IndexTuple *vec;
- int veclen;
-
- resptr = (SplitedPageLayout *) palloc0(sizeof(SplitedPageLayout));
-
- resptr->block.blkno = BufferGetBlockNumber(buffers[nbuffers]);
- resptr->block.num = PageGetMaxOffsetNumber(page);
-
- vec = gistextractpage(page, &veclen);
- resptr->list = gistfillitupvec(vec, veclen, &(resptr->lenlist));
-
- resptr->next = res;
- res = resptr;
- }
-
- return res;
-}
-
-/*
- * Continue insert after crash. In normal situations, there aren't any
- * incomplete inserts, but if a crash occurs partway through an insertion
- * sequence, we'll need to finish making the index valid at the end of WAL
- * replay.
- *
- * Note that we assume the index is now in a valid state, except for the
- * unfinished insertion. In particular it's safe to invoke gistFindPath();
- * there shouldn't be any garbage pages for it to run into.
- *
- * To complete insert we can't use basic insertion algorithm because
- * during insertion we can't call user-defined support functions of opclass.
- * So, we insert 'invalid' tuples without real key and do it by separate algorithm.
- * 'invalid' tuple should be updated by vacuum full.
- */
-static void
-gistContinueInsert(gistIncompleteInsert *insert)
-{
- IndexTuple *itup;
- int i,
- lenitup;
- Relation index;
-
- index = CreateFakeRelcacheEntry(insert->node);
-
- /*
- * needed vector itup never will be more than initial lenblkno+2, because
- * during this processing Indextuple can be only smaller
- */
- lenitup = insert->lenblk;
- itup = (IndexTuple *) palloc(sizeof(IndexTuple) * (lenitup + 2 /* guarantee root split */ ));
-
- for (i = 0; i < insert->lenblk; i++)
- itup[i] = gist_form_invalid_tuple(insert->blkno[i]);
-
- /*
- * any insertion of itup[] should make LOG message about
- */
-
- if (insert->origblkno == GIST_ROOT_BLKNO)
- {
- /*
- * it was split root, so we should only make new root. it can't be
- * simple insert into root, we should replace all content of root.
- */
- Buffer buffer = XLogReadBuffer(insert->node, GIST_ROOT_BLKNO, true);
-
- gistnewroot(index, buffer, itup, lenitup, NULL);
- UnlockReleaseBuffer(buffer);
- }
- else
- {
- Buffer *buffers;
- Page *pages;
- int numbuffer;
- OffsetNumber *todelete;
-
- /* construct path */
- gistxlogFindPath(index, insert);
-
- Assert(insert->pathlen > 0);
-
- buffers = (Buffer *) palloc(sizeof(Buffer) * (insert->lenblk + 2 /* guarantee root split */ ));
- pages = (Page *) palloc(sizeof(Page) * (insert->lenblk + 2 /* guarantee root split */ ));
- todelete = (OffsetNumber *) palloc(sizeof(OffsetNumber) * (insert->lenblk + 2 /* guarantee root split */ ));
-
- for (i = 0; i < insert->pathlen; i++)
- {
- int j,
- k,
- pituplen = 0;
- uint8 xlinfo;
- XLogRecData *rdata;
- XLogRecPtr recptr;
- Buffer tempbuffer = InvalidBuffer;
- int ntodelete = 0;
-
- numbuffer = 1;
- buffers[0] = ReadBuffer(index, insert->path[i]);
- LockBuffer(buffers[0], GIST_EXCLUSIVE);
-
- /*
- * we check buffer, because we restored page earlier
- */
- gistcheckpage(index, buffers[0]);
-
- pages[0] = BufferGetPage(buffers[0]);
- Assert(!GistPageIsLeaf(pages[0]));
-
- pituplen = PageGetMaxOffsetNumber(pages[0]);
-
- /* find remove old IndexTuples to remove */
- for (j = 0; j < pituplen && ntodelete < lenitup; j++)
- {
- BlockNumber blkno;
- ItemId iid = PageGetItemId(pages[0], j + FirstOffsetNumber);
- IndexTuple idxtup = (IndexTuple) PageGetItem(pages[0], iid);
-
- blkno = ItemPointerGetBlockNumber(&(idxtup->t_tid));
-
- for (k = 0; k < lenitup; k++)
- if (ItemPointerGetBlockNumber(&(itup[k]->t_tid)) == blkno)
- {
- todelete[ntodelete] = j + FirstOffsetNumber - ntodelete;
- ntodelete++;
- break;
- }
- }
-
- if (ntodelete == 0)
- elog(PANIC, "gistContinueInsert: cannot find pointer to page(s)");
-
- /*
- * we check space with subtraction only first tuple to delete,
- * hope, that wiil be enough space....
- */
-
- if (gistnospace(pages[0], itup, lenitup, *todelete, 0))
- {
-
- /* no space left on page, so we must split */
- buffers[numbuffer] = ReadBuffer(index, P_NEW);
- LockBuffer(buffers[numbuffer], GIST_EXCLUSIVE);
- GISTInitBuffer(buffers[numbuffer], 0);
- pages[numbuffer] = BufferGetPage(buffers[numbuffer]);
- gistfillbuffer(pages[numbuffer], itup, lenitup, FirstOffsetNumber);
- numbuffer++;
-
- if (BufferGetBlockNumber(buffers[0]) == GIST_ROOT_BLKNO)
- {
- Buffer tmp;
-
- /*
- * we split root, just copy content from root to new page
- */
-
- /* sanity check */
- if (i + 1 != insert->pathlen)
- elog(PANIC, "unexpected pathlen in index \"%s\"",
- RelationGetRelationName(index));
-
- /* fill new page, root will be changed later */
- tempbuffer = ReadBuffer(index, P_NEW);
- LockBuffer(tempbuffer, GIST_EXCLUSIVE);
- memcpy(BufferGetPage(tempbuffer), pages[0], BufferGetPageSize(tempbuffer));
-
- /* swap buffers[0] (was root) and temp buffer */
- tmp = buffers[0];
- buffers[0] = tempbuffer;
- tempbuffer = tmp; /* now in tempbuffer GIST_ROOT_BLKNO,
- * it is still unchanged */
-
- pages[0] = BufferGetPage(buffers[0]);
- }
-
- START_CRIT_SECTION();
-
- for (j = 0; j < ntodelete; j++)
- PageIndexTupleDelete(pages[0], todelete[j]);
-
- xlinfo = XLOG_GIST_PAGE_SPLIT;
- rdata = formSplitRdata(index->rd_node, insert->path[i],
- false, &(insert->key),
- gistMakePageLayout(buffers, numbuffer));
-
- }
- else
- {
- START_CRIT_SECTION();
-
- for (j = 0; j < ntodelete; j++)
- PageIndexTupleDelete(pages[0], todelete[j]);
- gistfillbuffer(pages[0], itup, lenitup, InvalidOffsetNumber);
-
- xlinfo = XLOG_GIST_PAGE_UPDATE;
- rdata = formUpdateRdata(index->rd_node, buffers[0],
- todelete, ntodelete,
- itup, lenitup, &(insert->key));
- }
-
- /*
- * use insert->key as mark for completion of insert (form*Rdata()
- * above) for following possible replays
- */
-
- /* write pages, we should mark it dirty befor XLogInsert() */
- for (j = 0; j < numbuffer; j++)
- {
- GistPageGetOpaque(pages[j])->rightlink = InvalidBlockNumber;
- MarkBufferDirty(buffers[j]);
- }
- recptr = XLogInsert(RM_GIST_ID, xlinfo, rdata);
- for (j = 0; j < numbuffer; j++)
- {
- PageSetLSN(pages[j], recptr);
- PageSetTLI(pages[j], ThisTimeLineID);
- }
-
- END_CRIT_SECTION();
-
- lenitup = numbuffer;
- for (j = 0; j < numbuffer; j++)
- {
- itup[j] = gist_form_invalid_tuple(BufferGetBlockNumber(buffers[j]));
- UnlockReleaseBuffer(buffers[j]);
- }
-
- if (tempbuffer != InvalidBuffer)
- {
- /*
- * it was a root split, so fill it by new values
- */
- gistnewroot(index, tempbuffer, itup, lenitup, &(insert->key));
- UnlockReleaseBuffer(tempbuffer);
- }
- }
- }
-
- FreeFakeRelcacheEntry(index);
-
- ereport(LOG,
- (errmsg("index %u/%u/%u needs VACUUM FULL or REINDEX to finish crash recovery",
- insert->node.spcNode, insert->node.dbNode, insert->node.relNode),
- errdetail("Incomplete insertion detected during crash replay.")));
-}
-
void
gist_xlog_startup(void)
{
- incomplete_inserts = NIL;
- insertCtx = AllocSetContextCreate(CurrentMemoryContext,
- "GiST recovery temporary context",
- ALLOCSET_DEFAULT_MINSIZE,
- ALLOCSET_DEFAULT_INITSIZE,
- ALLOCSET_DEFAULT_MAXSIZE);
opCtx = createTempGistContext();
}
void
gist_xlog_cleanup(void)
{
- ListCell *l;
- MemoryContext oldCxt;
-
- oldCxt = MemoryContextSwitchTo(opCtx);
-
- foreach(l, incomplete_inserts)
- {
- gistIncompleteInsert *insert = (gistIncompleteInsert *) lfirst(l);
-
- gistContinueInsert(insert);
- MemoryContextReset(opCtx);
- }
- MemoryContextSwitchTo(oldCxt);
-
MemoryContextDelete(opCtx);
- MemoryContextDelete(insertCtx);
-}
-
-bool
-gist_safe_restartpoint(void)
-{
- if (incomplete_inserts)
- return false;
- return true;
}
-
-XLogRecData *
-formSplitRdata(RelFileNode node, BlockNumber blkno, bool page_is_leaf,
- ItemPointer key, SplitedPageLayout *dist)
+/*
+ * Write WAL record of a page split.
+ */
+XLogRecPtr
+gistXLogSplit(RelFileNode node, BlockNumber blkno, bool page_is_leaf,
+ SplitedPageLayout *dist,
+ BlockNumber origrlink, GistNSN orignsn,
+ Buffer leftchildbuf)
{
XLogRecData *rdata;
- gistxlogPageSplit *xlrec = (gistxlogPageSplit *) palloc(sizeof(gistxlogPageSplit));
+ gistxlogPageSplit xlrec;
SplitedPageLayout *ptr;
int npage = 0,
- cur = 1;
+ cur;
+ XLogRecPtr recptr;
- ptr = dist;
- while (ptr)
- {
+ for (ptr = dist; ptr; ptr = ptr->next)
npage++;
- ptr = ptr->next;
- }
rdata = (XLogRecData *) palloc(sizeof(XLogRecData) * (npage * 2 + 2));
- xlrec->node = node;
- xlrec->origblkno = blkno;
- xlrec->origleaf = page_is_leaf;
- xlrec->npage = (uint16) npage;
- if (key)
- xlrec->key = *key;
- else
- ItemPointerSetInvalid(&(xlrec->key));
+ xlrec.node = node;
+ xlrec.origblkno = blkno;
+ xlrec.origrlink = origrlink;
+ xlrec.orignsn = orignsn;
+ xlrec.origleaf = page_is_leaf;
+ xlrec.npage = (uint16) npage;
+ xlrec.leftchild =
+ BufferIsValid(leftchildbuf) ? BufferGetBlockNumber(leftchildbuf) : InvalidBlockNumber;
- rdata[0].buffer = InvalidBuffer;
- rdata[0].data = (char *) xlrec;
+ rdata[0].data = (char *) &xlrec;
rdata[0].len = sizeof(gistxlogPageSplit);
- rdata[0].next = NULL;
+ rdata[0].buffer = InvalidBuffer;
+
+ cur = 1;
- ptr = dist;
- while (ptr)
+ /*
+ * Include a full page image of the child buffer. (This is only necessary
+ * if a checkpoint happened since the child page was split.)
+ */
+ if (BufferIsValid(leftchildbuf))
{
+ rdata[cur - 1].next = &(rdata[cur]);
+ rdata[cur].data = NULL;
+ rdata[cur].len = 0;
+ rdata[cur].buffer = leftchildbuf;
+ rdata[cur].buffer_std = true;
+ cur++;
+ }
+
+ for (ptr = dist; ptr; ptr = ptr->next)
+ {
+ rdata[cur - 1].next = &(rdata[cur]);
rdata[cur].buffer = InvalidBuffer;
rdata[cur].data = (char *) &(ptr->block);
rdata[cur].len = sizeof(gistxlogPage);
- rdata[cur - 1].next = &(rdata[cur]);
cur++;
+ rdata[cur - 1].next = &(rdata[cur]);
rdata[cur].buffer = InvalidBuffer;
rdata[cur].data = (char *) (ptr->list);
rdata[cur].len = ptr->lenlist;
- rdata[cur - 1].next = &(rdata[cur]);
- rdata[cur].next = NULL;
cur++;
- ptr = ptr->next;
}
+ rdata[cur - 1].next = NULL;
+
+ recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_SPLIT, rdata);
- return rdata;
+ pfree(rdata);
+ return recptr;
}
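The zero-length entry attached to leftchildbuf above is the usual idiom for handing a buffer to XLogInsert without logging any payload for it: XLogInsert then decides on its own whether the page needs a full page image. In isolation the idiom looks like this (the index n and the variable names are illustrative):

    /* reference the child page, but log no inline data for it */
    rdata[n].data = NULL;
    rdata[n].len = 0;
    rdata[n].buffer = leftchildbuf;    /* backup block emitted only if needed */
    rdata[n].buffer_std = true;        /* page uses the standard layout */
    rdata[n].next = NULL;              /* or chain on to the next entry */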
/*
- * Construct the rdata array for an XLOG record describing a page update
- * (deletion and/or insertion of tuples on a single index page).
+ * Write XLOG record describing a page update. The update can include any
+ * number of deletions and/or insertions of tuples on a single index page.
+ *
+ * If this update inserts a downlink for a split page, also record that
+ * the F_FOLLOW_RIGHT flag on the child page is cleared and NSN set.
*
* Note that both the todelete array and the tuples are marked as belonging
* to the target buffer; they need not be stored in XLOG if XLogInsert decides
@@ -911,27 +492,26 @@ formSplitRdata(RelFileNode node, BlockNumber blkno, bool page_is_leaf,
* at least one rdata item referencing the buffer, even when ntodelete and
* ituplen are both zero; this ensures that XLogInsert knows about the buffer.
*/
-XLogRecData *
-formUpdateRdata(RelFileNode node, Buffer buffer,
- OffsetNumber *todelete, int ntodelete,
- IndexTuple *itup, int ituplen, ItemPointer key)
+XLogRecPtr
+gistXLogUpdate(RelFileNode node, Buffer buffer,
+ OffsetNumber *todelete, int ntodelete,
+ IndexTuple *itup, int ituplen,
+ Buffer leftchildbuf)
{
XLogRecData *rdata;
gistxlogPageUpdate *xlrec;
int cur,
i;
+ XLogRecPtr recptr;
- rdata = (XLogRecData *) palloc(sizeof(XLogRecData) * (3 + ituplen));
+ rdata = (XLogRecData *) palloc(sizeof(XLogRecData) * (4 + ituplen));
xlrec = (gistxlogPageUpdate *) palloc(sizeof(gistxlogPageUpdate));
xlrec->node = node;
xlrec->blkno = BufferGetBlockNumber(buffer);
xlrec->ntodelete = ntodelete;
-
- if (key)
- xlrec->key = *key;
- else
- ItemPointerSetInvalid(&(xlrec->key));
+ xlrec->leftchild =
+ BufferIsValid(leftchildbuf) ? BufferGetBlockNumber(leftchildbuf) : InvalidBlockNumber;
rdata[0].buffer = buffer;
rdata[0].buffer_std = true;
@@ -945,13 +525,13 @@ formUpdateRdata(RelFileNode node, Buffer buffer,
rdata[1].next = &(rdata[2]);
rdata[2].data = (char *) todelete;
- rdata[2].len = MAXALIGN(sizeof(OffsetNumber) * ntodelete);
+ rdata[2].len = sizeof(OffsetNumber) * ntodelete;
rdata[2].buffer = buffer;
rdata[2].buffer_std = true;
- rdata[2].next = NULL;
- /* new tuples */
cur = 3;
+
+ /* new tuples */
for (i = 0; i < ituplen; i++)
{
rdata[cur - 1].next = &(rdata[cur]);
@@ -959,38 +539,26 @@ formUpdateRdata(RelFileNode node, Buffer buffer,
rdata[cur].len = IndexTupleSize(itup[i]);
rdata[cur].buffer = buffer;
rdata[cur].buffer_std = true;
- rdata[cur].next = NULL;
cur++;
}
- return rdata;
-}
-
-XLogRecPtr
-gistxlogInsertCompletion(RelFileNode node, ItemPointerData *keys, int len)
-{
- gistxlogInsertComplete xlrec;
- XLogRecData rdata[2];
- XLogRecPtr recptr;
-
- Assert(len > 0);
- xlrec.node = node;
-
- rdata[0].buffer = InvalidBuffer;
- rdata[0].data = (char *) &xlrec;
- rdata[0].len = sizeof(gistxlogInsertComplete);
- rdata[0].next = &(rdata[1]);
-
- rdata[1].buffer = InvalidBuffer;
- rdata[1].data = (char *) keys;
- rdata[1].len = sizeof(ItemPointerData) * len;
- rdata[1].next = NULL;
-
- START_CRIT_SECTION();
-
- recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_INSERT_COMPLETE, rdata);
+ /*
+ * Include a full page image of the child buffer. (This is only necessary
+ * if a checkpoint happened since the child page was split.)
+ */
+ if (BufferIsValid(leftchildbuf))
+ {
+ rdata[cur - 1].next = &(rdata[cur]);
+ rdata[cur].data = NULL;
+ rdata[cur].len = 0;
+ rdata[cur].buffer = leftchildbuf;
+ rdata[cur].buffer_std = true;
+ cur++;
+ }
+ rdata[cur - 1].next = NULL;
- END_CRIT_SECTION();
+ recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE, rdata);
+ pfree(rdata);
return recptr;
}
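A hypothetical caller (not part of this patch; buffer and array names are illustrative) follows the usual write-ahead ordering around gistXLogUpdate: modify the page inside a critical section, mark the buffer dirty, then stamp the page with the returned LSN:

    START_CRIT_SECTION();

    /* ... apply the deletions and insertions to 'page' in 'buffer' ... */
    MarkBufferDirty(buffer);

    recptr = gistXLogUpdate(rel->rd_node, buffer,
                            todelete, ntodelete,
                            itup, ituplen,
                            InvalidBuffer);    /* no child page involved */
    PageSetLSN(page, recptr);
    PageSetTLI(page, ThisTimeLineID);

    END_CRIT_SECTION();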
diff --git a/src/backend/access/hash/Makefile b/src/backend/access/hash/Makefile
index 80f9ea61e9..82297606dc 100644
--- a/src/backend/access/hash/Makefile
+++ b/src/backend/access/hash/Makefile
@@ -4,7 +4,7 @@
# Makefile for access/hash
#
# IDENTIFICATION
-# $PostgreSQL: pgsql/src/backend/access/hash/Makefile,v 1.15 2008/03/16 23:15:08 tgl Exp $
+# src/backend/access/hash/Makefile
#
#-------------------------------------------------------------------------
diff --git a/src/backend/access/hash/README b/src/backend/access/hash/README
index 026ad40bfb..cd4e058389 100644
--- a/src/backend/access/hash/README
+++ b/src/backend/access/hash/README
@@ -1,4 +1,4 @@
-$PostgreSQL: pgsql/src/backend/access/hash/README,v 1.9 2009/11/01 21:25:25 tgl Exp $
+src/backend/access/hash/README
Hash Indexing
=============
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index 6474f45940..4cb29b2bb4 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -3,12 +3,12 @@
* hash.c
* Implementation of Margo Seltzer's Hashing package for postgres.
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.117 2010/02/26 02:00:33 momjian Exp $
+ * src/backend/access/hash/hash.c
*
* NOTES
* This file contains only the public interface routines.
@@ -69,7 +69,7 @@ hashbuild(PG_FUNCTION_ARGS)
estimate_rel_size(heap, NULL, &relpages, &reltuples);
/* Initialize the hash index metadata page and initial buckets */
- num_buckets = _hash_metapinit(index, reltuples);
+ num_buckets = _hash_metapinit(index, reltuples, MAIN_FORKNUM);
/*
* If we just insert the tuples into the index in scan order, then
@@ -114,6 +114,19 @@ hashbuild(PG_FUNCTION_ARGS)
}
/*
+ * hashbuildempty() -- build an empty hash index in the initialization fork
+ */
+Datum
+hashbuildempty(PG_FUNCTION_ARGS)
+{
+ Relation index = (Relation) PG_GETARG_POINTER(0);
+
+ _hash_metapinit(index, 0, INIT_FORKNUM);
+
+ PG_RETURN_VOID();
+}
+
+/*
* Per-tuple callback from IndexBuildHeapScan
*/
static void
@@ -366,12 +379,16 @@ Datum
hashbeginscan(PG_FUNCTION_ARGS)
{
Relation rel = (Relation) PG_GETARG_POINTER(0);
- int keysz = PG_GETARG_INT32(1);
- ScanKey scankey = (ScanKey) PG_GETARG_POINTER(2);
+ int nkeys = PG_GETARG_INT32(1);
+ int norderbys = PG_GETARG_INT32(2);
IndexScanDesc scan;
HashScanOpaque so;
- scan = RelationGetIndexScan(rel, keysz, scankey);
+ /* no order by operators allowed */
+ Assert(norderbys == 0);
+
+ scan = RelationGetIndexScan(rel, nkeys, norderbys);
+
so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
so->hashso_bucket_valid = false;
so->hashso_bucket_blkno = 0;
@@ -396,26 +413,24 @@ hashrescan(PG_FUNCTION_ARGS)
{
IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
ScanKey scankey = (ScanKey) PG_GETARG_POINTER(1);
+
+ /* remaining arguments are ignored */
HashScanOpaque so = (HashScanOpaque) scan->opaque;
Relation rel = scan->indexRelation;
- /* if we are called from beginscan, so is still NULL */
- if (so)
- {
- /* release any pin we still hold */
- if (BufferIsValid(so->hashso_curbuf))
- _hash_dropbuf(rel, so->hashso_curbuf);
- so->hashso_curbuf = InvalidBuffer;
-
- /* release lock on bucket, too */
- if (so->hashso_bucket_blkno)
- _hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE);
- so->hashso_bucket_blkno = 0;
-
- /* set position invalid (this will cause _hash_first call) */
- ItemPointerSetInvalid(&(so->hashso_curpos));
- ItemPointerSetInvalid(&(so->hashso_heappos));
- }
+ /* release any pin we still hold */
+ if (BufferIsValid(so->hashso_curbuf))
+ _hash_dropbuf(rel, so->hashso_curbuf);
+ so->hashso_curbuf = InvalidBuffer;
+
+ /* release lock on bucket, too */
+ if (so->hashso_bucket_blkno)
+ _hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE);
+ so->hashso_bucket_blkno = 0;
+
+ /* set position invalid (this will cause _hash_first call) */
+ ItemPointerSetInvalid(&(so->hashso_curpos));
+ ItemPointerSetInvalid(&(so->hashso_heappos));
/* Update scan key, if a new one is given */
if (scankey && scan->numberOfKeys > 0)
@@ -423,8 +438,7 @@ hashrescan(PG_FUNCTION_ARGS)
memmove(scan->keyData,
scankey,
scan->numberOfKeys * sizeof(ScanKeyData));
- if (so)
- so->hashso_bucket_valid = false;
+ so->hashso_bucket_valid = false;
}
PG_RETURN_VOID();
diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c
index 6b99acff88..e00696eb66 100644
--- a/src/backend/access/hash/hashfunc.c
+++ b/src/backend/access/hash/hashfunc.c
@@ -3,12 +3,12 @@
* hashfunc.c
* Support functions for hash access method.
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/hash/hashfunc.c,v 1.62 2010/01/07 04:53:34 tgl Exp $
+ * src/backend/access/hash/hashfunc.c
*
* NOTES
* These functions are stored in pg_amproc. For each operator class
diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c
index 4d1cbbc8da..dd58f16840 100644
--- a/src/backend/access/hash/hashinsert.c
+++ b/src/backend/access/hash/hashinsert.c
@@ -3,12 +3,12 @@
* hashinsert.c
* Item insertion in hash tables for Postgres.
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/hash/hashinsert.c,v 1.54 2010/01/02 16:57:34 momjian Exp $
+ * src/backend/access/hash/hashinsert.c
*
*-------------------------------------------------------------------------
*/
diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c
index 3ca8d733ad..ae8b2b1cfd 100644
--- a/src/backend/access/hash/hashovfl.c
+++ b/src/backend/access/hash/hashovfl.c
@@ -3,12 +3,12 @@
* hashovfl.c
* Overflow page management code for the Postgres hash access method
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.69 2010/02/26 02:00:33 momjian Exp $
+ * src/backend/access/hash/hashovfl.c
*
* NOTES
* Overflow pages look like ordinary relation pages.
@@ -259,7 +259,7 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
* convenient to pre-mark them as "in use" too.
*/
bit = metap->hashm_spares[splitnum];
- _hash_initbitmap(rel, metap, bitno_to_blkno(metap, bit));
+ _hash_initbitmap(rel, metap, bitno_to_blkno(metap, bit), MAIN_FORKNUM);
metap->hashm_spares[splitnum]++;
}
else
@@ -280,7 +280,7 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
* with metapage write lock held; would be better to use a lock that
* doesn't block incoming searches.
*/
- newbuf = _hash_getnewbuf(rel, blkno);
+ newbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM);
metap->hashm_spares[splitnum]++;
@@ -503,7 +503,8 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf,
* All bits in the new bitmap page are set to "1", indicating "in use".
*/
void
-_hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno)
+_hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno,
+ ForkNumber forkNum)
{
Buffer buf;
Page pg;
@@ -520,7 +521,7 @@ _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno)
* page while holding the metapage lock, but this path is taken so seldom
* that it's not worth worrying about.
*/
- buf = _hash_getnewbuf(rel, blkno);
+ buf = _hash_getnewbuf(rel, blkno, forkNum);
pg = BufferGetPage(buf);
/* initialize the page's special space */
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 77b072c880..fe991cf27b 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -3,12 +3,12 @@
* hashpage.c
* Hash table page management code for the Postgres hash access method
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.83 2010/02/26 02:00:33 momjian Exp $
+ * src/backend/access/hash/hashpage.c
*
* NOTES
* Postgres hash pages look like ordinary relation pages. The opaque
@@ -183,9 +183,9 @@ _hash_getinitbuf(Relation rel, BlockNumber blkno)
* extend the index at a time.
*/
Buffer
-_hash_getnewbuf(Relation rel, BlockNumber blkno)
+_hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum)
{
- BlockNumber nblocks = RelationGetNumberOfBlocks(rel);
+ BlockNumber nblocks = RelationGetNumberOfBlocksInFork(rel, forkNum);
Buffer buf;
if (blkno == P_NEW)
@@ -197,13 +197,13 @@ _hash_getnewbuf(Relation rel, BlockNumber blkno)
/* smgr insists we use P_NEW to extend the relation */
if (blkno == nblocks)
{
- buf = ReadBuffer(rel, P_NEW);
+ buf = ReadBufferExtended(rel, forkNum, P_NEW, RBM_NORMAL, NULL);
if (BufferGetBlockNumber(buf) != blkno)
elog(ERROR, "unexpected hash relation size: %u, should be %u",
BufferGetBlockNumber(buf), blkno);
}
else
- buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO, NULL);
+ buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO, NULL);
LockBuffer(buf, HASH_WRITE);
@@ -324,7 +324,7 @@ _hash_chgbufaccess(Relation rel,
* multiple buffer locks is ignored.
*/
uint32
-_hash_metapinit(Relation rel, double num_tuples)
+_hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum)
{
HashMetaPage metap;
HashPageOpaque pageopaque;
@@ -340,7 +340,7 @@ _hash_metapinit(Relation rel, double num_tuples)
uint32 i;
/* safety check */
- if (RelationGetNumberOfBlocks(rel) != 0)
+ if (RelationGetNumberOfBlocksInFork(rel, forkNum) != 0)
elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
RelationGetRelationName(rel));
@@ -383,7 +383,7 @@ _hash_metapinit(Relation rel, double num_tuples)
* calls to occur. This ensures that the smgr level has the right idea of
* the physical index length.
*/
- metabuf = _hash_getnewbuf(rel, HASH_METAPAGE);
+ metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum);
pg = BufferGetPage(metabuf);
pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
@@ -451,7 +451,7 @@ _hash_metapinit(Relation rel, double num_tuples)
/* Allow interrupts, in case N is huge */
CHECK_FOR_INTERRUPTS();
- buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i));
+ buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i), forkNum);
pg = BufferGetPage(buf);
pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
pageopaque->hasho_prevblkno = InvalidBlockNumber;
@@ -468,7 +468,7 @@ _hash_metapinit(Relation rel, double num_tuples)
/*
* Initialize first bitmap page
*/
- _hash_initbitmap(rel, metap, num_buckets + 1);
+ _hash_initbitmap(rel, metap, num_buckets + 1, forkNum);
/* all done */
_hash_wrtbuf(rel, metabuf);
@@ -733,7 +733,7 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
MemSet(zerobuf, 0, sizeof(zerobuf));
RelationOpenSmgr(rel);
- smgrextend(rel->rd_smgr, MAIN_FORKNUM, lastblock, zerobuf, rel->rd_istemp);
+ smgrextend(rel->rd_smgr, MAIN_FORKNUM, lastblock, zerobuf, false);
return true;
}
@@ -785,7 +785,7 @@ _hash_splitbucket(Relation rel,
oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
nblkno = start_nblkno;
- nbuf = _hash_getnewbuf(rel, nblkno);
+ nbuf = _hash_getnewbuf(rel, nblkno, MAIN_FORKNUM);
npage = BufferGetPage(nbuf);
/* initialize the new bucket's primary page */
diff --git a/src/backend/access/hash/hashscan.c b/src/backend/access/hash/hashscan.c
index fd2486a556..93d63b86ea 100644
--- a/src/backend/access/hash/hashscan.c
+++ b/src/backend/access/hash/hashscan.c
@@ -3,12 +3,12 @@
* hashscan.c
* manage scans on hash tables
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/hash/hashscan.c,v 1.48 2010/01/02 16:57:34 momjian Exp $
+ * src/backend/access/hash/hashscan.c
*
*-------------------------------------------------------------------------
*/
diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c
index b1ccff673e..bf42be103f 100644
--- a/src/backend/access/hash/hashsearch.c
+++ b/src/backend/access/hash/hashsearch.c
@@ -3,12 +3,12 @@
* hashsearch.c
* search code for postgres hash tables
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/hash/hashsearch.c,v 1.59 2010/01/02 16:57:34 momjian Exp $
+ * src/backend/access/hash/hashsearch.c
*
*-------------------------------------------------------------------------
*/
diff --git a/src/backend/access/hash/hashsort.c b/src/backend/access/hash/hashsort.c
index b250e814f9..dbb9c3f39b 100644
--- a/src/backend/access/hash/hashsort.c
+++ b/src/backend/access/hash/hashsort.c
@@ -14,11 +14,11 @@
* plenty of locality of access.
*
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/hash/hashsort.c,v 1.4 2010/01/02 16:57:34 momjian Exp $
+ * src/backend/access/hash/hashsort.c
*
*-------------------------------------------------------------------------
*/
diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c
index d6676702cd..6283f4a82b 100644
--- a/src/backend/access/hash/hashutil.c
+++ b/src/backend/access/hash/hashutil.c
@@ -3,12 +3,12 @@
* hashutil.c
* Utility code for Postgres hash implementation.
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/hash/hashutil.c,v 1.62 2010/01/02 16:57:34 momjian Exp $
+ * src/backend/access/hash/hashutil.c
*
*-------------------------------------------------------------------------
*/
@@ -56,7 +56,8 @@ _hash_checkqual(IndexScanDesc scan, IndexTuple itup)
if (key->sk_flags & SK_ISNULL)
return false;
- test = FunctionCall2(&key->sk_func, datum, key->sk_argument);
+ test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
+ datum, key->sk_argument);
if (!DatumGetBool(test))
return false;
@@ -79,11 +80,13 @@ uint32
_hash_datum2hashkey(Relation rel, Datum key)
{
FmgrInfo *procinfo;
+ Oid collation;
/* XXX assumes index has only one attribute */
procinfo = index_getprocinfo(rel, 1, HASHPROC);
+ collation = rel->rd_indcollation[0];
- return DatumGetUInt32(FunctionCall1(procinfo, key));
+ return DatumGetUInt32(FunctionCall1Coll(procinfo, collation, key));
}
/*
@@ -97,6 +100,7 @@ uint32
_hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype)
{
RegProcedure hash_proc;
+ Oid collation;
/* XXX assumes index has only one attribute */
hash_proc = get_opfamily_proc(rel->rd_opfamily[0],
@@ -107,8 +111,9 @@ _hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype)
elog(ERROR, "missing support function %d(%u,%u) for index \"%s\"",
HASHPROC, keytype, keytype,
RelationGetRelationName(rel));
+ collation = rel->rd_indcollation[0];
- return DatumGetUInt32(OidFunctionCall1(hash_proc, key));
+ return DatumGetUInt32(OidFunctionCall1Coll(hash_proc, collation, key));
}
/*
diff --git a/src/backend/access/heap/Makefile b/src/backend/access/heap/Makefile
index dc33054641..b83d496bcd 100644
--- a/src/backend/access/heap/Makefile
+++ b/src/backend/access/heap/Makefile
@@ -4,7 +4,7 @@
# Makefile for access/heap
#
# IDENTIFICATION
-# $PostgreSQL: pgsql/src/backend/access/heap/Makefile,v 1.19 2008/12/03 13:05:22 heikki Exp $
+# src/backend/access/heap/Makefile
#
#-------------------------------------------------------------------------
diff --git a/src/backend/access/heap/README.HOT b/src/backend/access/heap/README.HOT
index 2f9162e52d..f12cad44e5 100644
--- a/src/backend/access/heap/README.HOT
+++ b/src/backend/access/heap/README.HOT
@@ -1,4 +1,4 @@
-$PostgreSQL: pgsql/src/backend/access/heap/README.HOT,v 1.6 2010/04/23 23:21:44 rhaas Exp $
+src/backend/access/heap/README.HOT
Heap Only Tuples (HOT)
======================
@@ -246,6 +246,12 @@ scans reasonably sized, the maximum number of line pointers per page
is arbitrarily capped at MaxHeapTuplesPerPage (the most tuples that
could fit without HOT pruning).
+Effectively, space reclamation happens during tuple retrieval when the
+page is nearly full (<10% free) and a buffer cleanup lock can be
+acquired. This means that UPDATE, DELETE, and SELECT can trigger space
+reclamation, but often not during INSERT ... VALUES because it does
+not retrieve a row.
+
VACUUM
------
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index e17b6362c3..01a492e496 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -3,12 +3,12 @@
* heapam.c
* heap access method code
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.292 2010/07/06 19:18:55 momjian Exp $
+ * src/backend/access/heap/heapam.c
*
*
* INTERFACE ROUTINES
@@ -57,6 +57,7 @@
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/lmgr.h"
+#include "storage/predicate.h"
#include "storage/procarray.h"
#include "storage/smgr.h"
#include "storage/standby.h"
@@ -124,7 +125,7 @@ initscan(HeapScanDesc scan, ScanKey key, bool is_rescan)
*
* During a rescan, don't make a new strategy object if we don't have to.
*/
- if (!scan->rs_rd->rd_istemp &&
+ if (!RelationUsesLocalBuffers(scan->rs_rd) &&
scan->rs_nblocks > NBuffers / 4)
{
allow_strat = scan->rs_allow_strat;
@@ -261,20 +262,20 @@ heapgetpage(HeapScanDesc scan, BlockNumber page)
{
if (ItemIdIsNormal(lpp))
{
+ HeapTupleData loctup;
bool valid;
+ loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
+ loctup.t_len = ItemIdGetLength(lpp);
+ ItemPointerSet(&(loctup.t_self), page, lineoff);
+
if (all_visible)
valid = true;
else
- {
- HeapTupleData loctup;
+ valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
- loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
- loctup.t_len = ItemIdGetLength(lpp);
- ItemPointerSet(&(loctup.t_self), page, lineoff);
+ CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup, buffer);
- valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
- }
if (valid)
scan->rs_vistuples[ntup++] = lineoff;
}
@@ -468,12 +469,16 @@ heapgettup(HeapScanDesc scan,
snapshot,
scan->rs_cbuf);
+ CheckForSerializableConflictOut(valid, scan->rs_rd, tuple, scan->rs_cbuf);
+
if (valid && key != NULL)
HeapKeyTest(tuple, RelationGetDescr(scan->rs_rd),
nkeys, key, valid);
if (valid)
{
+ if (!scan->rs_relpredicatelocked)
+ PredicateLockTuple(scan->rs_rd, tuple);
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
return;
}
@@ -741,12 +746,16 @@ heapgettup_pagemode(HeapScanDesc scan,
nkeys, key, valid);
if (valid)
{
+ if (!scan->rs_relpredicatelocked)
+ PredicateLockTuple(scan->rs_rd, tuple);
scan->rs_cindex = lineindex;
return;
}
}
else
{
+ if (!scan->rs_relpredicatelocked)
+ PredicateLockTuple(scan->rs_rd, tuple);
scan->rs_cindex = lineindex;
return;
}
@@ -905,7 +914,7 @@ relation_open(Oid relationId, LOCKMODE lockmode)
elog(ERROR, "could not open relation with OID %u", relationId);
/* Make note that we've accessed a temporary relation */
- if (r->rd_istemp)
+ if (RelationUsesLocalBuffers(r))
MyXactAccessedTempRel = true;
pgstat_initstats(r);
@@ -951,7 +960,7 @@ try_relation_open(Oid relationId, LOCKMODE lockmode)
elog(ERROR, "could not open relation with OID %u", relationId);
/* Make note that we've accessed a temporary relation */
- if (r->rd_istemp)
+ if (RelationUsesLocalBuffers(r))
MyXactAccessedTempRel = true;
pgstat_initstats(r);
@@ -1060,7 +1069,8 @@ relation_close(Relation relation, LOCKMODE lockmode)
*
* This is essentially relation_open plus check that the relation
* is not an index nor a composite type. (The caller should also
- * check that it's not a view before assuming it has storage.)
+ * check that it's not a view or foreign table before assuming it has
+ * storage.)
* ----------------
*/
Relation
@@ -1212,6 +1222,7 @@ heap_beginscan_internal(Relation relation, Snapshot snapshot,
scan->rs_strategy = NULL; /* set in initscan */
scan->rs_allow_strat = allow_strat;
scan->rs_allow_sync = allow_sync;
+ scan->rs_relpredicatelocked = false;
/*
* we can use page-at-a-time mode if it's an MVCC-safe snapshot
@@ -1458,6 +1469,11 @@ heap_fetch(Relation relation,
*/
valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
+ if (valid)
+ PredicateLockTuple(relation, tuple);
+
+ CheckForSerializableConflictOut(valid, relation, tuple, buffer);
+
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
if (valid)
@@ -1505,13 +1521,14 @@ heap_fetch(Relation relation,
* heap_fetch, we do not report any pgstats count; caller may do so if wanted.
*/
bool
-heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot,
- bool *all_dead)
+heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
+ Snapshot snapshot, bool *all_dead)
{
Page dp = (Page) BufferGetPage(buffer);
TransactionId prev_xmax = InvalidTransactionId;
OffsetNumber offnum;
bool at_chain_start;
+ bool valid;
if (all_dead)
*all_dead = true;
@@ -1551,6 +1568,8 @@ heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot,
heapTuple.t_data = (HeapTupleHeader) PageGetItem(dp, lp);
heapTuple.t_len = ItemIdGetLength(lp);
+ heapTuple.t_tableOid = relation->rd_id;
+ heapTuple.t_self = *tid;
/*
* Shouldn't see a HEAP_ONLY tuple at chain start.
@@ -1568,9 +1587,12 @@ heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot,
break;
/* If it's visible per the snapshot, we must return it */
- if (HeapTupleSatisfiesVisibility(&heapTuple, snapshot, buffer))
+ valid = HeapTupleSatisfiesVisibility(&heapTuple, snapshot, buffer);
+ CheckForSerializableConflictOut(valid, relation, &heapTuple, buffer);
+ if (valid)
{
ItemPointerSetOffsetNumber(tid, offnum);
+ PredicateLockTuple(relation, &heapTuple);
if (all_dead)
*all_dead = false;
return true;
@@ -1621,7 +1643,7 @@ heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot,
buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
LockBuffer(buffer, BUFFER_LOCK_SHARE);
- result = heap_hot_search_buffer(tid, buffer, snapshot, all_dead);
+ result = heap_hot_search_buffer(tid, relation, buffer, snapshot, all_dead);
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
ReleaseBuffer(buffer);
return result;
@@ -1728,6 +1750,7 @@ heap_get_latest_tid(Relation relation,
* result candidate.
*/
valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
+ CheckForSerializableConflictOut(valid, relation, &tp, buffer);
if (valid)
*tid = ctid;
@@ -1892,6 +1915,13 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
InvalidBuffer, options, bistate);
+ /*
+ * We're about to do the actual insert -- check for conflict at the
+ * relation or buffer level first, to avoid possibly having to roll back
+ * work we've just done.
+ */
+ CheckForSerializableConflictIn(relation, NULL, buffer);
+
/* NO EREPORT(ERROR) from here till changes are logged */
START_CRIT_SECTION();
@@ -1917,7 +1947,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
MarkBufferDirty(buffer);
/* XLOG stuff */
- if (!(options & HEAP_INSERT_SKIP_WAL) && !relation->rd_istemp)
+ if (!(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation))
{
xl_heap_insert xlrec;
xl_heap_header xlhdr;
@@ -2173,7 +2203,7 @@ l1:
if (crosscheck != InvalidSnapshot && result == HeapTupleMayBeUpdated)
{
- /* Perform additional check for serializable RI updates */
+ /* Perform additional check for transaction-snapshot mode RI updates */
if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
result = HeapTupleUpdated;
}
@@ -2192,6 +2222,12 @@ l1:
return result;
}
+ /*
+ * We're about to do the actual delete -- check for conflict first, to
+ * avoid possibly having to roll back work we've just done.
+ */
+ CheckForSerializableConflictIn(relation, &tp, buffer);
+
/* replace cid with a combo cid if necessary */
HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo);
@@ -2227,7 +2263,7 @@ l1:
MarkBufferDirty(buffer);
/* XLOG stuff */
- if (!relation->rd_istemp)
+ if (RelationNeedsWAL(relation))
{
xl_heap_delete xlrec;
XLogRecPtr recptr;
@@ -2525,7 +2561,7 @@ l2:
if (crosscheck != InvalidSnapshot && result == HeapTupleMayBeUpdated)
{
- /* Perform additional check for serializable RI updates */
+ /* Perform additional check for transaction-snapshot mode RI updates */
if (!HeapTupleSatisfiesVisibility(&oldtup, crosscheck, buffer))
result = HeapTupleUpdated;
}
@@ -2545,6 +2581,12 @@ l2:
return result;
}
+ /*
+ * We're about to do the actual update -- check for conflict first, to
+ * avoid possibly having to roll back work we've just done.
+ */
+ CheckForSerializableConflictIn(relation, &oldtup, buffer);
+
/* Fill in OID and transaction status data for newtup */
if (relation->rd_rel->relhasoids)
{
@@ -2690,6 +2732,16 @@ l2:
}
/*
+ * We're about to create the new tuple -- check for conflict first, to
+ * avoid possibly having to roll back work we've just done.
+ *
+ * NOTE: For a tuple insert, we only need to check for table locks, since
+ * predicate locking at the index level will cover ranges for anything
+ * except a table scan. Therefore, only provide the relation.
+ */
+ CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
+
+ /*
* At this point newbuf and buffer are both pinned and locked, and newbuf
* has enough space for the new tuple. If they are the same buffer, only
* one pin is held.
@@ -2780,7 +2832,7 @@ l2:
MarkBufferDirty(buffer);
/* XLOG stuff */
- if (!relation->rd_istemp)
+ if (RelationNeedsWAL(relation))
{
XLogRecPtr recptr = log_heap_update(relation, buffer, oldtup.t_self,
newbuf, heaptup,
@@ -3403,7 +3455,7 @@ l3:
* (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
* entries for everything anyway.)
*/
- if (!relation->rd_istemp)
+ if (RelationNeedsWAL(relation))
{
xl_heap_lock xlrec;
XLogRecPtr recptr;
@@ -3505,7 +3557,7 @@ heap_inplace_update(Relation relation, HeapTuple tuple)
MarkBufferDirty(buffer);
/* XLOG stuff */
- if (!relation->rd_istemp)
+ if (RelationNeedsWAL(relation))
{
xl_heap_inplace xlrec;
XLogRecPtr recptr;
@@ -3776,8 +3828,11 @@ heap_restrpos(HeapScanDesc scan)
}
/*
- * If 'tuple' contains any XID greater than latestRemovedXid, update
- * latestRemovedXid to the greatest one found.
+ * If 'tuple' contains any visible XID greater than latestRemovedXid,
+ * ratchet forwards latestRemovedXid to the greatest one found.
+ * This is used as the basis for generating Hot Standby conflicts, so
+ * if a tuple was never visible then removing it should not conflict
+ * with queries.
*/
void
HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
@@ -3793,13 +3848,25 @@ HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
*latestRemovedXid = xvac;
}
- if (TransactionIdPrecedes(*latestRemovedXid, xmax))
- *latestRemovedXid = xmax;
-
- if (TransactionIdPrecedes(*latestRemovedXid, xmin))
- *latestRemovedXid = xmin;
+ /*
+ * Ignore tuples inserted by an aborted transaction or if the tuple was
+ * updated/deleted by the inserting transaction.
+ *
+ * Look for a committed hint bit, or if no xmin bit is set, check clog.
+ * This needs to work on both master and standby, where it is used to
+ * assess btree delete records.
+ */
+ if ((tuple->t_infomask & HEAP_XMIN_COMMITTED) ||
+ (!(tuple->t_infomask & HEAP_XMIN_COMMITTED) &&
+ !(tuple->t_infomask & HEAP_XMIN_INVALID) &&
+ TransactionIdDidCommit(xmin)))
+ {
+ if (xmax != xmin &&
+ TransactionIdFollows(xmax, *latestRemovedXid))
+ *latestRemovedXid = xmax;
+ }
- Assert(TransactionIdIsValid(*latestRemovedXid));
+ /* *latestRemovedXid may still be invalid at end */
}
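The xmin test folded into the condition above can be read as a predicate of its own. A hypothetical helper (not part of the patch) that spells out the hint-bit-first, clog-second order:

    /*
     * True if the inserting transaction is known to have committed: trust a
     * hint bit when one is set, otherwise fall back to a clog lookup.
     */
    static bool
    xmin_is_committed(HeapTupleHeader tuple, TransactionId xmin)
    {
        if (tuple->t_infomask & HEAP_XMIN_COMMITTED)
            return true;
        if (tuple->t_infomask & HEAP_XMIN_INVALID)
            return false;        /* inserter known aborted */
        return TransactionIdDidCommit(xmin);
    }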
/*
@@ -3852,8 +3919,8 @@ log_heap_clean(Relation reln, Buffer buffer,
XLogRecPtr recptr;
XLogRecData rdata[4];
- /* Caller should not call me on a temp relation */
- Assert(!reln->rd_istemp);
+ /* Caller should not call me on a non-WAL-logged relation */
+ Assert(RelationNeedsWAL(reln));
xlrec.node = reln->rd_node;
xlrec.block = BufferGetBlockNumber(buffer);
@@ -3935,8 +4002,8 @@ log_heap_freeze(Relation reln, Buffer buffer,
XLogRecPtr recptr;
XLogRecData rdata[2];
- /* Caller should not call me on a temp relation */
- Assert(!reln->rd_istemp);
+ /* Caller should not call me on a non-WAL-logged relation */
+ Assert(RelationNeedsWAL(reln));
/* nor when there are no tuples to freeze */
Assert(offcnt > 0);
@@ -3981,8 +4048,8 @@ log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from,
XLogRecData rdata[4];
Page page = BufferGetPage(newbuf);
- /* Caller should not call me on a temp relation */
- Assert(!reln->rd_istemp);
+ /* Caller should not call me on a non-WAL-logged relation */
+ Assert(RelationNeedsWAL(reln));
if (HeapTupleIsHeapOnly(newtup))
info = XLOG_HEAP_HOT_UPDATE;
@@ -4079,8 +4146,15 @@ log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno,
recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_NEWPAGE, rdata);
- PageSetLSN(page, recptr);
- PageSetTLI(page, ThisTimeLineID);
+ /*
+ * The page may be uninitialized. If so, we can't set the LSN and TLI
+ * because that would corrupt the page.
+ */
+ if (!PageIsNew(page))
+ {
+ PageSetLSN(page, recptr);
+ PageSetTLI(page, ThisTimeLineID);
+ }
END_CRIT_SECTION();
@@ -4266,8 +4340,16 @@ heap_xlog_newpage(XLogRecPtr lsn, XLogRecord *record)
Assert(record->xl_len == SizeOfHeapNewpage + BLCKSZ);
memcpy(page, (char *) xlrec + SizeOfHeapNewpage, BLCKSZ);
- PageSetLSN(page, lsn);
- PageSetTLI(page, ThisTimeLineID);
+ /*
+ * The page may be uninitialized. If so, we can't set the LSN and TLI
+ * because that would corrupt the page.
+ */
+ if (!PageIsNew(page))
+ {
+ PageSetLSN(page, lsn);
+ PageSetTLI(page, ThisTimeLineID);
+ }
+
MarkBufferDirty(buffer);
UnlockReleaseBuffer(buffer);
}
@@ -4967,7 +5049,7 @@ heap2_desc(StringInfo buf, uint8 xl_info, char *rec)
* heap_sync - sync a heap, for use when no WAL has been written
*
* This forces the heap contents (including TOAST heap if any) down to disk.
- * If we skipped using WAL, and it's not a temp relation, we must force the
+ * If we skipped using WAL, and WAL is otherwise needed, we must force the
* relation down to disk before it's safe to commit the transaction. This
* requires writing out any dirty buffers and then doing a forced fsync.
*
@@ -4980,8 +5062,8 @@ heap2_desc(StringInfo buf, uint8 xl_info, char *rec)
void
heap_sync(Relation rel)
{
- /* temp tables never need fsync */
- if (rel->rd_istemp)
+ /* non-WAL-logged tables never need fsync */
+ if (!RelationNeedsWAL(rel))
return;
/* main heap */
diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c
index 83915ba33d..72a69e52b0 100644
--- a/src/backend/access/heap/hio.c
+++ b/src/backend/access/heap/hio.c
@@ -3,12 +3,12 @@
* hio.c
* POSTGRES heap access method input/output code.
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/heap/hio.c,v 1.78 2010/02/09 21:43:29 tgl Exp $
+ * src/backend/access/heap/hio.c
*
*-------------------------------------------------------------------------
*/
@@ -150,7 +150,7 @@ ReadBufferBI(Relation relation, BlockNumber targetBlock,
Buffer
RelationGetBufferForTuple(Relation relation, Size len,
Buffer otherBuffer, int options,
- struct BulkInsertStateData *bistate)
+ struct BulkInsertStateData * bistate)
{
bool use_fsm = !(options & HEAP_INSERT_SKIP_FSM);
Buffer buffer = InvalidBuffer;
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index 3332e085b8..0cfa866108 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -3,12 +3,12 @@
* pruneheap.c
* heap page pruning and HOT-chain management code
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/heap/pruneheap.c,v 1.25 2010/07/06 19:18:55 momjian Exp $
+ * src/backend/access/heap/pruneheap.c
*
*-------------------------------------------------------------------------
*/
@@ -233,11 +233,10 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
/*
* Emit a WAL HEAP_CLEAN record showing what we did
*/
- if (!relation->rd_istemp)
+ if (RelationNeedsWAL(relation))
{
XLogRecPtr recptr;
- Assert(TransactionIdIsValid(prstate.latestRemovedXid));
recptr = log_heap_clean(relation, buffer,
prstate.redirected, prstate.nredirected,
prstate.nowdead, prstate.ndead,
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index e35fbeb9b1..e56140950a 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -92,11 +92,11 @@
* heap's TOAST table will go through the normal bufmgr.
*
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994-5, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/heap/rewriteheap.c,v 1.22 2010/04/28 16:10:40 heikki Exp $
+ * src/backend/access/heap/rewriteheap.c
*
*-------------------------------------------------------------------------
*/
@@ -131,7 +131,7 @@ typedef struct RewriteStateData
* them */
HTAB *rs_unresolved_tups; /* unmatched A tuples */
HTAB *rs_old_new_tid_map; /* unmatched B tuples */
-} RewriteStateData;
+} RewriteStateData;
/*
* The lookup keys for the hash tables are tuple TID and xmin (we must check
@@ -254,8 +254,6 @@ end_heap_rewrite(RewriteState state)
/*
* Write any remaining tuples in the UnresolvedTups table. If we have any
* left, they should in fact be dead, but let's err on the safe side.
- *
- * XXX this really is a waste of code no?
*/
hash_seq_init(&seq_status, state->rs_unresolved_tups);
@@ -279,8 +277,8 @@ end_heap_rewrite(RewriteState state)
}
/*
- * If the rel isn't temp, must fsync before commit. We use heap_sync to
- * ensure that the toast table gets fsync'd too.
+ * If the rel is WAL-logged, must fsync before commit. We use heap_sync
+ * to ensure that the toast table gets fsync'd too.
*
* It's obvious that we must do this when not WAL-logging. It's less
* obvious that we have to do it even if we did WAL-log the pages. The
@@ -289,7 +287,7 @@ end_heap_rewrite(RewriteState state)
* occurring during the rewriteheap operation won't have fsync'd data we
* wrote before the checkpoint.
*/
- if (!state->rs_new_rel->rd_istemp)
+ if (RelationNeedsWAL(state->rs_new_rel))
heap_sync(state->rs_new_rel);
/* Deleting the context frees everything */
@@ -502,8 +500,12 @@ rewrite_heap_tuple(RewriteState state,
* Register a dead tuple with an ongoing rewrite. Dead tuples are not
* copied to the new table, but we still make note of them so that we
* can release some resources earlier.
+ *
+ * Returns true if a tuple was removed from the unresolved_tups table.
+ * This indicates that the tuple, previously thought to be "recently dead",
+ * is now known to be really dead and won't be written to the output.
*/
-void
+bool
rewrite_heap_dead_tuple(RewriteState state, HeapTuple old_tuple)
{
/*
@@ -539,7 +541,10 @@ rewrite_heap_dead_tuple(RewriteState state, HeapTuple old_tuple)
hash_search(state->rs_unresolved_tups, &hashkey,
HASH_REMOVE, &found);
Assert(found);
+ return true;
}
+
+ return false;
}
/*
@@ -636,7 +641,7 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
}
/* And now we can insert the tuple into the page */
- newoff = PageAddItem(page, (Item) heaptup->t_data, len,
+ newoff = PageAddItem(page, (Item) heaptup->t_data, heaptup->t_len,
InvalidOffsetNumber, false, true);
if (newoff == InvalidOffsetNumber)
elog(ERROR, "failed to add tuple");
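
The new bool result of rewrite_heap_dead_tuple() lets a caller that keeps per-tuple statistics reclassify a tuple it had already counted as kept. A hypothetical caller (counter names invented for illustration) might use it like this:

    /* Hypothetical use of the new return value; the counters are illustrative. */
    if (rewrite_heap_dead_tuple(rwstate, tuple))
    {
        /*
         * The tuple had been passed to rewrite_heap_tuple() earlier as
         * "recently dead" and counted as kept; undo that accounting now
         * that it is known to be really dead.
         */
        tuples_kept--;
        tuples_removed++;
    }
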
diff --git a/src/backend/access/heap/syncscan.c b/src/backend/access/heap/syncscan.c
index 8dc2bb476d..957d1a12f3 100644
--- a/src/backend/access/heap/syncscan.c
+++ b/src/backend/access/heap/syncscan.c
@@ -36,11 +36,11 @@
* ss_report_location - update current scan location
*
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/heap/syncscan.c,v 1.7 2010/01/02 16:57:35 momjian Exp $
+ * src/backend/access/heap/syncscan.c
*
*-------------------------------------------------------------------------
*/
diff --git a/src/backend/access/heap/tuptoaster.c b/src/backend/access/heap/tuptoaster.c
index 7518db16c8..4f4dd69291 100644
--- a/src/backend/access/heap/tuptoaster.c
+++ b/src/backend/access/heap/tuptoaster.c
@@ -4,11 +4,11 @@
* Support routines for external and compressed storage of
* variable size attributes.
*
- * Copyright (c) 2000-2010, PostgreSQL Global Development Group
+ * Copyright (c) 2000-2011, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/heap/tuptoaster.c,v 1.98 2010/02/26 02:00:33 momjian Exp $
+ * src/backend/access/heap/tuptoaster.c
*
*
* INTERFACE ROUTINES
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index 88a0c74e32..58bab7df10 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -3,12 +3,12 @@
* visibilitymap.c
* bitmap for tracking visibility of heap tuples
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/heap/visibilitymap.c,v 1.10 2010/04/23 23:21:44 rhaas Exp $
+ * src/backend/access/heap/visibilitymap.c
*
* INTERFACE ROUTINES
* visibilitymap_clear - clear a bit in the visibility map
@@ -373,8 +373,7 @@ visibilitymap_truncate(Relation rel, BlockNumber nheapblocks)
}
/* Truncate the unused VM pages, and send smgr inval message */
- smgrtruncate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, newnblocks,
- rel->rd_istemp);
+ smgrtruncate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, newnblocks);
/*
* We might as well update the local smgr_vm_nblocks setting. smgrtruncate
@@ -478,7 +477,7 @@ vm_extend(Relation rel, BlockNumber vm_nblocks)
while (vm_nblocks_now < vm_nblocks)
{
smgrextend(rel->rd_smgr, VISIBILITYMAP_FORKNUM, vm_nblocks_now,
- (char *) pg, rel->rd_istemp);
+ (char *) pg, false);
vm_nblocks_now++;
}
diff --git a/src/backend/access/index/Makefile b/src/backend/access/index/Makefile
index 6357a43f89..96490db032 100644
--- a/src/backend/access/index/Makefile
+++ b/src/backend/access/index/Makefile
@@ -4,7 +4,7 @@
# Makefile for access/index
#
# IDENTIFICATION
-# $PostgreSQL: pgsql/src/backend/access/index/Makefile,v 1.14 2008/02/19 10:30:06 petere Exp $
+# src/backend/access/index/Makefile
#
#-------------------------------------------------------------------------
diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c
index d95fd90a42..db04e26a65 100644
--- a/src/backend/access/index/genam.c
+++ b/src/backend/access/index/genam.c
@@ -3,12 +3,12 @@
* genam.c
* general index access method routines
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/index/genam.c,v 1.81 2010/02/26 02:00:33 momjian Exp $
+ * src/backend/access/index/genam.c
*
* NOTES
* many of the old access method routines have been turned into
@@ -57,22 +57,20 @@
/* ----------------
* RelationGetIndexScan -- Create and fill an IndexScanDesc.
*
- * This routine creates an index scan structure and sets its contents
- * up correctly. This routine calls AMrescan to set up the scan with
- * the passed key.
+ * This routine creates an index scan structure and sets up initial
+ * contents for it.
*
* Parameters:
* indexRelation -- index relation for scan.
- * nkeys -- count of scan keys.
- * key -- array of scan keys to restrict the index scan.
+ * nkeys -- count of scan keys (index qual conditions).
+ * norderbys -- count of index order-by operators.
*
* Returns:
* An initialized IndexScanDesc.
* ----------------
*/
IndexScanDesc
-RelationGetIndexScan(Relation indexRelation,
- int nkeys, ScanKey key)
+RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
{
IndexScanDesc scan;
@@ -82,15 +80,19 @@ RelationGetIndexScan(Relation indexRelation,
scan->indexRelation = indexRelation;
scan->xs_snapshot = SnapshotNow; /* may be set later */
scan->numberOfKeys = nkeys;
+ scan->numberOfOrderBys = norderbys;
/*
- * We allocate the key space here, but the AM is responsible for actually
- * filling it from the passed key array.
+ * We allocate key workspace here, but it won't get filled until amrescan.
*/
if (nkeys > 0)
scan->keyData = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
else
scan->keyData = NULL;
+ if (norderbys > 0)
+ scan->orderByData = (ScanKey) palloc(sizeof(ScanKeyData) * norderbys);
+ else
+ scan->orderByData = NULL;
/*
* During recovery we ignore killed tuples and don't bother to kill them
@@ -115,11 +117,6 @@ RelationGetIndexScan(Relation indexRelation,
scan->xs_next_hot = InvalidOffsetNumber;
scan->xs_prev_xmax = InvalidTransactionId;
- /*
- * Let the AM fill in the key and any opaque data it wants.
- */
- index_rescan(scan, key);
-
return scan;
}
@@ -140,6 +137,8 @@ IndexScanEnd(IndexScanDesc scan)
{
if (scan->keyData != NULL)
pfree(scan->keyData);
+ if (scan->orderByData != NULL)
+ pfree(scan->orderByData);
pfree(scan);
}
@@ -286,7 +285,8 @@ systable_beginscan(Relation heapRelation,
}
sysscan->iscan = index_beginscan(heapRelation, irel,
- snapshot, nkeys, key);
+ snapshot, nkeys, 0);
+ index_rescan(sysscan->iscan, key, nkeys, NULL, 0);
sysscan->scan = NULL;
}
else
@@ -450,7 +450,8 @@ systable_beginscan_ordered(Relation heapRelation,
}
sysscan->iscan = index_beginscan(heapRelation, indexRelation,
- snapshot, nkeys, key);
+ snapshot, nkeys, 0);
+ index_rescan(sysscan->iscan, key, nkeys, NULL, 0);
sysscan->scan = NULL;
return sysscan;
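
The same two-step convention now applies to every caller of the generic index-scan API: index_beginscan() receives only the key counts, and the keys themselves are installed (and can later be replaced) with index_rescan(). A minimal sketch, assuming an already-built ScanKey array skeys of length nkeys:

    /* Start the scan: only the counts are passed here. */
    scan = index_beginscan(heapRel, indexRel, snapshot,
                           nkeys, 0 /* norderbys */ );

    /* Install the actual keys; the counts must match index_beginscan. */
    index_rescan(scan, skeys, nkeys, NULL, 0);

    while ((tuple = index_getnext(scan, ForwardScanDirection)) != NULL)
    {
        /* process the returned heap tuple */
    }

    index_endscan(scan);
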
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 3e7331ae7b..0208765964 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -3,12 +3,12 @@
* indexam.c
* general index access method routines
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/index/indexam.c,v 1.118 2010/02/26 02:00:34 momjian Exp $
+ * src/backend/access/index/indexam.c
*
* INTERFACE ROUTINES
* index_open - open an index relation by relation OID
@@ -64,9 +64,12 @@
#include "access/relscan.h"
#include "access/transam.h"
+#include "access/xact.h"
+#include "catalog/index.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
#include "storage/lmgr.h"
+#include "storage/predicate.h"
#include "utils/relcache.h"
#include "utils/snapmgr.h"
#include "utils/tqual.h"
@@ -74,12 +77,21 @@
/* ----------------------------------------------------------------
* macros used in index_ routines
+ *
+ * Note: the ReindexIsProcessingIndex() check in RELATION_CHECKS is there
+ * to check that we don't try to scan or do retail insertions into an index
+ * that is currently being rebuilt or pending rebuild. This helps to catch
+ * things that don't work when reindexing system catalogs. The assertion
+ * doesn't prevent the actual rebuild because we don't use RELATION_CHECKS
+ * when calling the index AM's ambuild routine, and there is no reason for
+ * ambuild to call its subsidiary routines through this file.
* ----------------------------------------------------------------
*/
#define RELATION_CHECKS \
( \
AssertMacro(RelationIsValid(indexRelation)), \
- AssertMacro(PointerIsValid(indexRelation->rd_am)) \
+ AssertMacro(PointerIsValid(indexRelation->rd_am)), \
+ AssertMacro(!ReindexIsProcessingIndex(RelationGetRelid(indexRelation))) \
)
#define SCAN_CHECKS \
@@ -114,7 +126,7 @@ do { \
} while(0)
static IndexScanDesc index_beginscan_internal(Relation indexRelation,
- int nkeys, ScanKey key);
+ int nkeys, int norderbys);
/* ----------------------------------------------------------------
@@ -192,6 +204,11 @@ index_insert(Relation indexRelation,
RELATION_CHECKS;
GET_REL_PROCEDURE(aminsert);
+ if (!(indexRelation->rd_am->ampredlocks))
+ CheckForSerializableConflictIn(indexRelation,
+ (HeapTuple) NULL,
+ InvalidBuffer);
+
/*
* have the am's insert proc do all the work.
*/
@@ -213,11 +230,11 @@ IndexScanDesc
index_beginscan(Relation heapRelation,
Relation indexRelation,
Snapshot snapshot,
- int nkeys, ScanKey key)
+ int nkeys, int norderbys)
{
IndexScanDesc scan;
- scan = index_beginscan_internal(indexRelation, nkeys, key);
+ scan = index_beginscan_internal(indexRelation, nkeys, norderbys);
/*
* Save additional parameters into the scandesc. Everything else was set
@@ -238,11 +255,11 @@ index_beginscan(Relation heapRelation,
IndexScanDesc
index_beginscan_bitmap(Relation indexRelation,
Snapshot snapshot,
- int nkeys, ScanKey key)
+ int nkeys)
{
IndexScanDesc scan;
- scan = index_beginscan_internal(indexRelation, nkeys, key);
+ scan = index_beginscan_internal(indexRelation, nkeys, 0);
/*
* Save additional parameters into the scandesc. Everything else was set
@@ -258,7 +275,7 @@ index_beginscan_bitmap(Relation indexRelation,
*/
static IndexScanDesc
index_beginscan_internal(Relation indexRelation,
- int nkeys, ScanKey key)
+ int nkeys, int norderbys)
{
IndexScanDesc scan;
FmgrInfo *procedure;
@@ -266,6 +283,9 @@ index_beginscan_internal(Relation indexRelation,
RELATION_CHECKS;
GET_REL_PROCEDURE(ambeginscan);
+ if (!(indexRelation->rd_am->ampredlocks))
+ PredicateLockRelation(indexRelation);
+
/*
* We hold a reference count to the relcache entry throughout the scan.
*/
@@ -278,7 +298,7 @@ index_beginscan_internal(Relation indexRelation,
DatumGetPointer(FunctionCall3(procedure,
PointerGetDatum(indexRelation),
Int32GetDatum(nkeys),
- PointerGetDatum(key)));
+ Int32GetDatum(norderbys)));
return scan;
}
@@ -286,23 +306,28 @@ index_beginscan_internal(Relation indexRelation,
/* ----------------
* index_rescan - (re)start a scan of an index
*
- * The caller may specify a new set of scankeys (but the number of keys
- * cannot change). To restart the scan without changing keys, pass NULL
- * for the key array.
- *
- * Note that this is also called when first starting an indexscan;
- * see RelationGetIndexScan. Keys *must* be passed in that case,
- * unless scan->numberOfKeys is zero.
+ * During a restart, the caller may specify a new set of scankeys and/or
+ * orderbykeys; but the number of keys cannot differ from what index_beginscan
+ * was told. (Later we might relax that to "must not exceed", but currently
+ * the index AMs tend to assume that scan->numberOfKeys is what to believe.)
+ * To restart the scan without changing keys, pass NULL for the key arrays.
+ * (Of course, keys *must* be passed on the first call, unless
+ * scan->numberOfKeys is zero.)
* ----------------
*/
void
-index_rescan(IndexScanDesc scan, ScanKey key)
+index_rescan(IndexScanDesc scan,
+ ScanKey keys, int nkeys,
+ ScanKey orderbys, int norderbys)
{
FmgrInfo *procedure;
SCAN_CHECKS;
GET_SCAN_PROCEDURE(amrescan);
+ Assert(nkeys == scan->numberOfKeys);
+ Assert(norderbys == scan->numberOfOrderBys);
+
/* Release any held pin on a heap page */
if (BufferIsValid(scan->xs_cbuf))
{
@@ -314,9 +339,12 @@ index_rescan(IndexScanDesc scan, ScanKey key)
scan->kill_prior_tuple = false; /* for safety */
- FunctionCall2(procedure,
+ FunctionCall5(procedure,
PointerGetDatum(scan),
- PointerGetDatum(key));
+ PointerGetDatum(keys),
+ Int32GetDatum(nkeys),
+ PointerGetDatum(orderbys),
+ Int32GetDatum(norderbys));
}
/* ----------------
@@ -515,6 +543,7 @@ index_getnext(IndexScanDesc scan, ScanDirection direction)
{
ItemId lp;
ItemPointer ctid;
+ bool valid;
/* check for bogus TID */
if (offnum < FirstOffsetNumber ||
@@ -569,8 +598,13 @@ index_getnext(IndexScanDesc scan, ScanDirection direction)
break;
/* If it's visible per the snapshot, we must return it */
- if (HeapTupleSatisfiesVisibility(heapTuple, scan->xs_snapshot,
- scan->xs_cbuf))
+ valid = HeapTupleSatisfiesVisibility(heapTuple, scan->xs_snapshot,
+ scan->xs_cbuf);
+
+ CheckForSerializableConflictOut(valid, scan->heapRelation,
+ heapTuple, scan->xs_cbuf);
+
+ if (valid)
{
/*
* If the snapshot is MVCC, we know that it could accept at
@@ -590,6 +624,8 @@ index_getnext(IndexScanDesc scan, ScanDirection direction)
else
scan->xs_next_hot = InvalidOffsetNumber;
+ PredicateLockTuple(scan->heapRelation, heapTuple);
+
LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);
pgstat_count_heap_fetch(scan->indexRelation);
diff --git a/src/backend/access/nbtree/Makefile b/src/backend/access/nbtree/Makefile
index 676ae3c270..2d76d648e0 100644
--- a/src/backend/access/nbtree/Makefile
+++ b/src/backend/access/nbtree/Makefile
@@ -4,7 +4,7 @@
# Makefile for access/nbtree
#
# IDENTIFICATION
-# $PostgreSQL: pgsql/src/backend/access/nbtree/Makefile,v 1.16 2008/02/19 10:30:06 petere Exp $
+# src/backend/access/nbtree/Makefile
#
#-------------------------------------------------------------------------
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index 57d6308ada..561ffbb9d4 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -1,4 +1,4 @@
-$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.22 2010/02/08 04:33:53 tgl Exp $
+src/backend/access/nbtree/README
Btree Indexing
==============
@@ -154,7 +154,7 @@ even pages that don't contain any deletable tuples. This guarantees that
the btbulkdelete call cannot return while any indexscan is still holding
a copy of a deleted index tuple. Note that this requirement does not say
that btbulkdelete must visit the pages in any particular order. (See also
-on-the-fly deletion, below.)
+on-the-fly deletion, below.)
There is no such interlocking for deletion of items in internal pages,
since backends keep no lock nor pin on a page they have descended past.
diff --git a/src/backend/access/nbtree/nbtcompare.c b/src/backend/access/nbtree/nbtcompare.c
index 13c127a93c..23f2b61fe9 100644
--- a/src/backend/access/nbtree/nbtcompare.c
+++ b/src/backend/access/nbtree/nbtcompare.c
@@ -3,12 +3,12 @@
* nbtcompare.c
* Comparison functions for btree access method.
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtcompare.c,v 1.58 2010/01/02 16:57:35 momjian Exp $
+ * src/backend/access/nbtree/nbtcompare.c
*
* NOTES
*
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index cd70a4c73e..d758659c31 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -3,12 +3,12 @@
* nbtinsert.c
* Item insertion in Lehman and Yao btrees for Postgres.
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.178 2010/03/28 09:27:01 sriggs Exp $
+ * src/backend/access/nbtree/nbtinsert.c
*
*-------------------------------------------------------------------------
*/
@@ -21,6 +21,7 @@
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/lmgr.h"
+#include "storage/predicate.h"
#include "utils/inval.h"
#include "utils/tqual.h"
@@ -74,9 +75,8 @@ static OffsetNumber _bt_findsplitloc(Relation rel, Page page,
static void _bt_checksplitloc(FindSplitData *state,
OffsetNumber firstoldonright, bool newitemonleft,
int dataitemstoleft, Size firstoldonrightsz);
-static void _bt_pgaddtup(Relation rel, Page page,
- Size itemsize, IndexTuple itup,
- OffsetNumber itup_off, const char *where);
+static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
+ OffsetNumber itup_off);
static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
int keysz, ScanKey scankey);
static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel);
@@ -175,6 +175,14 @@ top:
if (checkUnique != UNIQUE_CHECK_EXISTING)
{
+ /*
+ * The only conflict predicate locking cares about for indexes is when
+ * an index tuple insert conflicts with an existing lock. Since the
+ * actual location of the insert is hard to predict because of the
+ * random search used to prevent O(N^2) performance when there are
+ * many duplicate entries, we can just use the "first valid" page.
+ */
+ CheckForSerializableConflictIn(rel, NULL, buf);
/* do the insertion */
_bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup, heapRel);
_bt_insertonpg(rel, buf, stack, itup, offset, false);
@@ -697,6 +705,9 @@ _bt_insertonpg(Relation rel,
/* split the buffer into left and right halves */
rbuf = _bt_split(rel, buf, firstright,
newitemoff, itemsz, itup, newitemonleft);
+ PredicateLockPageSplit(rel,
+ BufferGetBlockNumber(buf),
+ BufferGetBlockNumber(rbuf));
/*----------
* By here,
@@ -753,7 +764,9 @@ _bt_insertonpg(Relation rel,
/* Do the update. No ereport(ERROR) until changes are logged */
START_CRIT_SECTION();
- _bt_pgaddtup(rel, page, itemsz, itup, newitemoff, "page");
+ if (!_bt_pgaddtup(page, itemsz, itup, newitemoff))
+ elog(PANIC, "failed to add new item to block %u in index \"%s\"",
+ itup_blkno, RelationGetRelationName(rel));
MarkBufferDirty(buf);
@@ -765,7 +778,7 @@ _bt_insertonpg(Relation rel,
}
/* XLOG stuff */
- if (!rel->rd_istemp)
+ if (RelationNeedsWAL(rel))
{
xl_btree_insert xlrec;
BlockNumber xldownlink;
@@ -879,6 +892,8 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
Page origpage;
Page leftpage,
rightpage;
+ BlockNumber origpagenumber,
+ rightpagenumber;
BTPageOpaque ropaque,
lopaque,
oopaque;
@@ -894,11 +909,27 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
OffsetNumber i;
bool isroot;
+ /* Acquire a new page to split into */
rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
+
+ /*
+ * origpage is the original page to be split. leftpage is a temporary
+ * buffer that receives the left-sibling data, which will be copied back
+ * into origpage on success. rightpage is the new page that receives the
+ * right-sibling data. If we fail before reaching the critical section,
+ * origpage hasn't been modified and leftpage is only workspace. In
+ * principle we shouldn't need to worry about rightpage either, because it
+ * hasn't been linked into the btree page structure; but to avoid leaving
+ * possibly-confusing junk behind, we are careful to rewrite rightpage as
+ * zeroes before throwing any error.
+ */
origpage = BufferGetPage(buf);
leftpage = PageGetTempPage(origpage);
rightpage = BufferGetPage(rbuf);
+ origpagenumber = BufferGetBlockNumber(buf);
+ rightpagenumber = BufferGetBlockNumber(rbuf);
+
_bt_pageinit(leftpage, BufferGetPageSize(buf));
/* rightpage was already initialized by _bt_getbuf */
@@ -923,8 +954,8 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
ropaque->btpo_flags = lopaque->btpo_flags;
lopaque->btpo_prev = oopaque->btpo_prev;
- lopaque->btpo_next = BufferGetBlockNumber(rbuf);
- ropaque->btpo_prev = BufferGetBlockNumber(buf);
+ lopaque->btpo_next = rightpagenumber;
+ ropaque->btpo_prev = origpagenumber;
ropaque->btpo_next = oopaque->btpo_next;
lopaque->btpo.level = ropaque->btpo.level = oopaque->btpo.level;
/* Since we already have write-lock on both pages, ok to read cycleid */
@@ -947,9 +978,12 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
item = (IndexTuple) PageGetItem(origpage, itemid);
if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
false, false) == InvalidOffsetNumber)
- elog(PANIC, "failed to add hikey to the right sibling"
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ elog(ERROR, "failed to add hikey to the right sibling"
" while splitting block %u of index \"%s\"",
- BufferGetBlockNumber(buf), RelationGetRelationName(rel));
+ origpagenumber, RelationGetRelationName(rel));
+ }
rightoff = OffsetNumberNext(rightoff);
}
@@ -974,9 +1008,12 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
}
if (PageAddItem(leftpage, (Item) item, itemsz, leftoff,
false, false) == InvalidOffsetNumber)
- elog(PANIC, "failed to add hikey to the left sibling"
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ elog(ERROR, "failed to add hikey to the left sibling"
" while splitting block %u of index \"%s\"",
- BufferGetBlockNumber(buf), RelationGetRelationName(rel));
+ origpagenumber, RelationGetRelationName(rel));
+ }
leftoff = OffsetNumberNext(leftoff);
/*
@@ -998,14 +1035,24 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
{
if (newitemonleft)
{
- _bt_pgaddtup(rel, leftpage, newitemsz, newitem, leftoff,
- "left sibling");
+ if (!_bt_pgaddtup(leftpage, newitemsz, newitem, leftoff))
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ elog(ERROR, "failed to add new item to the left sibling"
+ " while splitting block %u of index \"%s\"",
+ origpagenumber, RelationGetRelationName(rel));
+ }
leftoff = OffsetNumberNext(leftoff);
}
else
{
- _bt_pgaddtup(rel, rightpage, newitemsz, newitem, rightoff,
- "right sibling");
+ if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff))
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ elog(ERROR, "failed to add new item to the right sibling"
+ " while splitting block %u of index \"%s\"",
+ origpagenumber, RelationGetRelationName(rel));
+ }
rightoff = OffsetNumberNext(rightoff);
}
}
@@ -1013,14 +1060,24 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
/* decide which page to put it on */
if (i < firstright)
{
- _bt_pgaddtup(rel, leftpage, itemsz, item, leftoff,
- "left sibling");
+ if (!_bt_pgaddtup(leftpage, itemsz, item, leftoff))
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ elog(ERROR, "failed to add old item to the left sibling"
+ " while splitting block %u of index \"%s\"",
+ origpagenumber, RelationGetRelationName(rel));
+ }
leftoff = OffsetNumberNext(leftoff);
}
else
{
- _bt_pgaddtup(rel, rightpage, itemsz, item, rightoff,
- "right sibling");
+ if (!_bt_pgaddtup(rightpage, itemsz, item, rightoff))
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ elog(ERROR, "failed to add old item to the right sibling"
+ " while splitting block %u of index \"%s\"",
+ origpagenumber, RelationGetRelationName(rel));
+ }
rightoff = OffsetNumberNext(rightoff);
}
}
@@ -1034,8 +1091,13 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
* not be splitting the page).
*/
Assert(!newitemonleft);
- _bt_pgaddtup(rel, rightpage, newitemsz, newitem, rightoff,
- "right sibling");
+ if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff))
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ elog(ERROR, "failed to add new item to the right sibling"
+ " while splitting block %u of index \"%s\"",
+ origpagenumber, RelationGetRelationName(rel));
+ }
rightoff = OffsetNumberNext(rightoff);
}
@@ -1047,16 +1109,19 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
* neighbors.
*/
- if (!P_RIGHTMOST(ropaque))
+ if (!P_RIGHTMOST(oopaque))
{
- sbuf = _bt_getbuf(rel, ropaque->btpo_next, BT_WRITE);
+ sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
spage = BufferGetPage(sbuf);
sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
- if (sopaque->btpo_prev != ropaque->btpo_prev)
- elog(PANIC, "right sibling's left-link doesn't match: "
+ if (sopaque->btpo_prev != origpagenumber)
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ elog(ERROR, "right sibling's left-link doesn't match: "
"block %u links to %u instead of expected %u in index \"%s\"",
- ropaque->btpo_next, sopaque->btpo_prev, ropaque->btpo_prev,
+ oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
RelationGetRelationName(rel));
+ }
/*
* Check to see if we can set the SPLIT_END flag in the right-hand
@@ -1081,8 +1146,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
*
* NO EREPORT(ERROR) till right sibling is updated. We can get away with
* not starting the critical section till here because we haven't been
- * scribbling on the original page yet, and we don't care about the new
- * sibling until it's linked into the btree.
+ * scribbling on the original page yet; see comments above.
*/
START_CRIT_SECTION();
@@ -1094,24 +1158,26 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
* (in the page management code) that the center of a page always be
* clean, and the most efficient way to guarantee this is just to compact
* the data by reinserting it into a new left page. (XXX the latter
- * comment is probably obsolete.)
+ * comment is probably obsolete; but in any case it's good to not scribble
+ * on the original page until we enter the critical section.)
*
* We need to do this before writing the WAL record, so that XLogInsert
* can WAL log an image of the page if necessary.
*/
PageRestoreTempPage(leftpage, origpage);
+ /* leftpage, lopaque must not be used below here */
MarkBufferDirty(buf);
MarkBufferDirty(rbuf);
if (!P_RIGHTMOST(ropaque))
{
- sopaque->btpo_prev = BufferGetBlockNumber(rbuf);
+ sopaque->btpo_prev = rightpagenumber;
MarkBufferDirty(sbuf);
}
/* XLOG stuff */
- if (!rel->rd_istemp)
+ if (RelationNeedsWAL(rel))
{
xl_btree_split xlrec;
uint8 xlinfo;
@@ -1120,8 +1186,8 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
XLogRecData *lastrdata;
xlrec.node = rel->rd_node;
- xlrec.leftsib = BufferGetBlockNumber(buf);
- xlrec.rightsib = BufferGetBlockNumber(rbuf);
+ xlrec.leftsib = origpagenumber;
+ xlrec.rightsib = rightpagenumber;
xlrec.rnext = ropaque->btpo_next;
xlrec.level = ropaque->btpo.level;
xlrec.firstright = firstright;
@@ -1860,7 +1926,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
MarkBufferDirty(metabuf);
/* XLOG stuff */
- if (!rel->rd_istemp)
+ if (RelationNeedsWAL(rel))
{
xl_btree_newroot xlrec;
XLogRecPtr recptr;
@@ -1920,13 +1986,11 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
* we insert the tuples in order, so that the given itup_off does
* represent the final position of the tuple!
*/
-static void
-_bt_pgaddtup(Relation rel,
- Page page,
+static bool
+_bt_pgaddtup(Page page,
Size itemsize,
IndexTuple itup,
- OffsetNumber itup_off,
- const char *where)
+ OffsetNumber itup_off)
{
BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
IndexTupleData trunctuple;
@@ -1941,8 +2005,9 @@ _bt_pgaddtup(Relation rel,
if (PageAddItem(page, (Item) itup, itemsize, itup_off,
false, false) == InvalidOffsetNumber)
- elog(PANIC, "failed to add item to the %s in index \"%s\"",
- where, RelationGetRelationName(rel));
+ return false;
+
+ return true;
}
/*
@@ -1978,9 +2043,10 @@ _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
if (isNull || (scankey->sk_flags & SK_ISNULL))
return false;
- result = DatumGetInt32(FunctionCall2(&scankey->sk_func,
- datum,
- scankey->sk_argument));
+ result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
+ scankey->sk_collation,
+ datum,
+ scankey->sk_argument));
if (result != 0)
return false;
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 5e9e7b9270..58f47e7b64 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -4,12 +4,12 @@
* BTree-specific page management code for the Postgres btree access
* method.
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.123 2010/07/06 19:18:55 momjian Exp $
+ * src/backend/access/nbtree/nbtpage.c
*
* NOTES
* Postgres btree pages look like ordinary relation pages. The opaque
@@ -29,6 +29,7 @@
#include "storage/freespace.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
+#include "storage/predicate.h"
#include "utils/inval.h"
#include "utils/snapmgr.h"
@@ -224,7 +225,7 @@ _bt_getroot(Relation rel, int access)
MarkBufferDirty(metabuf);
/* XLOG stuff */
- if (!rel->rd_istemp)
+ if (RelationNeedsWAL(rel))
{
xl_btree_newroot xlrec;
XLogRecPtr recptr;
@@ -452,7 +453,7 @@ _bt_checkpage(Relation rel, Buffer buf)
static void
_bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid)
{
- if (rel->rd_istemp)
+ if (!RelationNeedsWAL(rel))
return;
/* No ereport(ERROR) until changes are logged */
@@ -465,7 +466,6 @@ _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedX
/* XLOG stuff */
{
- XLogRecPtr recptr;
XLogRecData rdata[1];
xl_btree_reuse_page xlrec_reuse;
@@ -477,7 +477,7 @@ _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedX
rdata[0].buffer = InvalidBuffer;
rdata[0].next = NULL;
- recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE, rdata);
+ XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE, rdata);
/*
* We don't do PageSetLSN or PageSetTLI here because we're about
@@ -751,7 +751,7 @@ _bt_delitems_vacuum(Relation rel, Buffer buf,
MarkBufferDirty(buf);
/* XLOG stuff */
- if (!rel->rd_istemp)
+ if (RelationNeedsWAL(rel))
{
XLogRecPtr recptr;
XLogRecData rdata[2];
@@ -829,7 +829,7 @@ _bt_delitems_delete(Relation rel, Buffer buf,
MarkBufferDirty(buf);
/* XLOG stuff */
- if (!rel->rd_istemp)
+ if (RelationNeedsWAL(rel))
{
XLogRecPtr recptr;
XLogRecData rdata[3];
@@ -1175,6 +1175,19 @@ _bt_pagedel(Relation rel, Buffer buf, BTStack stack)
*/
rightsib = opaque->btpo_next;
rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
+ page = BufferGetPage(rbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (opaque->btpo_prev != target)
+ elog(ERROR, "right sibling's left-link doesn't match: "
+ "block %u links to %u instead of expected %u in index \"%s\"",
+ rightsib, opaque->btpo_prev, target,
+ RelationGetRelationName(rel));
+
+ /*
+ * Any insert which would have gone on the target block will now go to the
+ * right sibling block.
+ */
+ PredicateLockPageCombine(rel, target, rightsib);
/*
* Next find and write-lock the current parent of the target page. This is
@@ -1253,6 +1266,38 @@ _bt_pagedel(Relation rel, Buffer buf, BTStack stack)
}
/*
+ * Check that the parent-page index items we're about to delete/overwrite
+ * contain what we expect. This can fail if the index has become corrupt
+ * for some reason. We want to throw any error before entering the
+ * critical section --- otherwise it'd be a PANIC.
+ *
+ * The test on the target item is just an Assert because _bt_getstackbuf
+ * should have guaranteed it has the expected contents. The test on the
+ * next-child downlink is known to sometimes fail in the field, though.
+ */
+ page = BufferGetPage(pbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+#ifdef USE_ASSERT_CHECKING
+ itemid = PageGetItemId(page, poffset);
+ itup = (IndexTuple) PageGetItem(page, itemid);
+ Assert(ItemPointerGetBlockNumber(&(itup->t_tid)) == target);
+#endif
+
+ if (!parent_half_dead)
+ {
+ OffsetNumber nextoffset;
+
+ nextoffset = OffsetNumberNext(poffset);
+ itemid = PageGetItemId(page, nextoffset);
+ itup = (IndexTuple) PageGetItem(page, itemid);
+ if (ItemPointerGetBlockNumber(&(itup->t_tid)) != rightsib)
+ elog(ERROR, "right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
+ rightsib, target, ItemPointerGetBlockNumber(&(itup->t_tid)),
+ parent, RelationGetRelationName(rel));
+ }
+
+ /*
* Here we begin doing the deletion.
*/
@@ -1265,8 +1310,6 @@ _bt_pagedel(Relation rel, Buffer buf, BTStack stack)
* to copy the right sibling's downlink over the target downlink, and then
* delete the following item.
*/
- page = BufferGetPage(pbuf);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
if (parent_half_dead)
{
PageIndexTupleDelete(page, poffset);
@@ -1278,23 +1321,16 @@ _bt_pagedel(Relation rel, Buffer buf, BTStack stack)
itemid = PageGetItemId(page, poffset);
itup = (IndexTuple) PageGetItem(page, itemid);
- Assert(ItemPointerGetBlockNumber(&(itup->t_tid)) == target);
ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY);
nextoffset = OffsetNumberNext(poffset);
- /* This part is just for double-checking */
- itemid = PageGetItemId(page, nextoffset);
- itup = (IndexTuple) PageGetItem(page, itemid);
- if (ItemPointerGetBlockNumber(&(itup->t_tid)) != rightsib)
- elog(PANIC, "right sibling %u of block %u is not next child of %u in index \"%s\"",
- rightsib, target, BufferGetBlockNumber(pbuf),
- RelationGetRelationName(rel));
PageIndexTupleDelete(page, nextoffset);
}
/*
* Update siblings' side-links. Note the target page's side-links will
- * continue to point to the siblings.
+ * continue to point to the siblings. Asserts here are just rechecking
+ * things we already verified above.
*/
if (BufferIsValid(lbuf))
{
@@ -1335,7 +1371,7 @@ _bt_pagedel(Relation rel, Buffer buf, BTStack stack)
MarkBufferDirty(lbuf);
/* XLOG stuff */
- if (!rel->rd_istemp)
+ if (RelationNeedsWAL(rel))
{
xl_btree_delete_page xlrec;
xl_btree_metadata xlmeta;
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 0fcde95ccd..6a7ddd7db4 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -8,11 +8,11 @@
* This file contains only the public interface routines.
*
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.177 2010/03/28 09:27:01 sriggs Exp $
+ * src/backend/access/nbtree/nbtree.c
*
*-------------------------------------------------------------------------
*/
@@ -29,6 +29,9 @@
#include "storage/indexfsm.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
+#include "storage/predicate.h"
+#include "storage/smgr.h"
+#include "tcop/tcopprot.h"
#include "utils/memutils.h"
@@ -205,6 +208,36 @@ btbuildCallback(Relation index,
}
/*
+ * btbuildempty() -- build an empty btree index in the initialization fork
+ */
+Datum
+btbuildempty(PG_FUNCTION_ARGS)
+{
+ Relation index = (Relation) PG_GETARG_POINTER(0);
+ Page metapage;
+
+ /* Construct metapage. */
+ metapage = (Page) palloc(BLCKSZ);
+ _bt_initmetapage(metapage, P_NONE, 0);
+
+ /* Write the page. If archiving/streaming, XLOG it. */
+ smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
+ (char *) metapage, true);
+ if (XLogIsNeeded())
+ log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
+ BTREE_METAPAGE, metapage);
+
+ /*
+ * An immediate sync is required even if we xlog'd the page, because the
+ * write did not go through shared_buffers and therefore a concurrent
+ * checkpoint may have moved the redo pointer past our xlog record.
+ */
+ smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
+
+ PG_RETURN_VOID();
+}
+
+/*
* btinsert() -- insert an index tuple into a btree.
*
* Descend the tree recursively, find the appropriate location for our
@@ -337,12 +370,27 @@ Datum
btbeginscan(PG_FUNCTION_ARGS)
{
Relation rel = (Relation) PG_GETARG_POINTER(0);
- int keysz = PG_GETARG_INT32(1);
- ScanKey scankey = (ScanKey) PG_GETARG_POINTER(2);
+ int nkeys = PG_GETARG_INT32(1);
+ int norderbys = PG_GETARG_INT32(2);
IndexScanDesc scan;
+ BTScanOpaque so;
+
+ /* no order by operators allowed */
+ Assert(norderbys == 0);
/* get the scan */
- scan = RelationGetIndexScan(rel, keysz, scankey);
+ scan = RelationGetIndexScan(rel, nkeys, norderbys);
+
+ /* allocate private workspace */
+ so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
+ so->currPos.buf = so->markPos.buf = InvalidBuffer;
+ if (scan->numberOfKeys > 0)
+ so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
+ else
+ so->keyData = NULL;
+ so->killedItems = NULL; /* until needed */
+ so->numKilled = 0;
+ scan->opaque = so;
PG_RETURN_POINTER(scan);
}
@@ -355,22 +403,9 @@ btrescan(PG_FUNCTION_ARGS)
{
IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
ScanKey scankey = (ScanKey) PG_GETARG_POINTER(1);
- BTScanOpaque so;
- so = (BTScanOpaque) scan->opaque;
-
- if (so == NULL) /* if called from btbeginscan */
- {
- so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
- so->currPos.buf = so->markPos.buf = InvalidBuffer;
- if (scan->numberOfKeys > 0)
- so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
- else
- so->keyData = NULL;
- so->killedItems = NULL; /* until needed */
- so->numKilled = 0;
- scan->opaque = so;
- }
+ /* remaining arguments are ignored */
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
/* we aren't holding any read locks, but gotta drop the pins */
if (BTScanPosIsValid(so->currPos))
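
Under the new ambeginscan/amrescan split, the AM copies the caller's keys into the scan descriptor at rescan time rather than at beginscan time. A simplified sketch of that hand-off (the full btrescan also resets mark/restore and killed-item state, not shown here):

    /* Sketch of the key hand-off in an amrescan implementation. */
    if (scankey && scan->numberOfKeys > 0)
        memmove(scan->keyData,
                scankey,
                scan->numberOfKeys * sizeof(ScanKeyData));

    /* Leave zero until _bt_preprocess_keys sets it at the start of the scan. */
    so->numberOfKeys = 0;
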
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index fc11829495..2ce2bc2f00 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -4,11 +4,11 @@
* Search code for postgres btrees.
*
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.121 2010/01/02 16:57:35 momjian Exp $
+ * src/backend/access/nbtree/nbtsearch.c
*
*-------------------------------------------------------------------------
*/
@@ -21,6 +21,7 @@
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
+#include "storage/predicate.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
@@ -63,7 +64,10 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
/* If index is empty and access = BT_READ, no root page is created. */
if (!BufferIsValid(*bufP))
+ {
+ PredicateLockRelation(rel); /* Nothing finer to lock exists. */
return (BTStack) NULL;
+ }
/* Loop iterates once per level descended in the tree */
for (;;)
@@ -88,7 +92,11 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
page = BufferGetPage(*bufP);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
if (P_ISLEAF(opaque))
+ {
+ if (access == BT_READ)
+ PredicateLockPage(rel, BufferGetBlockNumber(*bufP));
break;
+ }
/*
* Find the appropriate item on the internal page, and get the child
@@ -402,9 +410,10 @@ _bt_compare(Relation rel,
* to flip the sign of the comparison result. (Unless it's a DESC
* column, in which case we *don't* flip the sign.)
*/
- result = DatumGetInt32(FunctionCall2(&scankey->sk_func,
- datum,
- scankey->sk_argument));
+ result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
+ scankey->sk_collation,
+ datum,
+ scankey->sk_argument));
if (!(scankey->sk_flags & SK_BT_DESC))
result = -result;
@@ -713,6 +722,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
cur->sk_attno,
InvalidStrategy,
cur->sk_subtype,
+ cur->sk_collation,
procinfo,
cur->sk_argument);
}
@@ -733,6 +743,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
cur->sk_attno,
InvalidStrategy,
cur->sk_subtype,
+ cur->sk_collation,
cmp_proc,
cur->sk_argument);
}
@@ -1142,6 +1153,7 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
if (!P_IGNORE(opaque))
{
+ PredicateLockPage(rel, blkno);
/* see if there are any matches on this page */
/* note that this will clear moreRight if we can stop */
if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
@@ -1189,6 +1201,7 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
if (!P_IGNORE(opaque))
{
+ PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf));
/* see if there are any matches on this page */
/* note that this will clear moreLeft if we can stop */
if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
@@ -1352,6 +1365,7 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
if (!BufferIsValid(buf))
{
/* empty index... */
+ PredicateLockRelation(rel); /* Nothing finer to lock exists. */
return InvalidBuffer;
}
@@ -1431,10 +1445,12 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
if (!BufferIsValid(buf))
{
/* empty index... */
+ PredicateLockRelation(rel); /* Nothing finer to lock exists. */
so->currPos.buf = InvalidBuffer;
return false;
}
+ PredicateLockPage(rel, BufferGetBlockNumber(buf));
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
Assert(P_ISLEAF(opaque));
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 15964e127e..93a928c66b 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -55,11 +55,11 @@
* This code isn't concerned about the FSM at all. The caller is responsible
* for initializing that.
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.125 2010/04/28 16:10:40 heikki Exp $
+ * src/backend/access/nbtree/nbtsort.c
*
*-------------------------------------------------------------------------
*/
@@ -70,6 +70,7 @@
#include "access/nbtree.h"
#include "miscadmin.h"
#include "storage/smgr.h"
+#include "tcop/tcopprot.h"
#include "utils/rel.h"
#include "utils/tuplesort.h"
@@ -211,9 +212,9 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
/*
* We need to log index creation in WAL iff WAL archiving/streaming is
- * enabled AND it's not a temp index.
+ * enabled UNLESS the index isn't WAL-logged anyway.
*/
- wstate.btws_use_wal = XLogIsNeeded() && !wstate.index->rd_istemp;
+ wstate.btws_use_wal = XLogIsNeeded() && RelationNeedsWAL(wstate.index);
/* reserve the metapage */
wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
@@ -295,9 +296,8 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
}
/*
- * Now write the page. We say isTemp = true even if it's not a temp
- * index, because there's no need for smgr to schedule an fsync for this
- * write; we'll do it ourselves before ending the build.
+ * Now write the page. There's no need for smgr to schedule an fsync for
+ * this write; we'll do it ourselves before ending the build.
*/
if (blkno == wstate->btws_pages_written)
{
@@ -736,9 +736,11 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
}
else
{
- compare = DatumGetInt32(FunctionCall2(&entry->sk_func,
- attrDatum1,
- attrDatum2));
+ compare =
+ DatumGetInt32(FunctionCall2Coll(&entry->sk_func,
+ entry->sk_collation,
+ attrDatum1,
+ attrDatum2));
if (entry->sk_flags & SK_BT_DESC)
compare = -compare;
@@ -798,9 +800,9 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
_bt_uppershutdown(wstate, state);
/*
- * If the index isn't temp, we must fsync it down to disk before it's safe
- * to commit the transaction. (For a temp index we don't care since the
- * index will be uninteresting after a crash anyway.)
+ * If the index is WAL-logged, we must fsync it down to disk before it's
+ * safe to commit the transaction. (For a non-WAL-logged index we don't
+ * care since the index will be uninteresting after a crash anyway.)
*
* It's obvious that we must do this when not WAL-logging the build. It's
* less obvious that we have to do it even if we did WAL-log the index
@@ -812,7 +814,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
* fsync those pages here, they might still not be on disk when the crash
* occurs.
*/
- if (!wstate->index->rd_istemp)
+ if (RelationNeedsWAL(wstate->index))
{
RelationOpenSmgr(wstate->index);
smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 6b399d34a6..2e896a258f 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -3,12 +3,12 @@
* nbtutils.c
* Utility code for Postgres btree implementation.
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.98 2010/02/26 02:00:34 momjian Exp $
+ * src/backend/access/nbtree/nbtutils.c
*
*-------------------------------------------------------------------------
*/
@@ -80,6 +80,7 @@ _bt_mkscankey(Relation rel, IndexTuple itup)
(AttrNumber) (i + 1),
InvalidStrategy,
InvalidOid,
+ rel->rd_indcollation[i],
procinfo,
arg);
}
@@ -127,6 +128,7 @@ _bt_mkscankey_nodata(Relation rel)
(AttrNumber) (i + 1),
InvalidStrategy,
InvalidOid,
+ rel->rd_indcollation[i],
procinfo,
(Datum) 0);
}
@@ -600,9 +602,10 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
*/
if (lefttype == opcintype && righttype == optype)
{
- *result = DatumGetBool(FunctionCall2(&op->sk_func,
- leftarg->sk_argument,
- rightarg->sk_argument));
+ *result = DatumGetBool(FunctionCall2Coll(&op->sk_func,
+ op->sk_collation,
+ leftarg->sk_argument,
+ rightarg->sk_argument));
return true;
}
@@ -629,9 +632,10 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
if (RegProcedureIsValid(cmp_proc))
{
- *result = DatumGetBool(OidFunctionCall2(cmp_proc,
- leftarg->sk_argument,
- rightarg->sk_argument));
+ *result = DatumGetBool(OidFunctionCall2Coll(cmp_proc,
+ op->sk_collation,
+ leftarg->sk_argument,
+ rightarg->sk_argument));
return true;
}
}
@@ -685,6 +689,10 @@ _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption)
* Likewise, "x IS NOT NULL" is supported. We treat that as either "less
* than NULL" in a NULLS LAST index, or "greater than NULL" in a NULLS
* FIRST index.
+ *
+ * Note: someday we might have to fill in sk_collation from the index
+ * column's collation. At the moment this is a non-issue because we'll
+ * never actually call the comparison operator on a NULL.
*/
if (skey->sk_flags & SK_ISNULL)
{
@@ -699,6 +707,7 @@ _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption)
{
skey->sk_strategy = BTEqualStrategyNumber;
skey->sk_subtype = InvalidOid;
+ skey->sk_collation = InvalidOid;
}
else if (skey->sk_flags & SK_SEARCHNOTNULL)
{
@@ -707,6 +716,7 @@ _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption)
else
skey->sk_strategy = BTLessStrategyNumber;
skey->sk_subtype = InvalidOid;
+ skey->sk_collation = InvalidOid;
}
else
{
@@ -972,7 +982,8 @@ _bt_checkkeys(IndexScanDesc scan,
return false;
}
- test = FunctionCall2(&key->sk_func, datum, key->sk_argument);
+ test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
+ datum, key->sk_argument);
if (!DatumGetBool(test))
{
@@ -1095,9 +1106,10 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc,
}
/* Perform the test --- three-way comparison not bool operator */
- cmpresult = DatumGetInt32(FunctionCall2(&subkey->sk_func,
- datum,
- subkey->sk_argument));
+ cmpresult = DatumGetInt32(FunctionCall2Coll(&subkey->sk_func,
+ subkey->sk_collation,
+ datum,
+ subkey->sk_argument));
if (subkey->sk_flags & SK_BT_DESC)
cmpresult = -cmpresult;
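
All of the call sites changed above follow the same two-part pattern: the collation is stored in the scan key when the key is built, and is then passed explicitly whenever the key's support function is invoked. A condensed sketch with placeholder variables (skey, flags, datum, cmp):

    /* Build a scan key that carries the index column's collation. */
    ScanKeyEntryInitializeWithInfo(&skey[i],
                                   flags,
                                   (AttrNumber) (i + 1),
                                   InvalidStrategy,
                                   InvalidOid,
                                   rel->rd_indcollation[i],   /* new argument */
                                   procinfo,
                                   arg);

    /* Pass that collation whenever the comparison proc is called. */
    cmp = DatumGetInt32(FunctionCall2Coll(&skey[i].sk_func,
                                          skey[i].sk_collation,
                                          datum,
                                          skey[i].sk_argument));
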
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index 3261483475..2775ae6d29 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -4,11 +4,11 @@
* WAL replay logic for btrees.
*
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.69 2010/07/06 19:18:55 momjian Exp $
+ * src/backend/access/nbtree/nbtxlog.c
*
*-------------------------------------------------------------------------
*/
@@ -580,7 +580,6 @@ btree_xlog_delete_get_latestRemovedXid(XLogRecord *record)
BlockNumber hblkno;
OffsetNumber hoffnum;
TransactionId latestRemovedXid = InvalidTransactionId;
- TransactionId htupxid = InvalidTransactionId;
int i;
/*
@@ -646,24 +645,16 @@ btree_xlog_delete_get_latestRemovedXid(XLogRecord *record)
}
/*
- * If the heap item has storage, then read the header. Some LP_DEAD
- * items may not be accessible, so we ignore them.
+ * If the heap item has storage, then read the header and use that to
+ * set latestRemovedXid.
+ *
+ * Some LP_DEAD items may not be accessible, so we ignore them.
*/
if (ItemIdHasStorage(hitemid))
{
htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid);
- /*
- * Get the heap tuple's xmin/xmax and ratchet up the
- * latestRemovedXid. No need to consider xvac values here.
- */
- htupxid = HeapTupleHeaderGetXmin(htuphdr);
- if (TransactionIdFollows(htupxid, latestRemovedXid))
- latestRemovedXid = htupxid;
-
- htupxid = HeapTupleHeaderGetXmax(htuphdr);
- if (TransactionIdFollows(htupxid, latestRemovedXid))
- latestRemovedXid = htupxid;
+ HeapTupleHeaderAdvanceLatestRemovedXid(htuphdr, &latestRemovedXid);
}
else if (ItemIdIsDead(hitemid))
{
diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile
index fe34e4eaaa..849ab384d5 100644
--- a/src/backend/access/transam/Makefile
+++ b/src/backend/access/transam/Makefile
@@ -4,7 +4,7 @@
# Makefile for access/transam
#
# IDENTIFICATION
-# $PostgreSQL: pgsql/src/backend/access/transam/Makefile,v 1.22 2008/02/19 10:30:07 petere Exp $
+# src/backend/access/transam/Makefile
#
#-------------------------------------------------------------------------
diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README
index 05c41d487c..eaac1393b8 100644
--- a/src/backend/access/transam/README
+++ b/src/backend/access/transam/README
@@ -1,4 +1,4 @@
-$PostgreSQL: pgsql/src/backend/access/transam/README,v 1.13 2009/12/19 01:32:33 sriggs Exp $
+src/backend/access/transam/README
The Transaction System
======================
@@ -543,6 +543,85 @@ consistency. Such insertions occur after WAL is operational, so they can
and should write WAL records for the additional generated actions.
+Write-Ahead Logging for Filesystem Actions
+------------------------------------------
+
+The previous section described how to WAL-log actions that only change page
+contents within shared buffers. For that type of action it is generally
+possible to check all likely error cases (such as insufficient space on the
+page) before beginning to make the actual change. Therefore we can make
+the change and the creation of the associated WAL log record "atomic" by
+wrapping them into a critical section --- the odds of failure partway
+through are low enough that PANIC is acceptable if it does happen.
+
+Clearly, that approach doesn't work for cases where there's a significant
+probability of failure within the action to be logged, such as creation
+of a new file or database. We don't want to PANIC, and we especially don't
+want to PANIC after having already written a WAL record that says we did
+the action --- if we did, replay of the record would probably fail again
+and PANIC again, making the failure unrecoverable. This means that the
+ordinary WAL rule of "write WAL before the changes it describes" doesn't
+work, and we need a different design for such cases.
+
+There are several basic types of filesystem actions that have this
+issue. Here is how we deal with each:
+
+1. Adding a disk page to an existing table.
+
+This action isn't WAL-logged at all. We extend a table by writing a page
+of zeroes at its end. We must actually do this write so that we are sure
+the filesystem has allocated the space. If the write fails we can just
+error out normally. Once the space is known allocated, we can initialize
+and fill the page via one or more normal WAL-logged actions. Because it's
+possible that we crash between extending the file and writing out the WAL
+entries, we have to treat discovery of an all-zeroes page in a table or
+index as being a non-error condition. In such cases we can just reclaim
+the space for re-use.
+
+2. Creating a new table, which requires a new file in the filesystem.
+
+We try to create the file, and if successful we make a WAL record saying
+we did it. If not successful, we can just throw an error. Notice that
+there is a window where we have created the file but not yet written any
+WAL about it to disk. If we crash during this window, the file remains
+on disk as an "orphan". It would be possible to clean up such orphans
+by having the database search at restart for files that don't have any committed
+entry in pg_class, but that currently isn't done because of the possibility
+of deleting data that is useful for forensic analysis of the crash.
+Orphan files are harmless --- at worst they waste a bit of disk space ---
+because we check for on-disk collisions when allocating new relfilenode
+OIDs. So cleaning up isn't really necessary.
+
+3. Deleting a table, which requires an unlink() that could fail.
+
+Our approach here is to WAL-log the operation first, but to treat failure
+of the actual unlink() call as a warning rather than error condition.
+Again, this can leave an orphan file behind, but that's cheap compared to
+the alternatives. Since we can't actually do the unlink() until after
+we've committed the DROP TABLE transaction, throwing an error would be out
+of the question anyway. (It may be worth noting that the WAL entry about
+the file deletion is actually part of the commit record for the dropping
+transaction.)
+
+4. Creating and deleting databases and tablespaces, which requires creating
+and deleting directories and entire directory trees.
+
+These cases are handled similarly to creating individual files, ie, we
+try to do the action first and then write a WAL entry if it succeeded.
+The potential amount of wasted disk space is rather larger, of course.
+In the creation case we try to delete the directory tree again if creation
+fails, so as to reduce the risk of wasted space. Failure partway through
+a deletion operation results in a corrupt database: the DROP failed, but
+some of the data is gone anyway. There is little we can do about that,
+though, and in any case it was presumably data the user no longer wants.
+
+In all of these cases, if WAL replay fails to redo the original action
+we must panic and abort recovery. The DBA will have to manually clean up
+(for instance, free up some disk space or fix directory permissions) and
+then restart recovery. This is part of the reason for not writing a WAL
+entry until we've successfully done the original action.
+
+
Asynchronous Commit
-------------------
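A minimal sketch of the "act first, WAL-log only on success" pattern described in cases 2-4 of the new README section above. The names (my_create_relation_file, log_file_creation) and the /tmp path are hypothetical, and log_file_creation only marks where the real code would call XLogInsert(); this is not the server's actual code path.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/* Stand-in for XLogInsert(): log the action only after it has succeeded. */
static void
log_file_creation(const char *path)
{
	fprintf(stderr, "WAL: file \"%s\" created\n", path);
}

/* Case 2 above: create first, WAL-log on success, plain error on failure. */
static void
my_create_relation_file(const char *path)
{
	int		fd = open(path, O_CREAT | O_EXCL | O_WRONLY, 0600);

	if (fd < 0)
	{
		/* No WAL has been written yet, so an ordinary error is safe here. */
		perror(path);
		exit(1);
	}
	close(fd);

	/* A crash between here and the WAL flush leaves only a harmless orphan. */
	log_file_creation(path);
}

int
main(void)
{
	my_create_relation_file("/tmp/demo_relfile");
	return 0;
}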
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index 1d29d80c75..2a1b4ae111 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -23,11 +23,11 @@
* for aborts (whether sync or async), since the post-crash assumption would
* be that such transactions failed anyway.
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2011 Nippon Telegraph and Telephone Corporation
*
- * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.55 2010/01/02 16:57:35 momjian Exp $
+ * src/backend/access/transam/clog.c
*
*-------------------------------------------------------------------------
*/
@@ -442,8 +442,8 @@ CLOGShmemInit(void)
/*
* This func must be called ONCE on system install. It creates
* the initial CLOG segment. (The CLOG directory is assumed to
- * have been created by the initdb shell script, and CLOGShmemInit
- * must have been called already.)
+ * have been created by initdb, and CLOGShmemInit must have been
+ * called already.)
*/
void
BootStrapCLOG(void)
@@ -456,7 +456,7 @@ BootStrapCLOG(void)
slotno = ZeroCLOGPage(0, false);
/* Make sure it's written out */
- SimpleLruWritePage(ClogCtl, slotno, NULL);
+ SimpleLruWritePage(ClogCtl, slotno);
Assert(!ClogCtl->shared->page_dirty[slotno]);
LWLockRelease(CLogControlLock);
@@ -745,7 +745,7 @@ clog_redo(XLogRecPtr lsn, XLogRecord *record)
LWLockAcquire(CLogControlLock, LW_EXCLUSIVE);
slotno = ZeroCLOGPage(pageno, false);
- SimpleLruWritePage(ClogCtl, slotno, NULL);
+ SimpleLruWritePage(ClogCtl, slotno);
Assert(!ClogCtl->shared->page_dirty[slotno]);
LWLockRelease(CLogControlLock);
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 3f3bdc0335..bb3afd6000 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -39,10 +39,10 @@
* anything we saw during replay.
*
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/backend/access/transam/multixact.c,v 1.35 2010/02/26 02:00:34 momjian Exp $
+ * src/backend/access/transam/multixact.c
*
*-------------------------------------------------------------------------
*/
@@ -1454,7 +1454,7 @@ BootStrapMultiXact(void)
slotno = ZeroMultiXactOffsetPage(0, false);
/* Make sure it's written out */
- SimpleLruWritePage(MultiXactOffsetCtl, slotno, NULL);
+ SimpleLruWritePage(MultiXactOffsetCtl, slotno);
Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
LWLockRelease(MultiXactOffsetControlLock);
@@ -1465,7 +1465,7 @@ BootStrapMultiXact(void)
slotno = ZeroMultiXactMemberPage(0, false);
/* Make sure it's written out */
- SimpleLruWritePage(MultiXactMemberCtl, slotno, NULL);
+ SimpleLruWritePage(MultiXactMemberCtl, slotno);
Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]);
LWLockRelease(MultiXactMemberControlLock);
@@ -1986,7 +1986,7 @@ multixact_redo(XLogRecPtr lsn, XLogRecord *record)
LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
slotno = ZeroMultiXactOffsetPage(pageno, false);
- SimpleLruWritePage(MultiXactOffsetCtl, slotno, NULL);
+ SimpleLruWritePage(MultiXactOffsetCtl, slotno);
Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
LWLockRelease(MultiXactOffsetControlLock);
@@ -2001,7 +2001,7 @@ multixact_redo(XLogRecPtr lsn, XLogRecord *record)
LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
slotno = ZeroMultiXactMemberPage(pageno, false);
- SimpleLruWritePage(MultiXactMemberCtl, slotno, NULL);
+ SimpleLruWritePage(MultiXactMemberCtl, slotno);
Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]);
LWLockRelease(MultiXactMemberControlLock);
diff --git a/src/backend/access/transam/recovery.conf.sample b/src/backend/access/transam/recovery.conf.sample
index 855c11318f..2fee00e518 100644
--- a/src/backend/access/transam/recovery.conf.sample
+++ b/src/backend/access/transam/recovery.conf.sample
@@ -3,8 +3,8 @@
# -------------------------------
#
# Edit this file to provide the parameters that PostgreSQL needs to
-# perform an archive recovery of a database, or to act as a log-streaming
-# replication standby.
+# perform an archive recovery of a database, or to act as a replication
+# standby.
#
# If "recovery.conf" is present in the PostgreSQL data directory, it is
# read on postmaster startup. After successful recovery, it is renamed
@@ -13,9 +13,7 @@
#
# This file consists of lines of the form:
#
-# name = 'value'
-#
-# (The quotes around the value are NOT optional, but the "=" is.)
+# name = value
#
# Comments are introduced with '#'.
#
@@ -68,11 +66,14 @@
# If you want to stop rollforward at a specific point, you
# must set a recovery target.
#
-# You may set a recovery target either by transactionId,
-# by timestamp or by barrier id. Recovery may either include or exclude the
-# transaction(s) with the recovery target value in case of timestamp or
-# transactionId (ie, stop either just after or just before the given target,
-# respectively). In case of barrier, the recovery stops exactly at that point
+# You may set a recovery target by transactionId, by name, by timestamp,
+# or by barrier. Recovery may either include or exclude the
+# transaction(s) with the recovery target value (ie, stop either
+# just after or just before the given target, respectively). In case of
+# barrier, the recovery stops exactly at that point.
+#
+#
+#recovery_target_name = '' # e.g. 'daily backup 2011-01-26'
#
#recovery_target_time = '' # e.g. '2004-07-14 22:39:00 EST'
#
@@ -80,7 +81,7 @@
#
#recovery_target_barrier = ''
#
-#recovery_target_inclusive = 'true'
+#recovery_target_inclusive = true
#
#
# If you want to recover into a timeline other than the "main line" shown in
@@ -89,25 +90,39 @@
#
#recovery_target_timeline = 'latest'
#
+#
+# If pause_at_recovery_target is enabled, recovery will pause when
+# the recovery target is reached. The pause state will continue until
+# pg_xlog_replay_resume() is called. This setting has no effect if
+# hot standby is not enabled, or if no recovery target is set.
+#
+#pause_at_recovery_target = true
+#
#---------------------------------------------------------------------------
# STANDBY SERVER PARAMETERS
#---------------------------------------------------------------------------
#
-# When standby_mode is enabled, the PostgreSQL server will work as
-# a standby. It tries to connect to the primary according to the
-# connection settings primary_conninfo, and receives XLOG records
-# continuously.
+# standby_mode
+#
+# When standby_mode is enabled, the PostgreSQL server will work as a
+# standby. It will continuously wait for additional XLOG records, using
+# restore_command and/or primary_conninfo.
+#
+#standby_mode = off
+#
+# primary_conninfo
#
-#standby_mode = 'off'
+# If set, the PostgreSQL server will try to connect to the primary using this
+# connection string and receive XLOG records continuously.
#
#primary_conninfo = '' # e.g. 'host=localhost port=5432'
#
#
-# By default, a standby server keeps streaming XLOG records from the
-# primary indefinitely. If you want to stop streaming and finish recovery,
-# opening up the system in read/write mode, specify path to a trigger file.
-# Server will poll the trigger file path periodically and stop streaming
-# when it's found.
+# By default, a standby server keeps restoring XLOG records from the
+# primary indefinitely. If you want to stop standby mode, finish recovery,
+# and open the system in read/write mode, specify the path to a trigger file.
+# The server will poll the trigger file path periodically and start as a
+# primary server when it's found.
#
#trigger_file = ''
#
diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c
index d989a59197..734d258002 100644
--- a/src/backend/access/transam/rmgr.c
+++ b/src/backend/access/transam/rmgr.c
@@ -3,7 +3,7 @@
*
* Resource managers definition
*
- * $PostgreSQL: pgsql/src/backend/access/transam/rmgr.c,v 1.29 2010/02/07 20:48:09 tgl Exp $
+ * src/backend/access/transam/rmgr.c
*/
#include "postgres.h"
@@ -42,7 +42,7 @@ const RmgrData RmgrTable[RM_MAX_ID + 1] = {
{"Btree", btree_redo, btree_desc, btree_xlog_startup, btree_xlog_cleanup, btree_safe_restartpoint},
{"Hash", hash_redo, hash_desc, NULL, NULL, NULL},
{"Gin", gin_redo, gin_desc, gin_xlog_startup, gin_xlog_cleanup, gin_safe_restartpoint},
- {"Gist", gist_redo, gist_desc, gist_xlog_startup, gist_xlog_cleanup, gist_safe_restartpoint},
+ {"Gist", gist_redo, gist_desc, gist_xlog_startup, gist_xlog_cleanup, NULL},
{"Sequence", seq_redo, seq_desc, NULL, NULL, NULL}
#ifdef PGXC
,
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index d122a15ff6..f4cc6282eb 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -38,10 +38,10 @@
* by re-setting the page's page_dirty flag.
*
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/backend/access/transam/slru.c,v 1.50 2010/04/28 16:54:15 tgl Exp $
+ * src/backend/access/transam/slru.c
*
*-------------------------------------------------------------------------
*/
@@ -78,6 +78,8 @@ typedef struct SlruFlushData
int segno[MAX_FLUSH_BUFFERS]; /* their log seg#s */
} SlruFlushData;
+typedef struct SlruFlushData *SlruFlush;
+
/*
* Macro to mark a buffer slot "most recently used". Note multiple evaluation
* of arguments!
@@ -123,6 +125,7 @@ static int slru_errno;
static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno);
static void SimpleLruWaitIO(SlruCtl ctl, int slotno);
+static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruFlush fdata);
static bool SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno);
static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno,
SlruFlush fdata);
@@ -485,8 +488,8 @@ SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, TransactionId xid)
*
* Control lock must be held at entry, and will be held at exit.
*/
-void
-SimpleLruWritePage(SlruCtl ctl, int slotno, SlruFlush fdata)
+static void
+SlruInternalWritePage(SlruCtl ctl, int slotno, SlruFlush fdata)
{
SlruShared shared = ctl->shared;
int pageno = shared->page_number[slotno];
@@ -567,6 +570,17 @@ SimpleLruWritePage(SlruCtl ctl, int slotno, SlruFlush fdata)
}
/*
+ * Wrapper of SlruInternalWritePage, for external callers.
+ * fdata is always passed as NULL here.
+ */
+void
+SimpleLruWritePage(SlruCtl ctl, int slotno)
+{
+ SlruInternalWritePage(ctl, slotno, NULL);
+}
+
+
+/*
* Physical read of a (previously existing) page into a buffer slot
*
* On failure, we cannot just ereport(ERROR) since caller has put state in
@@ -989,7 +1003,7 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
* we wait for the existing I/O to complete.
*/
if (shared->page_status[bestslot] == SLRU_PAGE_VALID)
- SimpleLruWritePage(ctl, bestslot, NULL);
+ SlruInternalWritePage(ctl, bestslot, NULL);
else
SimpleLruWaitIO(ctl, bestslot);
@@ -1023,7 +1037,7 @@ SimpleLruFlush(SlruCtl ctl, bool checkpoint)
for (slotno = 0; slotno < shared->num_slots; slotno++)
{
- SimpleLruWritePage(ctl, slotno, &fdata);
+ SlruInternalWritePage(ctl, slotno, &fdata);
/*
* When called during a checkpoint, we cannot assert that the slot is
@@ -1128,7 +1142,7 @@ restart:;
* keep the logic the same as it was.)
*/
if (shared->page_status[slotno] == SLRU_PAGE_VALID)
- SimpleLruWritePage(ctl, slotno, NULL);
+ SlruInternalWritePage(ctl, slotno, NULL);
else
SimpleLruWaitIO(ctl, slotno);
goto restart;
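The refactoring above follows a simple pattern: the fdata-taking routine becomes static, and a thin public wrapper hides the parameter now that SlruFlushData is private to slru.c (note the typedef moved into this file). Below is a minimal sketch of that pattern with made-up mock types, not the real SLRU structures.

#include <stddef.h>
#include <stdio.h>

typedef struct MockFlushData
{
	int		pending;		/* writes batched for a later fsync */
} MockFlushData;

/* internal variant, analogous to SlruInternalWritePage() */
static void
mock_internal_write_page(int slotno, MockFlushData *fdata)
{
	if (fdata != NULL)
		fdata->pending++;
	else
		printf("slot %d written and synced immediately\n", slotno);
}

/* public variant, analogous to the new two-argument SimpleLruWritePage() */
static void
mock_write_page(int slotno)
{
	mock_internal_write_page(slotno, NULL);
}

int
main(void)
{
	/* old external call shape: SimpleLruWritePage(ClogCtl, slotno, NULL); */
	/* new external call shape: SimpleLruWritePage(ClogCtl, slotno);       */
	mock_write_page(3);
	return 0;
}

Only code inside slru.c (SimpleLruFlush, SlruSelectLRUPage) now reaches the fdata-taking variant, as the hunks above show.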
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c
index 7e01a90710..a6d9d7f0b5 100644
--- a/src/backend/access/transam/subtrans.c
+++ b/src/backend/access/transam/subtrans.c
@@ -19,11 +19,11 @@
* data across crashes. During database startup, we simply force the
* currently-active page of SUBTRANS to zeroes.
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2011 Nippon Telegraph and Telephone Corporation
*
- * $PostgreSQL: pgsql/src/backend/access/transam/subtrans.c,v 1.27 2010/02/26 02:00:34 momjian Exp $
+ * src/backend/access/transam/subtrans.c
*
*-------------------------------------------------------------------------
*/
@@ -210,7 +210,7 @@ BootStrapSUBTRANS(void)
slotno = ZeroSUBTRANSPage(0);
/* Make sure it's written out */
- SimpleLruWritePage(SubTransCtl, slotno, NULL);
+ SimpleLruWritePage(SubTransCtl, slotno);
Assert(!SubTransCtl->shared->page_dirty[slotno]);
LWLockRelease(SubtransControlLock);
diff --git a/src/backend/access/transam/transam.c b/src/backend/access/transam/transam.c
index 0c2e2dd824..bc02f15e86 100644
--- a/src/backend/access/transam/transam.c
+++ b/src/backend/access/transam/transam.c
@@ -3,12 +3,12 @@
* transam.c
* postgres transaction log interface routines
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/transam/transam.c,v 1.80 2010/01/02 16:57:35 momjian Exp $
+ * src/backend/access/transam/transam.c
*
* NOTES
* This file contains the high level access-method interface to the
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index 8ab6d31e7f..ee327ebb59 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -3,12 +3,12 @@
* twophase.c
* Two-phase commit support functions.
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2011 Nippon Telegraph and Telephone Corporation
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.62 2010/07/06 19:18:55 momjian Exp $
+ * src/backend/access/transam/twophase.c
*
* NOTES
* Each global transaction is associated with a global transaction
@@ -56,7 +56,10 @@
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
+#include "replication/walsender.h"
+#include "replication/syncrep.h"
#include "storage/fd.h"
+#include "storage/predicate.h"
#include "storage/procarray.h"
#include "storage/sinvaladt.h"
#include "storage/smgr.h"
@@ -122,7 +125,7 @@ typedef struct GlobalTransactionData
TransactionId locking_xid; /* top-level XID of backend working on xact */
bool valid; /* TRUE if fully prepared */
char gid[GIDSIZE]; /* The GID assigned to the prepared xact */
-} GlobalTransactionData;
+} GlobalTransactionData;
/*
* Two Phase Commit shared state. Access to this struct is protected
@@ -870,8 +873,8 @@ StartPrepare(GlobalTransaction gxact)
hdr.prepared_at = gxact->prepared_at;
hdr.owner = gxact->owner;
hdr.nsubxacts = xactGetCommittedChildren(&children);
- hdr.ncommitrels = smgrGetPendingDeletes(true, &commitrels, NULL);
- hdr.nabortrels = smgrGetPendingDeletes(false, &abortrels, NULL);
+ hdr.ncommitrels = smgrGetPendingDeletes(true, &commitrels);
+ hdr.nabortrels = smgrGetPendingDeletes(false, &abortrels);
hdr.ninvalmsgs = xactGetCommittedInvalidationMessages(&invalmsgs,
&hdr.initfileinval);
StrNCpy(hdr.gid, gxact->gid, GIDSIZE);
@@ -1032,6 +1035,13 @@ EndPrepare(GlobalTransaction gxact)
/* If we crash now, we have prepared: WAL replay will fix things */
+ /*
+ * Wake up all walsenders to send WAL up to the PREPARE record immediately
+ * if replication is enabled
+ */
+ if (max_wal_senders > 0)
+ WalSndWakeup();
+
/* write correct CRC and close file */
if ((write(fd, &statefile_crc, sizeof(pg_crc32))) != sizeof(pg_crc32))
{
@@ -1069,6 +1079,14 @@ EndPrepare(GlobalTransaction gxact)
END_CRIT_SECTION();
+ /*
+ * Wait for synchronous replication, if required.
+ *
+ * Note that at this stage we have marked the prepare, but still show as
+ * running in the procarray (twice!) and continue to hold locks.
+ */
+ SyncRepWaitForLSN(gxact->prepare_lsn);
+
records.tail = records.head = NULL;
}
@@ -1327,13 +1345,13 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
}
for (i = 0; i < ndelrels; i++)
{
- SMgrRelation srel = smgropen(delrels[i]);
+ SMgrRelation srel = smgropen(delrels[i], InvalidBackendId);
ForkNumber fork;
for (fork = 0; fork <= MAX_FORKNUM; fork++)
{
if (smgrexists(srel, fork))
- smgrdounlink(srel, fork, false, false);
+ smgrdounlink(srel, fork, false);
}
smgrclose(srel);
}
@@ -1356,6 +1374,8 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
else
ProcessRecords(bufptr, xid, twophase_postabort_callbacks);
+ PredicateLockTwoPhaseFinish(xid, isCommit);
+
/* Count the prepared xact as committed or aborted */
AtEOXact_PgStat(isCommit);
@@ -2012,6 +2032,13 @@ RecordTransactionCommitPrepared(TransactionId xid,
/* Flush XLOG to disk */
XLogFlush(recptr);
+ /*
+ * Wake up all walsenders to send WAL up to the COMMIT PREPARED record
+ * immediately if replication is enabled
+ */
+ if (max_wal_senders > 0)
+ WalSndWakeup();
+
/* Mark the transaction committed in pg_clog */
TransactionIdCommitTree(xid, nchildren, children);
@@ -2019,6 +2046,14 @@ RecordTransactionCommitPrepared(TransactionId xid,
MyProc->inCommit = false;
END_CRIT_SECTION();
+
+ /*
+ * Wait for synchronous replication, if required.
+ *
+ * Note that at this stage we have marked clog, but still show as running
+ * in the procarray and continue to hold locks.
+ */
+ SyncRepWaitForLSN(recptr);
}
/*
@@ -2085,10 +2120,25 @@ RecordTransactionAbortPrepared(TransactionId xid,
XLogFlush(recptr);
/*
+ * Wake up all walsenders to send WAL up to the ABORT PREPARED record
+ * immediately if replication is enabled
+ */
+ if (max_wal_senders > 0)
+ WalSndWakeup();
+
+ /*
* Mark the transaction aborted in clog. This is not absolutely necessary
* but we may as well do it while we are here.
*/
TransactionIdAbortTree(xid, nchildren, children);
END_CRIT_SECTION();
+
+ /*
+ * Wait for synchronous replication, if required.
+ *
+ * Note that at this stage we have marked clog, but still show as running
+ * in the procarray and continue to hold locks.
+ */
+ SyncRepWaitForLSN(recptr);
}
diff --git a/src/backend/access/transam/twophase_rmgr.c b/src/backend/access/transam/twophase_rmgr.c
index d8f7fb6a03..47c15af241 100644
--- a/src/backend/access/transam/twophase_rmgr.c
+++ b/src/backend/access/transam/twophase_rmgr.c
@@ -3,12 +3,12 @@
* twophase_rmgr.c
* Two-phase-commit resource managers tables
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/transam/twophase_rmgr.c,v 1.14 2010/02/26 02:00:34 momjian Exp $
+ * src/backend/access/transam/twophase_rmgr.c
*
*-------------------------------------------------------------------------
*/
@@ -18,12 +18,14 @@
#include "access/twophase_rmgr.h"
#include "pgstat.h"
#include "storage/lock.h"
+#include "storage/predicate.h"
const TwoPhaseCallback twophase_recover_callbacks[TWOPHASE_RM_MAX_ID + 1] =
{
NULL, /* END ID */
lock_twophase_recover, /* Lock */
+ predicatelock_twophase_recover, /* PredicateLock */
NULL, /* pgstat */
multixact_twophase_recover /* MultiXact */
};
@@ -32,6 +34,7 @@ const TwoPhaseCallback twophase_postcommit_callbacks[TWOPHASE_RM_MAX_ID + 1] =
{
NULL, /* END ID */
lock_twophase_postcommit, /* Lock */
+ NULL, /* PredicateLock */
pgstat_twophase_postcommit, /* pgstat */
multixact_twophase_postcommit /* MultiXact */
};
@@ -40,6 +43,7 @@ const TwoPhaseCallback twophase_postabort_callbacks[TWOPHASE_RM_MAX_ID + 1] =
{
NULL, /* END ID */
lock_twophase_postabort, /* Lock */
+ NULL, /* PredicateLock */
pgstat_twophase_postabort, /* pgstat */
multixact_twophase_postabort /* MultiXact */
};
@@ -48,6 +52,7 @@ const TwoPhaseCallback twophase_standby_recover_callbacks[TWOPHASE_RM_MAX_ID + 1
{
NULL, /* END ID */
lock_twophase_standby_recover, /* Lock */
+ NULL, /* PredicateLock */
NULL, /* pgstat */
NULL /* MultiXact */
};
diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c
index 4a829480f2..a5ff7537b3 100644
--- a/src/backend/access/transam/varsup.c
+++ b/src/backend/access/transam/varsup.c
@@ -3,11 +3,11 @@
* varsup.c
* postgres OID & XID variables support routines
*
- * Copyright (c) 2000-2010, PostgreSQL Global Development Group
+ * Copyright (c) 2000-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 2010-2011 Nippon Telegraph and Telephone Corporation
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/transam/varsup.c,v 1.91 2010/02/26 02:00:34 momjian Exp $
+ * src/backend/access/transam/varsup.c
*
*-------------------------------------------------------------------------
*/
@@ -17,6 +17,7 @@
#include "access/clog.h"
#include "access/subtrans.h"
#include "access/transam.h"
+#include "access/xact.h"
#include "commands/dbcommands.h"
#include "miscadmin.h"
#include "postmaster/autovacuum.h"
@@ -512,13 +513,22 @@ SetTransactionIdLimit(TransactionId oldest_datfrozenxid, Oid oldest_datoid)
/* Give an immediate warning if past the wrap warn point */
if (TransactionIdFollowsOrEquals(curXid, xidWarnLimit) && !InRecovery)
{
- char *oldest_datname = get_database_name(oldest_datoid);
+ char *oldest_datname;
/*
- * Note: it's possible that get_database_name fails and returns NULL,
- * for example because the database just got dropped. We'll still
- * warn, even though the warning might now be unnecessary.
+ * We can be called when not inside a transaction, for example during
+ * StartupXLOG(). In such a case we cannot do database access, so we
+ * must just report the oldest DB's OID.
+ *
+ * Note: it's also possible that get_database_name fails and returns
+ * NULL, for example because the database just got dropped. We'll
+ * still warn, even though the warning might now be unnecessary.
*/
+ if (IsTransactionState())
+ oldest_datname = get_database_name(oldest_datoid);
+ else
+ oldest_datname = NULL;
+
if (oldest_datname)
ereport(WARNING,
(errmsg("database \"%s\" must be vacuumed within %u transactions",
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 982f500c50..941f6aa312 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -5,13 +5,13 @@
*
* See src/backend/access/transam/README for more information.
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2011 Nippon Telegraph and Telephone Corporation
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.293 2010/07/06 19:18:55 momjian Exp $
+ * src/backend/access/transam/xact.c
*
*-------------------------------------------------------------------------
*/
@@ -46,9 +46,12 @@
#include "libpq/be-fsstubs.h"
#include "miscadmin.h"
#include "pgstat.h"
+#include "replication/walsender.h"
+#include "replication/syncrep.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/lmgr.h"
+#include "storage/predicate.h"
#include "storage/procarray.h"
#include "storage/sinvaladt.h"
#include "storage/smgr.h"
@@ -71,7 +74,10 @@ int XactIsoLevel;
bool DefaultXactReadOnly = false;
bool XactReadOnly;
-bool XactSyncCommit = true;
+bool DefaultXactDeferrable = false;
+bool XactDeferrable;
+
+int synchronous_commit = SYNCHRONOUS_COMMIT_ON;
int CommitDelay = 0; /* precommit delay in microseconds */
int CommitSiblings = 5; /* # concurrent xacts needed to sleep */
@@ -498,10 +504,32 @@ AssignTransactionId(TransactionState s)
/*
* Ensure parent(s) have XIDs, so that a child always has an XID later
- * than its parent.
+ * than its parent. Mustn't recurse here, or we might get a stack overflow
+ * if we're at the bottom of a huge stack of subtransactions none of which
+ * have XIDs yet.
*/
if (isSubXact && !TransactionIdIsValid(s->parent->transactionId))
- AssignTransactionId(s->parent);
+ {
+ TransactionState p = s->parent;
+ TransactionState *parents;
+ size_t parentOffset = 0;
+
+ parents = palloc(sizeof(TransactionState) * s->nestingLevel);
+ while (p != NULL && !TransactionIdIsValid(p->transactionId))
+ {
+ parents[parentOffset++] = p;
+ p = p->parent;
+ }
+
+ /*
+ * This is technically a recursive call, but the recursion will never
+ * be more than one layer deep.
+ */
+ while (parentOffset != 0)
+ AssignTransactionId(parents[--parentOffset]);
+
+ pfree(parents);
+ }
/*
* Generate a new Xid and record it in PG_PROC and pg_subtrans.
@@ -538,6 +566,13 @@ AssignTransactionId(TransactionState s)
SubTransSetParent(s->transactionId, s->parent->transactionId, false);
/*
+ * If it's a top-level transaction, the predicate locking system needs to
+ * be told about it too.
+ */
+ if (!isSubXact)
+ RegisterPredicateLockingXid(s->transactionId);
+
+ /*
* Acquire lock on the transaction XID. (We assume this cannot block.) We
* have to ensure that the lock is assigned to the transaction's own
* ResourceOwner.
@@ -874,17 +909,6 @@ CommandCounterIncrement(void)
*/
AtCCI_LocalCache();
}
-
- /*
- * Make any other backends' catalog changes visible to me.
- *
- * XXX this is probably in the wrong place: CommandCounterIncrement should
- * be purely a local operation, most likely. However fooling with this
- * will affect asynchronous cross-backend interactions, which doesn't seem
- * like a wise thing to do in late beta, so save improving this for
- * another day - tgl 2007-11-30
- */
- AtStart_Cache();
}
/*
@@ -1058,18 +1082,20 @@ RecordTransactionCommit(void)
TransactionId latestXid = InvalidTransactionId;
int nrels;
RelFileNode *rels;
- bool haveNonTemp;
int nchildren;
TransactionId *children;
- int nmsgs;
+ int nmsgs = 0;
SharedInvalidationMessage *invalMessages = NULL;
- bool RelcacheInitFileInval;
+ bool RelcacheInitFileInval = false;
+ bool wrote_xlog;
/* Get data needed for commit record */
- nrels = smgrGetPendingDeletes(true, &rels, &haveNonTemp);
+ nrels = smgrGetPendingDeletes(true, &rels);
nchildren = xactGetCommittedChildren(&children);
- nmsgs = xactGetCommittedInvalidationMessages(&invalMessages,
- &RelcacheInitFileInval);
+ if (XLogStandbyInfoActive())
+ nmsgs = xactGetCommittedInvalidationMessages(&invalMessages,
+ &RelcacheInitFileInval);
+ wrote_xlog = (XactLastRecEnd.xrecoff != 0);
/*
* If we haven't been assigned an XID yet, we neither can, nor do we want
@@ -1096,7 +1122,7 @@ RecordTransactionCommit(void)
* assigned is a sequence advance record due to nextval() --- we want
* to flush that to disk before reporting commit.)
*/
- if (XactLastRecEnd.xrecoff == 0)
+ if (!wrote_xlog)
goto cleanup;
}
else
@@ -1189,16 +1215,30 @@ RecordTransactionCommit(void)
}
/*
- * Check if we want to commit asynchronously. If the user has set
- * synchronous_commit = off, and we're not doing cleanup of any non-temp
- * rels nor committing any command that wanted to force sync commit, then
- * we can defer flushing XLOG. (We must not allow asynchronous commit if
- * there are any non-temp tables to be deleted, because we might delete
- * the files before the COMMIT record is flushed to disk. We do allow
- * asynchronous commit if all to-be-deleted tables are temporary though,
- * since they are lost anyway if we crash.)
- */
- if (XactSyncCommit || forceSyncCommit || haveNonTemp)
+ * Check if we want to commit asynchronously. We can allow the XLOG flush
+ * to happen asynchronously if synchronous_commit=off, or if the current
+ * transaction has not performed any WAL-logged operation. The latter
+ * case can arise if the current transaction wrote only to temporary
+ * and/or unlogged tables. In case of a crash, the loss of such a
+ * transaction will be irrelevant since temp tables will be lost anyway,
+ * and unlogged tables will be truncated. (Given the foregoing, you might
+ * think that it would be unnecessary to emit the XLOG record at all in
+ * this case, but we don't currently try to do that. It would certainly
+ * cause problems at least in Hot Standby mode, where the
+ * KnownAssignedXids machinery requires tracking every XID assignment. It
+ * might be OK to skip it only when wal_level < hot_standby, but for now
+ * we don't.)
+ *
+ * However, if we're doing cleanup of any non-temp rels or committing any
+ * command that wanted to force sync commit, then we must flush XLOG
+ * immediately. (We must not allow asynchronous commit if there are any
+ * non-temp tables to be deleted, because we might delete the files before
+ * the COMMIT record is flushed to disk. We do allow asynchronous commit
+ * if all to-be-deleted tables are temporary though, since they are lost
+ * anyway if we crash.)
+ */
+ if ((wrote_xlog && synchronous_commit > SYNCHRONOUS_COMMIT_OFF) ||
+ forceSyncCommit || nrels > 0)
{
/*
* Synchronous commit case:
@@ -1213,12 +1253,19 @@ RecordTransactionCommit(void)
* fewer than CommitSiblings other backends with active transactions.
*/
if (CommitDelay > 0 && enableFsync &&
- CountActiveBackends() >= CommitSiblings)
+ MinimumActiveBackends(CommitSiblings))
pg_usleep(CommitDelay);
XLogFlush(XactLastRecEnd);
/*
+ * Wake up all walsenders to send WAL up to the COMMIT record
+ * immediately if replication is enabled
+ */
+ if (max_wal_senders > 0)
+ WalSndWakeup();
+
+ /*
* Now we may update the CLOG, if we wrote a COMMIT record above
*/
if (markXidCommitted)
@@ -1237,7 +1284,7 @@ RecordTransactionCommit(void)
* Report the latest async commit LSN, so that the WAL writer knows to
* flush this commit.
*/
- XLogSetAsyncCommitLSN(XactLastRecEnd);
+ XLogSetAsyncXactLSN(XactLastRecEnd);
/*
* We must not immediately update the CLOG, since we didn't flush the
@@ -1261,6 +1308,14 @@ RecordTransactionCommit(void)
/* Compute latestXid while we have the child XIDs handy */
latestXid = TransactionIdLatest(xid, nchildren, children);
+ /*
+ * Wait for synchronous replication, if required.
+ *
+ * Note that at this stage we have marked clog, but still show as running
+ * in the procarray and continue to hold locks.
+ */
+ SyncRepWaitForLSN(XactLastRecEnd);
+
/* Reset XactLastRecEnd until the next transaction writes something */
XactLastRecEnd.xrecoff = 0;
@@ -1484,7 +1539,7 @@ RecordTransactionAbort(bool isSubXact)
xid);
/* Fetch the data we need for the abort record */
- nrels = smgrGetPendingDeletes(false, &rels, NULL);
+ nrels = smgrGetPendingDeletes(false, &rels);
nchildren = xactGetCommittedChildren(&children);
/* XXX do we really need a critical section here? */
@@ -1540,7 +1595,7 @@ RecordTransactionAbort(bool isSubXact)
* problems occur at that point.
*/
if (!isSubXact)
- XLogSetAsyncCommitLSN(XactLastRecEnd);
+ XLogSetAsyncXactLSN(XactLastRecEnd);
/*
* Mark the transaction aborted in clog. This is not absolutely necessary
@@ -1794,6 +1849,7 @@ StartTransaction(void)
s->startedInRecovery = false;
XactReadOnly = DefaultXactReadOnly;
}
+ XactDeferrable = DefaultXactDeferrable;
XactIsoLevel = DefaultXactIsoLevel;
forceSyncCommit = false;
MyXactAccessedTempRel = false;
@@ -1957,12 +2013,10 @@ CommitTransaction(bool contact_gtm)
#endif
/*
- * Do pre-commit processing (most of this stuff requires database access,
- * and in fact could still cause an error...)
- *
- * It is possible for CommitHoldablePortals to invoke functions that queue
- * deferred triggers, and it's also possible that triggers create holdable
- * cursors. So we have to loop until there's nothing left to do.
+ * Do pre-commit processing that involves calling user-defined code, such
+ * as triggers. Since closing cursors could queue trigger actions,
+ * triggers could open cursors, etc, we have to keep looping until there's
+ * nothing left to do.
*/
#ifdef PGXC
if (!PrepareLocalCoord)
@@ -1976,21 +2030,25 @@ CommitTransaction(bool contact_gtm)
AfterTriggerFireDeferred();
/*
- * Convert any open holdable cursors into static portals. If there
- * weren't any, we are done ... otherwise loop back to check if they
- * queued deferred triggers. Lather, rinse, repeat.
+ * Close open portals (converting holdable ones into static portals).
+ * If there weren't any, we are done ... otherwise loop back to check
+ * if they queued deferred triggers. Lather, rinse, repeat.
*/
- if (!CommitHoldablePortals())
+ if (!PreCommit_Portals(false))
break;
else
IsHoldableCursor = true;
}
- /* Now we can shut down the deferred-trigger manager */
- AfterTriggerEndXact(true);
+ /*
+ * The remaining actions cannot call any user-defined code, so it's safe
+ * to start shutting down within-transaction services. But note that most
+ * of this stuff could still throw an error, which would switch us into
+ * the transaction-abort path.
+ */
- /* Close any open regular cursors */
- AtCommit_Portals();
+ /* Shut down the deferred-trigger manager */
+ AfterTriggerEndXact(true);
/*
* Let ON COMMIT management do its thing (must happen after closing
@@ -2002,6 +2060,13 @@ CommitTransaction(bool contact_gtm)
AtEOXact_LargeObject(true);
/*
+ * Mark serializable transaction as complete for predicate locking
+ * purposes. This should be done as late as we can put it and still allow
+ * errors to be raised for failure patterns found at commit.
+ */
+ PreCommit_CheckForSerializationFailure();
+
+ /*
* Insert notifications sent by NOTIFY commands into the queue. This
* should be late in the pre-commit sequence to minimize time spent
* holding the notify-insertion lock.
@@ -2268,12 +2333,10 @@ PrepareTransaction(void)
Assert(s->parent == NULL);
/*
- * Do pre-commit processing (most of this stuff requires database access,
- * and in fact could still cause an error...)
- *
- * It is possible for PrepareHoldablePortals to invoke functions that
- * queue deferred triggers, and it's also possible that triggers create
- * holdable cursors. So we have to loop until there's nothing left to do.
+ * Do pre-commit processing that involves calling user-defined code, such
+ * as triggers. Since closing cursors could queue trigger actions,
+ * triggers could open cursors, etc, we have to keep looping until there's
+ * nothing left to do.
*/
for (;;)
{
@@ -2283,19 +2346,23 @@ PrepareTransaction(void)
AfterTriggerFireDeferred();
/*
- * Convert any open holdable cursors into static portals. If there
- * weren't any, we are done ... otherwise loop back to check if they
- * queued deferred triggers. Lather, rinse, repeat.
+ * Close open portals (converting holdable ones into static portals).
+ * If there weren't any, we are done ... otherwise loop back to check
+ * if they queued deferred triggers. Lather, rinse, repeat.
*/
- if (!PrepareHoldablePortals())
+ if (!PreCommit_Portals(true))
break;
}
- /* Now we can shut down the deferred-trigger manager */
- AfterTriggerEndXact(true);
+ /*
+ * The remaining actions cannot call any user-defined code, so it's safe
+ * to start shutting down within-transaction services. But note that most
+ * of this stuff could still throw an error, which would switch us into
+ * the transaction-abort path.
+ */
- /* Close any open regular cursors */
- AtCommit_Portals();
+ /* Shut down the deferred-trigger manager */
+ AfterTriggerEndXact(true);
/*
* Let ON COMMIT management do its thing (must happen after closing
@@ -2306,6 +2373,13 @@ PrepareTransaction(void)
/* close large objects before lower-level cleanup */
AtEOXact_LargeObject(true);
+ /*
+ * Mark serializable transaction as complete for predicate locking
+ * purposes. This should be done as late as we can put it and still allow
+ * errors to be raised for failure patterns found at commit.
+ */
+ PreCommit_CheckForSerializationFailure();
+
/* NOTIFY will be handled below */
/*
@@ -2370,6 +2444,7 @@ PrepareTransaction(void)
AtPrepare_Notify();
AtPrepare_Locks();
+ AtPrepare_PredicateLocks();
AtPrepare_PgStat();
AtPrepare_MultiXact();
AtPrepare_RelationMap();
@@ -2429,6 +2504,7 @@ PrepareTransaction(void)
PostPrepare_MultiXact(xid);
PostPrepare_Locks(xid);
+ PostPrepare_PredicateLocks(xid);
#ifdef PGXC
/*
@@ -5020,7 +5096,7 @@ xact_redo_commit(xl_xact_commit *xlrec, TransactionId xid, XLogRecPtr lsn)
/* Make sure files supposed to be dropped are dropped */
for (i = 0; i < xlrec->nrels; i++)
{
- SMgrRelation srel = smgropen(xlrec->xnodes[i]);
+ SMgrRelation srel = smgropen(xlrec->xnodes[i], InvalidBackendId);
ForkNumber fork;
for (fork = 0; fork <= MAX_FORKNUM; fork++)
@@ -5028,7 +5104,7 @@ xact_redo_commit(xl_xact_commit *xlrec, TransactionId xid, XLogRecPtr lsn)
if (smgrexists(srel, fork))
{
XLogDropRelation(xlrec->xnodes[i], fork);
- smgrdounlink(srel, fork, false, true);
+ smgrdounlink(srel, fork, true);
}
}
smgrclose(srel);
@@ -5069,9 +5145,9 @@ xact_redo_abort(xl_xact_abort *xlrec, TransactionId xid)
sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
max_xid = TransactionIdLatest(xid, xlrec->nsubxacts, sub_xids);
- /* Make sure nextXid is beyond any XID mentioned in the record */
-
/*
+ * Make sure nextXid is beyond any XID mentioned in the record.
+ *
* We don't expect anyone else to modify nextXid, hence we don't need to
* hold a lock while checking this. We still acquire the lock to modify
* it, though.
@@ -5125,7 +5201,7 @@ xact_redo_abort(xl_xact_abort *xlrec, TransactionId xid)
/* Make sure files supposed to be dropped are dropped */
for (i = 0; i < xlrec->nrels; i++)
{
- SMgrRelation srel = smgropen(xlrec->xnodes[i]);
+ SMgrRelation srel = smgropen(xlrec->xnodes[i], InvalidBackendId);
ForkNumber fork;
for (fork = 0; fork <= MAX_FORKNUM; fork++)
@@ -5133,7 +5209,7 @@ xact_redo_abort(xl_xact_abort *xlrec, TransactionId xid)
if (smgrexists(srel, fork))
{
XLogDropRelation(xlrec->xnodes[i], fork);
- smgrdounlink(srel, fork, false, true);
+ smgrdounlink(srel, fork, true);
}
}
smgrclose(srel);
@@ -5207,7 +5283,7 @@ xact_desc_commit(StringInfo buf, xl_xact_commit *xlrec)
appendStringInfo(buf, "; rels:");
for (i = 0; i < xlrec->nrels; i++)
{
- char *path = relpath(xlrec->xnodes[i], MAIN_FORKNUM);
+ char *path = relpathperm(xlrec->xnodes[i], MAIN_FORKNUM);
appendStringInfo(buf, " %s", path);
pfree(path);
@@ -5262,7 +5338,7 @@ xact_desc_abort(StringInfo buf, xl_xact_abort *xlrec)
appendStringInfo(buf, "; rels:");
for (i = 0; i < xlrec->nrels; i++)
{
- char *path = relpath(xlrec->xnodes[i], MAIN_FORKNUM);
+ char *path = relpathperm(xlrec->xnodes[i], MAIN_FORKNUM);
appendStringInfo(buf, " %s", path);
pfree(path);
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 71ee729c1a..11512e0048 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -4,10 +4,10 @@
* PostgreSQL transaction log manager
*
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.430 2010/07/06 19:18:55 momjian Exp $
+ * src/backend/access/transam/xlog.c
*
*-------------------------------------------------------------------------
*/
@@ -49,8 +49,12 @@
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/ipc.h"
+#include "storage/latch.h"
#include "storage/pmsignal.h"
+#include "storage/predicate.h"
+#include "storage/proc.h"
#include "storage/procarray.h"
+#include "storage/reinit.h"
#include "storage/smgr.h"
#include "storage/spin.h"
#include "utils/builtins.h"
@@ -61,16 +65,15 @@
/* File path names (all relative to $PGDATA) */
-#define BACKUP_LABEL_FILE "backup_label"
-#define BACKUP_LABEL_OLD "backup_label.old"
#define RECOVERY_COMMAND_FILE "recovery.conf"
#define RECOVERY_COMMAND_DONE "recovery.done"
+#define PROMOTE_SIGNAL_FILE "promote"
/* User-settable parameters */
int CheckPointSegments = 3;
int wal_keep_segments = 0;
-int XLOGbuffers = 8;
+int XLOGbuffers = -1;
int XLogArchiveTimeout = 0;
bool XLogArchiveMode = false;
char *XLogArchiveCommand = NULL;
@@ -162,6 +165,12 @@ static XLogRecPtr LastRec;
static bool LocalRecoveryInProgress = true;
/*
+ * Local copy of SharedHotStandbyActive variable. False actually means "not
+ * known, need to check the shared state".
+ */
+static bool LocalHotStandbyActive = false;
+
+/*
* Local state for XLogInsertAllowed():
* 1: unconditionally allowed to insert XLOG
* 0: unconditionally not allowed to insert XLOG
@@ -185,18 +194,21 @@ static char *recoveryEndCommand = NULL;
static char *archiveCleanupCommand = NULL;
static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
static bool recoveryTargetInclusive = true;
+static bool recoveryPauseAtTarget = true;
static TransactionId recoveryTargetXid;
static TimestampTz recoveryTargetTime;
static char *recoveryTargetBarrierId;
+static char *recoveryTargetName;
/* options taken from recovery.conf for XLOG streaming */
static bool StandbyMode = false;
static char *PrimaryConnInfo = NULL;
static char *TriggerFile = NULL;
-/* if recoveryStopsHere returns true, it saves actual stop xid/time here */
+/* if recoveryStopsHere returns true, it saves actual stop xid/time/name here */
static TransactionId recoveryStopXid;
static TimestampTz recoveryStopTime;
+static char recoveryStopName[MAXFNAMELEN];
static bool recoveryStopAfter;
/*
@@ -209,6 +221,8 @@ static bool recoveryStopAfter;
*
* recoveryTargetTLI: the desired timeline that we want to end in.
*
+ * recoveryTargetIsLatest: was the requested target timeline 'latest'?
+ *
* expectedTLIs: an integer list of recoveryTargetTLI and the TLIs of
* its known parents, newest first (so recoveryTargetTLI is always the
* first list member). Only these TLIs are expected to be seen in the WAL
@@ -222,6 +236,7 @@ static bool recoveryStopAfter;
* to decrease.
*/
static TimeLineID recoveryTargetTLI;
+static bool recoveryTargetIsLatest = false;
static List *expectedTLIs;
static TimeLineID curFileTLI;
@@ -341,6 +356,17 @@ typedef struct XLogCtlInsert
char *currpos; /* current insertion point in cache */
XLogRecPtr RedoRecPtr; /* current redo point for insertions */
bool forcePageWrites; /* forcing full-page writes for PITR? */
+
+ /*
+ * exclusiveBackup is true if a backup started with pg_start_backup() is
+ * in progress, and nonExclusiveBackups is a counter indicating the number
+ * of streaming base backups currently in progress. forcePageWrites is set
+ * to true when either of these is non-zero. lastBackupStart is the latest
+ * checkpoint redo location used as a starting point for an online backup.
+ */
+ bool exclusiveBackup;
+ int nonExclusiveBackups;
+ XLogRecPtr lastBackupStart;
} XLogCtlInsert;
/*
@@ -366,7 +392,7 @@ typedef struct XLogCtlData
XLogwrtResult LogwrtResult;
uint32 ckptXidEpoch; /* nextXID & epoch of latest checkpoint */
TransactionId ckptXid;
- XLogRecPtr asyncCommitLSN; /* LSN of newest async commit */
+ XLogRecPtr asyncXactLSN; /* LSN of newest async commit/abort */
uint32 lastRemovedLog; /* latest removed/recycled XLOG segment */
uint32 lastRemovedSeg;
@@ -397,6 +423,19 @@ typedef struct XLogCtlData
bool SharedRecoveryInProgress;
/*
+ * SharedHotStandbyActive indicates if we're still in crash or archive
+ * recovery. Protected by info_lck.
+ */
+ bool SharedHotStandbyActive;
+
+ /*
+ * recoveryWakeupLatch is used to wake up the startup process to continue
+ * WAL replay, if it is waiting for WAL to arrive or failover trigger file
+ * to appear.
+ */
+ Latch recoveryWakeupLatch;
+
+ /*
* During recovery, we keep a copy of the latest checkpoint record here.
* Used by the background writer when it wants to create a restartpoint.
*
@@ -411,6 +450,8 @@ typedef struct XLogCtlData
XLogRecPtr recoveryLastRecPtr;
/* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
TimestampTz recoveryLastXTime;
+ /* Are we requested to pause recovery? */
+ bool recoveryPause;
slock_t info_lck; /* locks shared variables shown above */
} XLogCtlData;
@@ -536,11 +577,19 @@ typedef struct xl_parameter_change
int wal_level;
} xl_parameter_change;
+/* logs restore point */
+typedef struct xl_restore_point
+{
+ TimestampTz rp_time;
+ char rp_name[MAXFNAMELEN];
+} xl_restore_point;
+
/*
* Flags set by interrupt handlers for later service in the redo loop.
*/
static volatile sig_atomic_t got_SIGHUP = false;
static volatile sig_atomic_t shutdown_requested = false;
+static volatile sig_atomic_t promote_triggered = false;
/*
* Flag set when executing a restore command, to tell SIGTERM signal handler
@@ -558,6 +607,9 @@ static void readRecoveryCommandFile(void);
static void exitArchiveRecovery(TimeLineID endTLI,
uint32 endLogId, uint32 endLogSeg);
static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
+static void recoveryPausesHere(void);
+static bool RecoveryIsPaused(void);
+static void SetRecoveryPause(bool recoveryPause);
static void SetLatestXTime(TimestampTz xtime);
static TimestampTz GetLatestXTime(void);
static void CheckRequiredParameterValues(void);
@@ -597,6 +649,7 @@ static bool ValidXLOGHeader(XLogPageHeader hdr, int emode);
static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt);
static List *readTimeLineHistory(TimeLineID targetTLI);
static bool existsTimeLineHistory(TimeLineID probeTLI);
+static bool rescanLatestTimeLine(void);
static TimeLineID findNewestTimeLine(TimeLineID startTLI);
static void writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
TimeLineID endTLI,
@@ -1878,18 +1931,18 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch)
}
/*
- * Record the LSN for an asynchronous transaction commit.
- * (This should not be called for aborts, nor for synchronous commits.)
+ * Record the LSN for an asynchronous transaction commit/abort.
+ * (This should not be called for synchronous commits.)
*/
void
-XLogSetAsyncCommitLSN(XLogRecPtr asyncCommitLSN)
+XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
{
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
- if (XLByteLT(xlogctl->asyncCommitLSN, asyncCommitLSN))
- xlogctl->asyncCommitLSN = asyncCommitLSN;
+ if (XLByteLT(xlogctl->asyncXactLSN, asyncXactLSN))
+ xlogctl->asyncXactLSN = asyncXactLSN;
SpinLockRelease(&xlogctl->info_lck);
}
@@ -2138,7 +2191,7 @@ XLogBackgroundFlush(void)
volatile XLogCtlData *xlogctl = XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
- WriteRqstPtr = xlogctl->asyncCommitLSN;
+ WriteRqstPtr = xlogctl->asyncXactLSN;
SpinLockRelease(&xlogctl->info_lck);
flexible = false; /* ensure it all gets written */
}
@@ -3083,10 +3136,10 @@ not_available:
*
* 'command' is the shell command to be executed, 'commandName' is a
* human-readable name describing the command emitted in the logs. If
- * 'failonSignal' is true and the command is killed by a signal, a FATAL
+ * 'failOnSignal' is true and the command is killed by a signal, a FATAL
* error is thrown. Otherwise a WARNING is emitted.
*
- * This is currently used for restore_end_command and archive_cleanup_command.
+ * This is currently used for recovery_end_command and archive_cleanup_command.
*/
static void
ExecuteRecoveryCommand(char *command, char *commandName, bool failOnSignal)
@@ -3215,7 +3268,7 @@ PreallocXlogFiles(XLogRecPtr endptr)
/*
* Get the log/seg of the latest removed or recycled WAL segment.
- * Returns 0 if no WAL segments have been removed since startup.
+ * Returns 0/0 if no WAL segments have been removed since startup.
*/
void
XLogGetLastRemoved(uint32 *log, uint32 *seg)
@@ -3276,9 +3329,6 @@ RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr)
#endif
struct stat statbuf;
- elog(DEBUG2, "removing WAL segments older than log file %u, segment %u",
- log, seg);
-
/*
* Initialize info about where to try to recycle to. We allow recycling
* segments up to XLOGfileslop segments beyond the current XLOG location.
@@ -3295,6 +3345,9 @@ RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr)
XLogFileName(lastoff, ThisTimeLineID, log, seg);
+ elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
+ lastoff);
+
while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
{
/*
@@ -3447,7 +3500,7 @@ ValidateXLOGDirectoryStructure(void)
{
ereport(LOG,
(errmsg("creating missing WAL directory \"%s\"", path)));
- if (mkdir(path, 0700) < 0)
+ if (mkdir(path, S_IRWXU) < 0)
ereport(FATAL,
(errmsg("could not create missing directory \"%s\": %m",
path)));
@@ -3679,23 +3732,32 @@ ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt)
RecPtr = &tmpRecPtr;
/*
- * Align recptr to next page if no more records can fit on the current
- * page.
+ * RecPtr is pointing to end+1 of the previous WAL record. We must
+ * advance it if necessary to where the next record starts. First,
+ * align to next page if no more records can fit on the current page.
*/
if (XLOG_BLCKSZ - (RecPtr->xrecoff % XLOG_BLCKSZ) < SizeOfXLogRecord)
- {
- NextLogPage(tmpRecPtr);
- /* We will account for page header size below */
- }
+ NextLogPage(*RecPtr);
- if (tmpRecPtr.xrecoff >= XLogFileSize)
+ /* Check for crossing of xlog segment boundary */
+ if (RecPtr->xrecoff >= XLogFileSize)
{
- (tmpRecPtr.xlogid)++;
- tmpRecPtr.xrecoff = 0;
+ (RecPtr->xlogid)++;
+ RecPtr->xrecoff = 0;
}
+
+ /*
+ * If at page start, we must skip over the page header. But we can't
+ * do that until we've read in the page, since the header size is
+ * variable.
+ */
}
else
{
+ /*
+ * In this case, the passed-in record pointer should already be
+ * pointing to a valid record starting position.
+ */
if (!XRecOffIsValid(RecPtr->xrecoff))
ereport(PANIC,
(errmsg("invalid record offset at %X/%X",
@@ -3724,11 +3786,13 @@ retry:
if (targetRecOff == 0)
{
/*
- * Can only get here in the continuing-from-prev-page case, because
- * XRecOffIsValid eliminated the zero-page-offset case otherwise. Need
- * to skip over the new page's header.
+ * At page start, so skip over page header. The Assert checks that
+ * we're not scribbling on caller's record pointer; it's OK because we
+ * can only get here in the continuing-from-prev-record case, since
+ * XRecOffIsValid rejected the zero-page-offset case otherwise.
*/
- tmpRecPtr.xrecoff += pageHeaderSize;
+ Assert(RecPtr == &tmpRecPtr);
+ RecPtr->xrecoff += pageHeaderSize;
targetRecOff = pageHeaderSize;
}
else if (targetRecOff < pageHeaderSize)
@@ -4214,6 +4278,64 @@ existsTimeLineHistory(TimeLineID probeTLI)
}
/*
+ * Scan for new timelines that might have appeared in the archive since we
+ * started recovery.
+ *
+ * If there are any, the function changes recovery target TLI to the latest
+ * one and returns 'true'.
+ */
+static bool
+rescanLatestTimeLine(void)
+{
+ TimeLineID newtarget;
+
+ newtarget = findNewestTimeLine(recoveryTargetTLI);
+ if (newtarget != recoveryTargetTLI)
+ {
+ /*
+ * Determine the list of expected TLIs for the new TLI
+ */
+ List *newExpectedTLIs;
+
+ newExpectedTLIs = readTimeLineHistory(newtarget);
+
+ /*
+ * If the current timeline is not part of the history of the new
+ * timeline, we cannot proceed to it.
+ *
+ * XXX This isn't foolproof: The new timeline might have forked from
+ * the current one, but before the current recovery location. In that
+ * case we will still switch to the new timeline and proceed replaying
+ * from it even though the history doesn't match what we already
+ * replayed. That's not good. We will likely notice at the next online
+ * checkpoint, as the TLI won't match what we expected, but it's not
+ * guaranteed. The admin needs to make sure that doesn't happen.
+ */
+ if (!list_member_int(newExpectedTLIs,
+ (int) recoveryTargetTLI))
+ ereport(LOG,
+ (errmsg("new timeline %u is not a child of database system timeline %u",
+ newtarget,
+ ThisTimeLineID)));
+ else
+ {
+ /* Switch target */
+ recoveryTargetTLI = newtarget;
+ list_free(expectedTLIs);
+ expectedTLIs = newExpectedTLIs;
+
+ XLogCtl->RecoveryTargetTLI = recoveryTargetTLI;
+
+ ereport(LOG,
+ (errmsg("new target timeline is %u",
+ recoveryTargetTLI)));
+ return true;
+ }
+ }
+ return false;
+}
+
+/*
* Find the newest existing timeline, assuming that startTLI exists.
*
* Note: while this is somewhat heuristic, it does positively guarantee
@@ -4365,7 +4487,7 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
xlogfname,
recoveryStopAfter ? "after" : "before",
recoveryStopXid);
- if (recoveryTarget == RECOVERY_TARGET_TIME)
+ else if (recoveryTarget == RECOVERY_TARGET_TIME)
snprintf(buffer, sizeof(buffer),
"%s%u\t%s\t%s %s\n",
(srcfd < 0) ? "" : "\n",
@@ -4383,6 +4505,13 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
recoveryStopAfter ? "after" : "before",
recoveryTargetBarrierId);
#endif
+ else if (recoveryTarget == RECOVERY_TARGET_NAME)
+ snprintf(buffer, sizeof(buffer),
+ "%s%u\t%s\tat restore point \"%s\"\n",
+ (srcfd < 0) ? "" : "\n",
+ parentTLI,
+ xlogfname,
+ recoveryStopName);
else
snprintf(buffer, sizeof(buffer),
"%s%u\t%s\tno recovery target specified\n",
@@ -4783,6 +4912,65 @@ GetSystemIdentifier(void)
}
/*
+ * Auto-tune the number of XLOG buffers.
+ *
+ * The preferred setting for wal_buffers is about 3% of shared_buffers, with
+ * a maximum of one XLOG segment (there is little reason to think that more
+ * is helpful, at least so long as we force an fsync when switching log files)
+ * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
+ * 9.1, when auto-tuning was added).
+ *
+ * This should not be called until NBuffers has received its final value.
+ */
+static int
+XLOGChooseNumBuffers(void)
+{
+ int xbuffers;
+
+ xbuffers = NBuffers / 32;
+ if (xbuffers > XLOG_SEG_SIZE / XLOG_BLCKSZ)
+ xbuffers = XLOG_SEG_SIZE / XLOG_BLCKSZ;
+ if (xbuffers < 8)
+ xbuffers = 8;
+ return xbuffers;
+}
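
To make the auto-tune rule above concrete, here is the arithmetic for a few shared_buffers settings; this is only an illustration and assumes the stock 8 kB XLOG_BLCKSZ and 16 MB XLOG_SEG_SIZE:

/*
 * shared_buffers = 32 MB -> NBuffers = 4096   -> 4096/32   = 128 pages (1 MB)
 * shared_buffers = 1 GB  -> NBuffers = 131072 -> 131072/32 = 4096 pages,
 *                           clamped to XLOG_SEG_SIZE/XLOG_BLCKSZ = 2048 (16 MB)
 * shared_buffers = 1 MB  -> NBuffers = 128    -> 128/32    = 4 pages, raised
 *                           to the minimum of 8 (64 kB)
 */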
+
+/*
+ * GUC check_hook for wal_buffers
+ */
+bool
+check_wal_buffers(int *newval, void **extra, GucSource source)
+{
+ /*
+ * -1 indicates a request for auto-tune.
+ */
+ if (*newval == -1)
+ {
+ /*
+ * If we haven't yet changed the boot_val default of -1, just let it
+ * be. We'll fix it when XLOGShmemSize is called.
+ */
+ if (XLOGbuffers == -1)
+ return true;
+
+ /* Otherwise, substitute the auto-tune value */
+ *newval = XLOGChooseNumBuffers();
+ }
+
+ /*
+ * We clamp manually-set values to at least 4 blocks. Prior to PostgreSQL
+ * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
+ * the case, we just silently treat such values as a request for the
+ * minimum. (We could throw an error instead, but that doesn't seem very
+ * helpful.)
+ */
+ if (*newval < 4)
+ *newval = 4;
+
+ return true;
+}
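
A brief sketch of how the hook above interacts with the GUC machinery (the configuration values are invented):

/*
 * wal_buffers = -1    boot default; the hook leaves -1 in place, and
 *                     XLOGShmemSize() later substitutes the value computed
 *                     by XLOGChooseNumBuffers() via SetConfigOption().
 * wal_buffers = 2     below the floor; silently raised to 4 XLOG blocks.
 * wal_buffers = 512   accepted as-is (4 MB with 8 kB blocks).
 */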
+
+/*
* Initialization of shared memory for XLOG
*/
Size
@@ -4790,6 +4978,21 @@ XLOGShmemSize(void)
{
Size size;
+ /*
+ * If the value of wal_buffers is -1, use the preferred auto-tune value.
+ * This isn't an amazingly clean place to do this, but we must wait till
+ * NBuffers has received its final value, and must do it before using the
+ * value of XLOGbuffers to do anything important.
+ */
+ if (XLOGbuffers == -1)
+ {
+ char buf[32];
+
+ snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
+ SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
+ }
+ Assert(XLOGbuffers > 0);
+
/* XLogCtl */
size = sizeof(XLogCtlData);
/* xlblocks array */
@@ -4852,8 +5055,10 @@ XLOGShmemInit(void)
*/
XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
XLogCtl->SharedRecoveryInProgress = true;
+ XLogCtl->SharedHotStandbyActive = false;
XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
SpinLockInit(&XLogCtl->info_lck);
+ InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
/*
* If we are not in bootstrap mode, pg_control should already exist. Read
@@ -4905,9 +5110,15 @@ BootStrapXLOG(void)
page = (XLogPageHeader) TYPEALIGN(ALIGNOF_XLOG_BUFFER, buffer);
memset(page, 0, XLOG_BLCKSZ);
- /* Set up information for the initial checkpoint record */
+ /*
+ * Set up information for the initial checkpoint record
+ *
+ * The initial checkpoint record is written to the beginning of the WAL
+ * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
+ * used, so that we can use 0/0 to mean "before any valid WAL segment".
+ */
checkPoint.redo.xlogid = 0;
- checkPoint.redo.xrecoff = SizeOfXLogLongPHD;
+ checkPoint.redo.xrecoff = XLogSegSize + SizeOfXLogLongPHD;
checkPoint.ThisTimeLineID = ThisTimeLineID;
checkPoint.nextXidEpoch = 0;
checkPoint.nextXid = FirstNormalTransactionId;
@@ -4930,7 +5141,7 @@ BootStrapXLOG(void)
page->xlp_info = XLP_LONG_HEADER;
page->xlp_tli = ThisTimeLineID;
page->xlp_pageaddr.xlogid = 0;
- page->xlp_pageaddr.xrecoff = 0;
+ page->xlp_pageaddr.xrecoff = XLogSegSize;
longpage = (XLogLongPageHeader) page;
longpage->xlp_sysid = sysidentifier;
longpage->xlp_seg_size = XLogSegSize;
@@ -4956,7 +5167,7 @@ BootStrapXLOG(void)
/* Create first XLOG segment file */
use_existent = false;
- openLogFile = XLogFileInit(0, 0, &use_existent, false);
+ openLogFile = XLogFileInit(0, 1, &use_existent, false);
/* Write the first page with the initial record */
errno = 0;
@@ -5023,115 +5234,20 @@ str_time(pg_time_t tnow)
}
/*
- * Parse one line from recovery.conf. 'cmdline' is the raw line from the
- * file. If the line is parsed successfully, returns true, false indicates
- * syntax error. On success, *key_p and *value_p are set to the parameter
- * name and value on the line, respectively. If the line is an empty line,
- * consisting entirely of whitespace and comments, function returns true
- * and *keyp_p and *value_p are set to NULL.
- *
- * The pointers returned in *key_p and *value_p point to an internal buffer
- * that is valid only until the next call of parseRecoveryCommandFile().
- */
-static bool
-parseRecoveryCommandFileLine(char *cmdline, char **key_p, char **value_p)
-{
- char *ptr;
- char *bufp;
- char *key;
- char *value;
- static char *buf = NULL;
-
- *key_p = *value_p = NULL;
-
- /*
- * Allocate the buffer on first use. It's used to hold both the parameter
- * name and value.
- */
- if (buf == NULL)
- buf = malloc(MAXPGPATH + 1);
- bufp = buf;
-
- /* Skip any whitespace at the beginning of line */
- for (ptr = cmdline; *ptr; ptr++)
- {
- if (!isspace((unsigned char) *ptr))
- break;
- }
- /* Ignore empty lines */
- if (*ptr == '\0' || *ptr == '#')
- return true;
-
- /* Read the parameter name */
- key = bufp;
- while (*ptr && !isspace((unsigned char) *ptr) &&
- *ptr != '=' && *ptr != '\'')
- *(bufp++) = *(ptr++);
- *(bufp++) = '\0';
-
- /* Skip to the beginning quote of the parameter value */
- ptr = strchr(ptr, '\'');
- if (!ptr)
- return false;
- ptr++;
-
- /* Read the parameter value to *bufp. Collapse any '' escapes as we go. */
- value = bufp;
- for (;;)
- {
- if (*ptr == '\'')
- {
- ptr++;
- if (*ptr == '\'')
- *(bufp++) = '\'';
- else
- {
- /* end of parameter */
- *bufp = '\0';
- break;
- }
- }
- else if (*ptr == '\0')
- return false; /* unterminated quoted string */
- else
- *(bufp++) = *ptr;
-
- ptr++;
- }
- *(bufp++) = '\0';
-
- /* Check that there's no garbage after the value */
- while (*ptr)
- {
- if (*ptr == '#')
- break;
- if (!isspace((unsigned char) *ptr))
- return false;
- ptr++;
- }
-
- /* Success! */
- *key_p = key;
- *value_p = value;
- return true;
-}
-
-/*
* See if there is a recovery command file (recovery.conf), and if so
* read in parameters for archive recovery and XLOG streaming.
*
- * XXX longer term intention is to expand this to
- * cater for additional parameters and controls
- * possibly use a flex lexer similar to the GUC one
+ * The file is parsed using the main configuration parser.
*/
static void
readRecoveryCommandFile(void)
{
FILE *fd;
- char cmdline[MAXPGPATH];
TimeLineID rtli = 0;
bool rtliGiven = false;
- bool syntaxError = false;
+ ConfigVariable *item,
+ *head = NULL,
+ *tail = NULL;
fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
if (fd == NULL)
@@ -5145,55 +5261,56 @@ readRecoveryCommandFile(void)
}
/*
- * Parse the file...
+ * Since we're asking ParseConfigFp() to error out at FATAL, there's no
+ * need to check the return value.
*/
- while (fgets(cmdline, sizeof(cmdline), fd) != NULL)
- {
- char *tok1;
- char *tok2;
+ ParseConfigFp(fd, RECOVERY_COMMAND_FILE, 0, FATAL, &head, &tail);
- if (!parseRecoveryCommandFileLine(cmdline, &tok1, &tok2))
- {
- syntaxError = true;
- break;
- }
- if (tok1 == NULL)
- continue;
-
- if (strcmp(tok1, "restore_command") == 0)
+ for (item = head; item; item = item->next)
+ {
+ if (strcmp(item->name, "restore_command") == 0)
{
- recoveryRestoreCommand = pstrdup(tok2);
+ recoveryRestoreCommand = pstrdup(item->value);
ereport(DEBUG2,
(errmsg("restore_command = '%s'",
recoveryRestoreCommand)));
}
- else if (strcmp(tok1, "recovery_end_command") == 0)
+ else if (strcmp(item->name, "recovery_end_command") == 0)
{
- recoveryEndCommand = pstrdup(tok2);
+ recoveryEndCommand = pstrdup(item->value);
ereport(DEBUG2,
(errmsg("recovery_end_command = '%s'",
recoveryEndCommand)));
}
- else if (strcmp(tok1, "archive_cleanup_command") == 0)
+ else if (strcmp(item->name, "archive_cleanup_command") == 0)
{
- archiveCleanupCommand = pstrdup(tok2);
+ archiveCleanupCommand = pstrdup(item->value);
ereport(DEBUG2,
(errmsg("archive_cleanup_command = '%s'",
archiveCleanupCommand)));
}
- else if (strcmp(tok1, "recovery_target_timeline") == 0)
+ else if (strcmp(item->name, "pause_at_recovery_target") == 0)
+ {
+ if (!parse_bool(item->value, &recoveryPauseAtTarget))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("parameter \"%s\" requires a Boolean value", "pause_at_recovery_target")));
+ ereport(DEBUG2,
+ (errmsg("pause_at_recovery_target = '%s'", item->value)));
+ }
+ else if (strcmp(item->name, "recovery_target_timeline") == 0)
{
rtliGiven = true;
- if (strcmp(tok2, "latest") == 0)
+ if (strcmp(item->value, "latest") == 0)
rtli = 0;
else
{
errno = 0;
- rtli = (TimeLineID) strtoul(tok2, NULL, 0);
+ rtli = (TimeLineID) strtoul(item->value, NULL, 0);
if (errno == EINVAL || errno == ERANGE)
ereport(FATAL,
(errmsg("recovery_target_timeline is not a valid number: \"%s\"",
- tok2)));
+ item->value)));
}
if (rtli)
ereport(DEBUG2,
@@ -5202,26 +5319,27 @@ readRecoveryCommandFile(void)
ereport(DEBUG2,
(errmsg("recovery_target_timeline = latest")));
}
- else if (strcmp(tok1, "recovery_target_xid") == 0)
+ else if (strcmp(item->name, "recovery_target_xid") == 0)
{
errno = 0;
- recoveryTargetXid = (TransactionId) strtoul(tok2, NULL, 0);
+ recoveryTargetXid = (TransactionId) strtoul(item->value, NULL, 0);
if (errno == EINVAL || errno == ERANGE)
ereport(FATAL,
(errmsg("recovery_target_xid is not a valid number: \"%s\"",
- tok2)));
+ item->value)));
ereport(DEBUG2,
(errmsg("recovery_target_xid = %u",
recoveryTargetXid)));
recoveryTarget = RECOVERY_TARGET_XID;
}
- else if (strcmp(tok1, "recovery_target_time") == 0)
+ else if (strcmp(item->name, "recovery_target_time") == 0)
{
/*
- * if recovery_target_xid specified, then this overrides
- * recovery_target_time
+ * if recovery_target_xid or recovery_target_name specified, then
+ * this overrides recovery_target_time
*/
- if (recoveryTarget == RECOVERY_TARGET_XID)
+ if (recoveryTarget == RECOVERY_TARGET_XID ||
+ recoveryTarget == RECOVERY_TARGET_NAME)
continue;
recoveryTarget = RECOVERY_TARGET_TIME;
@@ -5230,7 +5348,7 @@ readRecoveryCommandFile(void)
*/
recoveryTargetTime =
DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
- CStringGetDatum(tok2),
+ CStringGetDatum(item->value),
ObjectIdGetDatum(InvalidOid),
Int32GetDatum(-1)));
ereport(DEBUG2,
@@ -5238,43 +5356,63 @@ readRecoveryCommandFile(void)
timestamptz_to_str(recoveryTargetTime))));
}
#ifdef PGXC
- else if (strcmp(tok1, "recovery_target_barrier") == 0)
+ else if (strcmp(item->name, "recovery_target_barrier") == 0)
{
recoveryTarget = RECOVERY_TARGET_BARRIER;
- recoveryTargetBarrierId = pstrdup(tok2);
+ recoveryTargetBarrierId = pstrdup(item->value);
}
#endif
- else if (strcmp(tok1, "recovery_target_inclusive") == 0)
+ else if (strcmp(item->name, "recovery_target_name") == 0)
+ {
+ /*
+ * if recovery_target_xid specified, then this overrides
+ * recovery_target_name
+ */
+ if (recoveryTarget == RECOVERY_TARGET_XID)
+ continue;
+ recoveryTarget = RECOVERY_TARGET_NAME;
+
+ recoveryTargetName = pstrdup(item->value);
+ if (strlen(recoveryTargetName) >= MAXFNAMELEN)
+ ereport(FATAL,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("recovery_target_name is too long (maximum %d characters)", MAXFNAMELEN - 1)));
+
+ ereport(DEBUG2,
+ (errmsg("recovery_target_name = '%s'",
+ recoveryTargetName)));
+ }
+ else if (strcmp(item->name, "recovery_target_inclusive") == 0)
{
/*
* does nothing if a recovery_target is not also set
*/
- if (!parse_bool(tok2, &recoveryTargetInclusive))
+ if (!parse_bool(item->value, &recoveryTargetInclusive))
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("parameter \"recovery_target_inclusive\" requires a Boolean value")));
+ errmsg("parameter \"%s\" requires a Boolean value", "recovery_target_inclusive")));
ereport(DEBUG2,
- (errmsg("recovery_target_inclusive = %s", tok2)));
+ (errmsg("recovery_target_inclusive = %s", item->value)));
}
- else if (strcmp(tok1, "standby_mode") == 0)
+ else if (strcmp(item->name, "standby_mode") == 0)
{
- if (!parse_bool(tok2, &StandbyMode))
+ if (!parse_bool(item->value, &StandbyMode))
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("parameter \"standby_mode\" requires a Boolean value")));
+ errmsg("parameter \"%s\" requires a Boolean value", "standby_mode")));
ereport(DEBUG2,
- (errmsg("standby_mode = '%s'", tok2)));
+ (errmsg("standby_mode = '%s'", item->value)));
}
- else if (strcmp(tok1, "primary_conninfo") == 0)
+ else if (strcmp(item->name, "primary_conninfo") == 0)
{
- PrimaryConnInfo = pstrdup(tok2);
+ PrimaryConnInfo = pstrdup(item->value);
ereport(DEBUG2,
(errmsg("primary_conninfo = '%s'",
PrimaryConnInfo)));
}
- else if (strcmp(tok1, "trigger_file") == 0)
+ else if (strcmp(item->name, "trigger_file") == 0)
{
- TriggerFile = pstrdup(tok2);
+ TriggerFile = pstrdup(item->value);
ereport(DEBUG2,
(errmsg("trigger_file = '%s'",
TriggerFile)));
@@ -5282,17 +5420,9 @@ readRecoveryCommandFile(void)
else
ereport(FATAL,
(errmsg("unrecognized recovery parameter \"%s\"",
- tok1)));
+ item->name)));
}
- FreeFile(fd);
-
- if (syntaxError)
- ereport(FATAL,
- (errmsg("syntax error in recovery command file: %s",
- cmdline),
- errhint("Lines should have the format parameter = 'value'.")));
-
/*
* Check for compulsory parameters
*/
@@ -5331,13 +5461,18 @@ readRecoveryCommandFile(void)
(errmsg("recovery target timeline %u does not exist",
rtli)));
recoveryTargetTLI = rtli;
+ recoveryTargetIsLatest = false;
}
else
{
/* We start the "latest" search from pg_control's timeline */
recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
+ recoveryTargetIsLatest = true;
}
}
+
+ FreeConfigVariables(head);
+ FreeFile(fd);
}
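
Since the file now goes through the regular configuration-file parser, recovery.conf follows the familiar name = 'value' syntax; a minimal example handled by the code above (values invented for illustration) could be:

/*
 * restore_command = 'cp /mnt/archive/%f "%p"'
 * recovery_target_time = '2011-02-01 12:00:00'
 * pause_at_recovery_target = 'true'
 * recovery_target_timeline = 'latest'
 */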
/*
@@ -5463,8 +5598,8 @@ exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
* Returns TRUE if we are stopping, FALSE otherwise. On TRUE return,
* *includeThis is set TRUE if we should apply this record before stopping.
*
- * We also track the timestamp of the latest applied COMMIT/ABORT record
- * in XLogCtl->recoveryLastXTime, for logging purposes.
+ * We also track the timestamp of the latest applied COMMIT/ABORT
+ * record in XLogCtl->recoveryLastXTime, for logging purposes.
* Also, some information is saved in recoveryStopXid et al for use in
* annotating the new timeline's history file.
*/
@@ -5478,13 +5613,16 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis)
#endif
uint8 record_info;
TimestampTz recordXtime;
+ char recordRPName[MAXFNAMELEN];
#ifdef PGXC
/* We only consider stopping at COMMIT, ABORT or BARRIER records */
- if ((record->xl_rmid != RM_XACT_ID) && (record->xl_rmid != RM_BARRIER_ID))
+ if (record->xl_rmid != RM_XACT_ID &&
+ record->xl_rmid != RM_BARRIER_ID &&
+ record->xl_rmid != RM_XLOG_ID)
#else
- /* We only consider stopping at COMMIT or ABORT records */
- if (record->xl_rmid != RM_XACT_ID)
+ /* We only consider stopping at COMMIT, ABORT or RESTORE POINT records */
+ if (record->xl_rmid != RM_XACT_ID && record->xl_rmid != RM_XLOG_ID)
#endif
return false;
@@ -5493,14 +5631,14 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis)
if (record->xl_rmid == RM_XACT_ID)
{
#endif
- if (record_info == XLOG_XACT_COMMIT)
+ if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT)
{
xl_xact_commit *recordXactCommitData;
recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
recordXtime = recordXactCommitData->xact_time;
}
- else if (record_info == XLOG_XACT_ABORT)
+ else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_ABORT)
{
xl_xact_abort *recordXactAbortData;
@@ -5519,20 +5657,33 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis)
}
}
#endif
+ else if (record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT)
+ {
+ xl_restore_point *recordRestorePointData;
+
+ recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
+ recordXtime = recordRestorePointData->rp_time;
+ strncpy(recordRPName, recordRestorePointData->rp_name, MAXFNAMELEN);
+ }
else
return false;
/* Do we have a PITR target at all? */
if (recoveryTarget == RECOVERY_TARGET_UNSET)
{
- SetLatestXTime(recordXtime);
+ /*
+ * Save timestamp of latest transaction commit/abort if this is a
+ * transaction record
+ */
+ if (record->xl_rmid == RM_XACT_ID)
+ SetLatestXTime(recordXtime);
return false;
}
if (recoveryTarget == RECOVERY_TARGET_XID)
{
/*
- * there can be only one transaction end record with this exact
+ * There can be only one transaction end record with this exact
* transactionid
*
* when testing for an xid, we MUST test for equality only, since
@@ -5559,10 +5710,24 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis)
}
}
#endif
+ else if (recoveryTarget == RECOVERY_TARGET_NAME)
+ {
+ /*
+ * There can be many restore points that share the same name, so we
+ * stop at the first one
+ */
+ stopsHere = (strcmp(recordRPName, recoveryTargetName) == 0);
+
+ /*
+ * Ignore recoveryTargetInclusive because this is not a transaction
+ * record
+ */
+ *includeThis = false;
+ }
else
{
/*
- * there can be many transactions that share the same commit time, so
+ * There can be many transactions that share the same commit time, so
* we stop after the last one, if we are inclusive, or stop at the
* first one if we are exclusive
*/
@@ -5593,7 +5758,7 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis)
recoveryStopXid,
timestamptz_to_str(recoveryStopTime))));
}
- else
+ else if (record_info == XLOG_XACT_ABORT)
{
if (recoveryStopAfter)
ereport(LOG,
@@ -5606,8 +5771,22 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis)
recoveryStopXid,
timestamptz_to_str(recoveryStopTime))));
}
+ else
+ {
+ strncpy(recoveryStopName, recordRPName, MAXFNAMELEN);
- if (recoveryStopAfter)
+ ereport(LOG,
+ (errmsg("recovery stopping at restore point \"%s\", time %s",
+ recoveryStopName,
+ timestamptz_to_str(recoveryStopTime))));
+ }
+
+ /*
+ * Note that if we use a RECOVERY_TARGET_TIME then we can stop at a
+ * restore point since they are timestamped, though the latest
+ * transaction time is not updated.
+ */
+ if (record->xl_rmid == RM_XACT_ID && recoveryStopAfter)
SetLatestXTime(recordXtime);
}
#ifdef PGXC
@@ -5621,13 +5800,121 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis)
return true;
}
#endif
- else
+ else if (record->xl_rmid == RM_XACT_ID)
SetLatestXTime(recordXtime);
return stopsHere;
}
/*
+ * Wait here until the shared recoveryPause flag is cleared, polling once per second.
+ *
+ * XXX Can also be done with shared latch.
+ */
+static void
+recoveryPausesHere(void)
+{
+ ereport(LOG,
+ (errmsg("recovery has paused"),
+ errhint("Execute pg_xlog_replay_resume() to continue.")));
+
+ while (RecoveryIsPaused())
+ {
+ pg_usleep(1000000L); /* 1000 ms */
+ HandleStartupProcInterrupts();
+ }
+}
+
+static bool
+RecoveryIsPaused(void)
+{
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+ bool recoveryPause;
+
+ SpinLockAcquire(&xlogctl->info_lck);
+ recoveryPause = xlogctl->recoveryPause;
+ SpinLockRelease(&xlogctl->info_lck);
+
+ return recoveryPause;
+}
+
+static void
+SetRecoveryPause(bool recoveryPause)
+{
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+
+ SpinLockAcquire(&xlogctl->info_lck);
+ xlogctl->recoveryPause = recoveryPause;
+ SpinLockRelease(&xlogctl->info_lck);
+}
+
+/*
+ * pg_xlog_replay_pause - pause recovery now
+ */
+Datum
+pg_xlog_replay_pause(PG_FUNCTION_ARGS)
+{
+ if (!superuser())
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ (errmsg("must be superuser to control recovery"))));
+
+ if (!RecoveryInProgress())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("recovery is not in progress"),
+ errhint("Recovery control functions can only be executed during recovery.")));
+
+ SetRecoveryPause(true);
+
+ PG_RETURN_VOID();
+}
+
+/*
+ * pg_xlog_replay_resume - resume recovery now
+ */
+Datum
+pg_xlog_replay_resume(PG_FUNCTION_ARGS)
+{
+ if (!superuser())
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ (errmsg("must be superuser to control recovery"))));
+
+ if (!RecoveryInProgress())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("recovery is not in progress"),
+ errhint("Recovery control functions can only be executed during recovery.")));
+
+ SetRecoveryPause(false);
+
+ PG_RETURN_VOID();
+}
+
+/*
+ * pg_is_xlog_replay_paused
+ */
+Datum
+pg_is_xlog_replay_paused(PG_FUNCTION_ARGS)
+{
+ if (!superuser())
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ (errmsg("must be superuser to control recovery"))));
+
+ if (!RecoveryInProgress())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("recovery is not in progress"),
+ errhint("Recovery control functions can only be executed during recovery.")));
+
+ PG_RETURN_BOOL(RecoveryIsPaused());
+}
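
Taken together, the three SQL-callable functions above give a simple operator workflow on a hot-standby server; a hypothetical superuser session might run:

/*
 * SELECT pg_xlog_replay_pause();      -- replay halts after the current record
 * SELECT pg_is_xlog_replay_paused();  -- returns true while paused
 * SELECT pg_xlog_replay_resume();     -- replay picks up again
 */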
+
+/*
* Save timestamp of latest processed commit/abort record.
*
* We keep this in XLogCtl, not a simple static variable, so that it can be
@@ -5663,6 +5950,24 @@ GetLatestXTime(void)
}
/*
+ * Returns timestamp of latest processed commit/abort record.
+ *
+ * When the server has been started normally without recovery the function
+ * returns NULL.
+ */
+Datum
+pg_last_xact_replay_timestamp(PG_FUNCTION_ARGS)
+{
+ TimestampTz xtime;
+
+ xtime = GetLatestXTime();
+ if (xtime == 0)
+ PG_RETURN_NULL();
+
+ PG_RETURN_TIMESTAMPTZ(xtime);
+}
+
+/*
* Returns bool with current recovery mode, a global state.
*/
Datum
@@ -5888,11 +6193,22 @@ StartupXLOG(void)
(errmsg("starting point-in-time recovery to barrier %s",
(recoveryTargetBarrierId))));
#endif
+ else if (recoveryTarget == RECOVERY_TARGET_NAME)
+ ereport(LOG,
+ (errmsg("starting point-in-time recovery to \"%s\"",
+ recoveryTargetName)));
else
ereport(LOG,
(errmsg("starting archive recovery")));
}
+ /*
+ * Take ownership of the wakeup latch if we're going to sleep during
+ * recovery.
+ */
+ if (StandbyMode)
+ OwnLatch(&XLogCtl->recoveryWakeupLatch);
+
if (read_backup_label(&checkPointLoc))
{
/*
@@ -5902,16 +6218,33 @@ StartupXLOG(void)
record = ReadCheckpointRecord(checkPointLoc, 0);
if (record != NULL)
{
+ memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
+ wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
ereport(DEBUG1,
(errmsg("checkpoint record is at %X/%X",
checkPointLoc.xlogid, checkPointLoc.xrecoff)));
InRecovery = true; /* force recovery even if SHUTDOWNED */
+
+ /*
+ * Make sure that REDO location exists. This may not be the case
+ * if there was a crash during an online backup, which left a
+ * backup_label around that references a WAL segment that's
+ * already been archived.
+ */
+ if (XLByteLT(checkPoint.redo, checkPointLoc))
+ {
+ if (!ReadRecord(&(checkPoint.redo), LOG, false))
+ ereport(FATAL,
+ (errmsg("could not find redo location referenced by checkpoint record"),
+ errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
+ }
}
else
{
- ereport(PANIC,
+ ereport(FATAL,
(errmsg("could not locate required checkpoint record"),
errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
+ wasShutdown = false; /* keep compiler quiet */
}
/* set flag to delete it later */
haveBackupLabel = true;
@@ -5955,11 +6288,11 @@ StartupXLOG(void)
ereport(PANIC,
(errmsg("could not locate a valid checkpoint record")));
}
+ memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
+ wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
}
LastRec = RecPtr = checkPointLoc;
- memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
- wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
ereport(DEBUG1,
(errmsg("redo record is at %X/%X; shutdown %s",
@@ -6052,8 +6385,7 @@ StartupXLOG(void)
}
/*
- * set backupStartupPoint if we're starting archive recovery from a
- * base backup
+ * set backupStartPoint if we're starting recovery from a base backup
*/
if (haveBackupLabel)
ControlFile->backupStartPoint = checkPoint.redo;
@@ -6091,6 +6423,14 @@ StartupXLOG(void)
CheckRequiredParameterValues();
/*
+ * We're in recovery, so unlogged relations may be trashed
+ * and must be reset. This should be done BEFORE allowing Hot Standby
+ * connections, so that read-only backends don't try to read whatever
+ * garbage is left over from before.
+ */
+ ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
+
+ /*
* Initialize for Hot Standby, if enabled. We won't let backends in
* yet, not until we've reached the min recovery point specified in
* control file and we've established a recovery snapshot from a
@@ -6117,8 +6457,6 @@ StartupXLOG(void)
StartupSUBTRANS(oldestActiveXID);
StartupMultiXact();
- ProcArrayInitRecoveryInfo(oldestActiveXID);
-
/*
* If we're beginning at a shutdown checkpoint, we know that
* nothing was running on the master at this point. So fake-up an
@@ -6175,6 +6513,7 @@ StartupXLOG(void)
xlogctl->replayEndRecPtr = ReadRecPtr;
xlogctl->recoveryLastRecPtr = ReadRecPtr;
xlogctl->recoveryLastXTime = 0;
+ xlogctl->recoveryPause = false;
SpinLockRelease(&xlogctl->info_lck);
/* Also ensure XLogReceiptTime has a sane value */
@@ -6193,6 +6532,7 @@ StartupXLOG(void)
*/
if (InArchiveRecovery && IsUnderPostmaster)
{
+ PublishStartupProcessInformation();
SetForwardFsyncRequests();
SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
bgwriterLaunched = true;
@@ -6223,6 +6563,7 @@ StartupXLOG(void)
{
bool recoveryContinue = true;
bool recoveryApply = true;
+ bool recoveryPause = false;
ErrorContextCallback errcontext;
TimestampTz xtime;
@@ -6269,6 +6610,15 @@ StartupXLOG(void)
*/
if (recoveryStopsHere(record, &recoveryApply))
{
+ /*
+ * Pause only if users can connect to send a resume
+ * message
+ */
+ if (recoveryPauseAtTarget && standbyState == STANDBY_SNAPSHOT_READY)
+ {
+ SetRecoveryPause(true);
+ recoveryPausesHere();
+ }
reachedStopPoint = true; /* see below */
recoveryContinue = false;
if (!recoveryApply)
@@ -6295,9 +6645,16 @@ StartupXLOG(void)
*/
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->replayEndRecPtr = EndRecPtr;
+ recoveryPause = xlogctl->recoveryPause;
SpinLockRelease(&xlogctl->info_lck);
/*
+ * Pause only if users can connect to send a resume message
+ */
+ if (recoveryPause && standbyState == STANDBY_SNAPSHOT_READY)
+ recoveryPausesHere();
+
+ /*
* If we are attempting to enter Hot Standby mode, process
* XIDs we see
*/
@@ -6346,12 +6703,18 @@ StartupXLOG(void)
}
/*
- * If we launched a WAL receiver, it should be gone by now. It will trump
- * over the startup checkpoint and subsequent records if it's still alive,
- * so be extra sure that it's gone.
+ * Kill WAL receiver, if it's still running, before we continue to write
+ * the startup checkpoint record. It will trump over the checkpoint and
+ * subsequent records if it's still alive when we start writing WAL.
+ */
+ ShutdownWalRcv();
+
+ /*
+ * We don't need the latch anymore. It's not strictly necessary to disown
+ * it, but let's do it for the sake of tidiness.
*/
- if (WalRcvInProgress())
- elog(PANIC, "wal receiver still active");
+ if (StandbyMode)
+ DisownLatch(&XLogCtl->recoveryWakeupLatch);
/*
* We are now done reading the xlog from stream. Turn off streaming
@@ -6375,16 +6738,36 @@ StartupXLOG(void)
* be further ahead --- ControlFile->minRecoveryPoint cannot have been
* advanced beyond the WAL we processed.
*/
- if (InArchiveRecovery &&
+ if (InRecovery &&
(XLByteLT(EndOfLog, minRecoveryPoint) ||
!XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
{
- if (reachedStopPoint) /* stopped because of stop request */
+ if (reachedStopPoint)
+ {
+ /* stopped because of stop request */
ereport(FATAL,
(errmsg("requested recovery stop point is before consistent recovery point")));
- else /* ran off end of WAL */
- ereport(FATAL,
- (errmsg("WAL ends before consistent recovery point")));
+ }
+
+ /*
+ * Ran off end of WAL before reaching end-of-backup WAL record, or
+ * minRecoveryPoint. That's usually a bad sign, indicating that you
+ * tried to recover from an online backup but never called
+ * pg_stop_backup(), or you didn't archive all the WAL up to that
+ * point. However, this also happens in crash recovery, if the system
+ * crashes while an online backup is in progress. We must not treat
+ * that as an error, or the database will refuse to start up.
+ */
+ if (InArchiveRecovery)
+ {
+ if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
+ ereport(FATAL,
+ (errmsg("WAL ends before end of online backup"),
+ errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
+ else
+ ereport(FATAL,
+ (errmsg("WAL ends before consistent recovery point")));
+ }
}
/*
@@ -6539,6 +6922,14 @@ StartupXLOG(void)
PreallocXlogFiles(EndOfLog);
/*
+ * Reset initial contents of unlogged relations. This has to be done
+ * AFTER recovery is complete so that any unlogged relations created
+ * during recovery also get picked up.
+ */
+ if (InRecovery)
+ ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
+
+ /*
* Okay, we're officially UP.
*/
InRecovery = false;
@@ -6630,8 +7021,6 @@ StartupXLOG(void)
static void
CheckRecoveryConsistency(void)
{
- static bool backendsAllowed = false;
-
/*
* Have we passed our safe starting point?
*/
@@ -6651,11 +7040,19 @@ CheckRecoveryConsistency(void)
* enabling connections.
*/
if (standbyState == STANDBY_SNAPSHOT_READY &&
- !backendsAllowed &&
+ !LocalHotStandbyActive &&
reachedMinRecoveryPoint &&
IsUnderPostmaster)
{
- backendsAllowed = true;
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+
+ SpinLockAcquire(&xlogctl->info_lck);
+ xlogctl->SharedHotStandbyActive = true;
+ SpinLockRelease(&xlogctl->info_lck);
+
+ LocalHotStandbyActive = true;
+
SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
}
}
@@ -6703,6 +7100,38 @@ RecoveryInProgress(void)
}
/*
+ * Is HotStandby active yet? This is only important in special backends
+ * since normal backends won't ever be able to connect until this returns
+ * true. Postmaster knows this by way of signal, not via shared memory.
+ *
+ * Unlike testing standbyState, this works in any process that's connected to
+ * shared memory.
+ */
+bool
+HotStandbyActive(void)
+{
+ /*
+ * We check shared state each time only until Hot Standby is active. We
+ * can't de-activate Hot Standby, so there's no need to keep checking
+ * after the shared variable has once been seen true.
+ */
+ if (LocalHotStandbyActive)
+ return true;
+ else
+ {
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+
+ /* spinlock is essential on machines with weak memory ordering! */
+ SpinLockAcquire(&xlogctl->info_lck);
+ LocalHotStandbyActive = xlogctl->SharedHotStandbyActive;
+ SpinLockRelease(&xlogctl->info_lck);
+
+ return LocalHotStandbyActive;
+ }
+}
+
+/*
* Is this process allowed to insert new WAL records?
*
* Ordinarily this is essentially equivalent to !RecoveryInProgress().
@@ -7078,10 +7507,15 @@ LogCheckpointEnd(bool restartpoint)
{
long write_secs,
sync_secs,
- total_secs;
+ total_secs,
+ longest_secs,
+ average_secs;
int write_usecs,
sync_usecs,
- total_usecs;
+ total_usecs,
+ longest_usecs,
+ average_usecs;
+ uint64 average_sync_time;
CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
@@ -7097,18 +7531,43 @@ LogCheckpointEnd(bool restartpoint)
CheckpointStats.ckpt_sync_end_t,
&sync_secs, &sync_usecs);
+ /*
+ * Timing values returned from CheckpointStats are in microseconds.
+ * Convert to the second plus microsecond form that TimestampDifference
+ * returns for homogeneous printing.
+ */
+ longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000000);
+ longest_usecs = CheckpointStats.ckpt_longest_sync -
+ (uint64) longest_secs *1000000;
+
+ average_sync_time = 0;
+ if (CheckpointStats.ckpt_sync_rels > 0)
+ average_sync_time = CheckpointStats.ckpt_agg_sync_time /
+ CheckpointStats.ckpt_sync_rels;
+ average_secs = (long) (average_sync_time / 1000000);
+ average_usecs = average_sync_time - (uint64) average_secs *1000000;
+
if (restartpoint)
elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
- "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
+ "%d transaction log file(s) added, %d removed, %d recycled; "
+ "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
+ "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
CheckpointStats.ckpt_bufs_written,
(double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
+ CheckpointStats.ckpt_segs_added,
+ CheckpointStats.ckpt_segs_removed,
+ CheckpointStats.ckpt_segs_recycled,
write_secs, write_usecs / 1000,
sync_secs, sync_usecs / 1000,
- total_secs, total_usecs / 1000);
+ total_secs, total_usecs / 1000,
+ CheckpointStats.ckpt_sync_rels,
+ longest_secs, longest_usecs / 1000,
+ average_secs, average_usecs / 1000);
else
elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
"%d transaction log file(s) added, %d removed, %d recycled; "
- "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
+ "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
+ "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
CheckpointStats.ckpt_bufs_written,
(double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
CheckpointStats.ckpt_segs_added,
@@ -7116,7 +7575,10 @@ LogCheckpointEnd(bool restartpoint)
CheckpointStats.ckpt_segs_recycled,
write_secs, write_usecs / 1000,
sync_secs, sync_usecs / 1000,
- total_secs, total_usecs / 1000);
+ total_secs, total_usecs / 1000,
+ CheckpointStats.ckpt_sync_rels,
+ longest_secs, longest_usecs / 1000,
+ average_secs, average_usecs / 1000);
}
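
A quick worked example of the microsecond-to-seconds split performed above (numbers invented):

/*
 * ckpt_longest_sync = 1234567 us
 *   longest_secs  = 1234567 / 1000000     = 1
 *   longest_usecs = 1234567 - 1 * 1000000 = 234567
 *   printed via "%ld.%03d s" as longest=1.234 s (234567 / 1000 = 234)
 */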
/*
@@ -7570,6 +8032,7 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
CheckPointCLOG();
CheckPointSUBTRANS();
CheckPointMultiXact();
+ CheckPointPredicate();
CheckPointRelationMap();
CheckPointBuffers(flags); /* performs all required fsyncs */
/* We deliberately delay 2PC checkpointing as long as possible */
@@ -7721,16 +8184,18 @@ CreateRestartPoint(int flags)
SpinLockRelease(&xlogctl->info_lck);
LWLockRelease(WALInsertLock);
- if (log_checkpoints)
- {
- /*
- * Prepare to accumulate statistics.
- */
- MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
- CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
+ /*
+ * Prepare to accumulate statistics.
+ *
+ * Note: because it is possible for log_checkpoints to change while a
+ * checkpoint proceeds, we always accumulate stats, even if
+ * log_checkpoints is currently off.
+ */
+ MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
+ CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
+ if (log_checkpoints)
LogCheckpointStart(flags, true);
- }
CheckPointGuts(lastCheckPoint.redo, flags);
@@ -7785,10 +8250,14 @@ CreateRestartPoint(int flags)
}
/*
- * Currently, there is no need to truncate pg_subtrans during recovery. If
- * we did do that, we will need to have called StartupSUBTRANS() already
- * and then TruncateSUBTRANS() would go here.
+ * Truncate pg_subtrans if possible. We can throw away all data before
+ * the oldest XMIN of any running transaction. No future transaction will
+ * attempt to reference any pg_subtrans entry older than that (see Asserts
+ * in subtrans.c). When hot standby is disabled, though, we mustn't do
+ * this because StartupSUBTRANS hasn't been called yet.
*/
+ if (EnableHotStandby)
+ TruncateSUBTRANS(GetOldestXmin(true, false));
/* All real work is done, but log before releasing lock. */
if (log_checkpoints)
@@ -7876,6 +8345,33 @@ RequestXLogSwitch(void)
}
/*
+ * Write a RESTORE POINT record
+ */
+XLogRecPtr
+XLogRestorePoint(const char *rpName)
+{
+ XLogRecPtr RecPtr;
+ XLogRecData rdata;
+ xl_restore_point xlrec;
+
+ xlrec.rp_time = GetCurrentTimestamp();
+ strncpy(xlrec.rp_name, rpName, MAXFNAMELEN);
+
+ rdata.buffer = InvalidBuffer;
+ rdata.data = (char *) &xlrec;
+ rdata.len = sizeof(xl_restore_point);
+ rdata.next = NULL;
+
+ RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT, &rdata);
+
+ ereport(LOG,
+ (errmsg("restore point \"%s\" created at %X/%X",
+ rpName, RecPtr.xlogid, RecPtr.xrecoff)));
+
+ return RecPtr;
+}
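
For reference, the record payload used here is just a timestamp plus a fixed-size name; judging from the fields accessed in this patch, the struct is assumed to look like this (it is declared in the headers, which are not shown in this part of the diff):

typedef struct xl_restore_point
{
	TimestampTz rp_time;              /* time the restore point was created */
	char        rp_name[MAXFNAMELEN]; /* NUL-padded restore point name */
} xl_restore_point;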
+
+/*
* Check if any of the GUC parameters that are critical for hot standby
* have changed, and update the value in pg_control file if necessary.
*/
@@ -8067,6 +8563,10 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
{
/* nothing to do here */
}
+ else if (info == XLOG_RESTORE_POINT)
+ {
+ /* nothing to do here */
+ }
else if (info == XLOG_BACKUP_END)
{
XLogRecPtr startpoint;
@@ -8169,6 +8669,13 @@ xlog_desc(StringInfo buf, uint8 xl_info, char *rec)
{
appendStringInfo(buf, "xlog switch");
}
+ else if (info == XLOG_RESTORE_POINT)
+ {
+ xl_restore_point *xlrec = (xl_restore_point *) rec;
+
+ appendStringInfo(buf, "restore point: %s", xlrec->rp_name);
+ }
else if (info == XLOG_BACKUP_END)
{
XLogRecPtr startpoint;
@@ -8246,7 +8753,7 @@ get_sync_bit(int method)
/*
* Optimize writes by bypassing kernel cache with O_DIRECT when using
- * O_SYNC, O_DSYNC or O_FSYNC. But only if archiving and streaming are
+ * O_SYNC/O_FSYNC and O_DSYNC. But only if archiving and streaming are
* disabled, otherwise the archive command or walsender process will read
* the WAL soon after writing it, which is guaranteed to cause a physical
* read if we bypassed the kernel cache. We also skip the
@@ -8291,12 +8798,9 @@ get_sync_bit(int method)
/*
* GUC support
*/
-bool
-assign_xlog_sync_method(int new_sync_method, bool doit, GucSource source)
+void
+assign_xlog_sync_method(int new_sync_method, void *extra)
{
- if (!doit)
- return true;
-
if (sync_method != new_sync_method)
{
/*
@@ -8316,8 +8820,6 @@ assign_xlog_sync_method(int new_sync_method, bool doit, GucSource source)
XLogFileClose();
}
}
-
- return true;
}
@@ -8383,6 +8885,43 @@ pg_start_backup(PG_FUNCTION_ARGS)
text *backupid = PG_GETARG_TEXT_P(0);
bool fast = PG_GETARG_BOOL(1);
char *backupidstr;
+ XLogRecPtr startpoint;
+ char startxlogstr[MAXFNAMELEN];
+
+ backupidstr = text_to_cstring(backupid);
+
+ startpoint = do_pg_start_backup(backupidstr, fast, NULL);
+
+ snprintf(startxlogstr, sizeof(startxlogstr), "%X/%X",
+ startpoint.xlogid, startpoint.xrecoff);
+ PG_RETURN_TEXT_P(cstring_to_text(startxlogstr));
+}
+
+/*
+ * do_pg_start_backup is the workhorse of the user-visible pg_start_backup()
+ * function. It creates the necessary starting checkpoint and constructs the
+ * backup label file.
+ *
+ * There are two kinds of backups: exclusive and non-exclusive. An exclusive
+ * backup is started with pg_start_backup(), and there can be only one active
+ * at a time. The backup label file of an exclusive backup is written to
+ * $PGDATA/backup_label, and it is removed by pg_stop_backup().
+ *
+ * A non-exclusive backup is used for streaming base backups (see
+ * src/backend/replication/basebackup.c). The difference from exclusive backups
+ * is that the backup label file is not written to disk. Instead, its would-be
+ * contents are returned in *labelfile, and the caller is responsible for
+ * including it in the backup archive as 'backup_label'. There can be many
+ * non-exclusive backups active at the same time, and they don't conflict
+ * with an exclusive backup either.
+ *
+ * Every successfully started non-exclusive backup must be stopped by calling
+ * do_pg_stop_backup() or do_pg_abort_backup().
+ */
+XLogRecPtr
+do_pg_start_backup(const char *backupidstr, bool fast, char **labelfile)
+{
+ bool exclusive = (labelfile == NULL);
XLogRecPtr checkpointloc;
XLogRecPtr startpoint;
pg_time_t stamp_time;
@@ -8392,11 +8931,12 @@ pg_start_backup(PG_FUNCTION_ARGS)
uint32 _logSeg;
struct stat stat_buf;
FILE *fp;
+ StringInfoData labelfbuf;
- if (!superuser())
+ if (!superuser() && !is_authenticated_user_replication_role())
ereport(ERROR,
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
- errmsg("must be superuser to run a backup")));
+ errmsg("must be superuser or replication role to run a backup")));
if (RecoveryInProgress())
ereport(ERROR,
@@ -8410,7 +8950,24 @@ pg_start_backup(PG_FUNCTION_ARGS)
errmsg("WAL level not sufficient for making an online backup"),
errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));
- backupidstr = text_to_cstring(backupid);
+ if (strlen(backupidstr) > MAXPGPATH)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("backup label too long (max %d bytes)",
+ MAXPGPATH)));
+
+ /*
+ * Force an XLOG file switch before the checkpoint, to ensure that the WAL
+ * segment the checkpoint is written to doesn't contain pages with old
+ * timeline IDs. That would otherwise happen if you called
+ * pg_start_backup() right after restoring from a PITR archive: the first
+ * WAL segment containing the startup checkpoint has pages in the
+ * beginning with the old timeline ID. That can cause trouble at recovery:
+ * we won't have a history file covering the old timeline if pg_xlog
+ * directory was not included in the base backup and the WAL archive was
+ * cleared too before starting the backup.
+ */
+ RequestXLogSwitch();
/*
* Mark backup active in shared memory. We must do full-page WAL writes
@@ -8430,122 +8987,168 @@ pg_start_backup(PG_FUNCTION_ARGS)
* ensure adequate interlocking against XLogInsert().
*/
LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
- if (XLogCtl->Insert.forcePageWrites)
+ if (exclusive)
{
- LWLockRelease(WALInsertLock);
- ereport(ERROR,
- (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
- errmsg("a backup is already in progress"),
- errhint("Run pg_stop_backup() and try again.")));
+ if (XLogCtl->Insert.exclusiveBackup)
+ {
+ LWLockRelease(WALInsertLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("a backup is already in progress"),
+ errhint("Run pg_stop_backup() and try again.")));
+ }
+ XLogCtl->Insert.exclusiveBackup = true;
}
+ else
+ XLogCtl->Insert.nonExclusiveBackups++;
XLogCtl->Insert.forcePageWrites = true;
LWLockRelease(WALInsertLock);
- /*
- * Force an XLOG file switch before the checkpoint, to ensure that the WAL
- * segment the checkpoint is written to doesn't contain pages with old
- * timeline IDs. That would otherwise happen if you called
- * pg_start_backup() right after restoring from a PITR archive: the first
- * WAL segment containing the startup checkpoint has pages in the
- * beginning with the old timeline ID. That can cause trouble at recovery:
- * we won't have a history file covering the old timeline if pg_xlog
- * directory was not included in the base backup and the WAL archive was
- * cleared too before starting the backup.
- */
- RequestXLogSwitch();
-
/* Ensure we release forcePageWrites if fail below */
- PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) 0);
+ PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
{
- /*
- * Force a CHECKPOINT. Aside from being necessary to prevent torn
- * page problems, this guarantees that two successive backup runs will
- * have different checkpoint positions and hence different history
- * file names, even if nothing happened in between.
- *
- * We use CHECKPOINT_IMMEDIATE only if requested by user (via passing
- * fast = true). Otherwise this can take awhile.
- */
- RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
- (fast ? CHECKPOINT_IMMEDIATE : 0));
+ bool gotUniqueStartpoint = false;
- /*
- * Now we need to fetch the checkpoint record location, and also its
- * REDO pointer. The oldest point in WAL that would be needed to
- * restore starting from the checkpoint is precisely the REDO pointer.
- */
- LWLockAcquire(ControlFileLock, LW_SHARED);
- checkpointloc = ControlFile->checkPoint;
- startpoint = ControlFile->checkPointCopy.redo;
- LWLockRelease(ControlFileLock);
+ do
+ {
+ /*
+ * Force a CHECKPOINT. Aside from being necessary to prevent torn
+ * page problems, this guarantees that two successive backup runs
+ * will have different checkpoint positions and hence different
+ * history file names, even if nothing happened in between.
+ *
+ * We use CHECKPOINT_IMMEDIATE only if requested by user (via
+ * passing fast = true). Otherwise this can take awhile.
+ */
+ RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
+ (fast ? CHECKPOINT_IMMEDIATE : 0));
+
+ /*
+ * Now we need to fetch the checkpoint record location, and also
+ * its REDO pointer. The oldest point in WAL that would be needed
+ * to restore starting from the checkpoint is precisely the REDO
+ * pointer.
+ */
+ LWLockAcquire(ControlFileLock, LW_SHARED);
+ checkpointloc = ControlFile->checkPoint;
+ startpoint = ControlFile->checkPointCopy.redo;
+ LWLockRelease(ControlFileLock);
+
+ /*
+ * If two base backups are started at the same time (in WAL sender
+ * processes), we need to make sure that they use different
+ * checkpoints as starting locations, because we use the starting
+ * WAL location as a unique identifier for the base backup in the
+ * end-of-backup WAL record and when we write the backup history
+ * file. Perhaps it would be better to generate a separate unique ID
+ * for each backup instead of forcing another checkpoint, but
+ * taking a checkpoint right after another is not that expensive
+ * either because only a few buffers have been dirtied yet.
+ */
+ LWLockAcquire(WALInsertLock, LW_SHARED);
+ if (XLByteLT(XLogCtl->Insert.lastBackupStart, startpoint))
+ {
+ XLogCtl->Insert.lastBackupStart = startpoint;
+ gotUniqueStartpoint = true;
+ }
+ LWLockRelease(WALInsertLock);
+ } while (!gotUniqueStartpoint);
XLByteToSeg(startpoint, _logId, _logSeg);
XLogFileName(xlogfilename, ThisTimeLineID, _logId, _logSeg);
+ /*
+ * Construct backup label file
+ */
+ initStringInfo(&labelfbuf);
+
/* Use the log timezone here, not the session timezone */
stamp_time = (pg_time_t) time(NULL);
pg_strftime(strfbuf, sizeof(strfbuf),
"%Y-%m-%d %H:%M:%S %Z",
pg_localtime(&stamp_time, log_timezone));
+ appendStringInfo(&labelfbuf, "START WAL LOCATION: %X/%X (file %s)\n",
+ startpoint.xlogid, startpoint.xrecoff, xlogfilename);
+ appendStringInfo(&labelfbuf, "CHECKPOINT LOCATION: %X/%X\n",
+ checkpointloc.xlogid, checkpointloc.xrecoff);
+ appendStringInfo(&labelfbuf, "START TIME: %s\n", strfbuf);
+ appendStringInfo(&labelfbuf, "LABEL: %s\n", backupidstr);
/*
- * Check for existing backup label --- implies a backup is already
- * running. (XXX given that we checked forcePageWrites above, maybe
- * it would be OK to just unlink any such label file?)
+ * Okay, write the file, or return its contents to caller.
*/
- if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
+ if (exclusive)
{
- if (errno != ENOENT)
+ /*
+ * Check for existing backup label --- implies a backup is already
+ * running. (XXX given that we checked exclusiveBackup above,
+ * maybe it would be OK to just unlink any such label file?)
+ */
+ if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
+ {
+ if (errno != ENOENT)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not stat file \"%s\": %m",
+ BACKUP_LABEL_FILE)));
+ }
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("a backup is already in progress"),
+ errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
+ BACKUP_LABEL_FILE)));
+
+ fp = AllocateFile(BACKUP_LABEL_FILE, "w");
+
+ if (!fp)
ereport(ERROR,
(errcode_for_file_access(),
- errmsg("could not stat file \"%s\": %m",
+ errmsg("could not create file \"%s\": %m",
BACKUP_LABEL_FILE)));
+ fwrite(labelfbuf.data, labelfbuf.len, 1, fp);
+ if (fflush(fp) || ferror(fp) || FreeFile(fp))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write file \"%s\": %m",
+ BACKUP_LABEL_FILE)));
+ pfree(labelfbuf.data);
}
else
- ereport(ERROR,
- (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
- errmsg("a backup is already in progress"),
- errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
- BACKUP_LABEL_FILE)));
-
- /*
- * Okay, write the file
- */
- fp = AllocateFile(BACKUP_LABEL_FILE, "w");
- if (!fp)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not create file \"%s\": %m",
- BACKUP_LABEL_FILE)));
- fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
- startpoint.xlogid, startpoint.xrecoff, xlogfilename);
- fprintf(fp, "CHECKPOINT LOCATION: %X/%X\n",
- checkpointloc.xlogid, checkpointloc.xrecoff);
- fprintf(fp, "START TIME: %s\n", strfbuf);
- fprintf(fp, "LABEL: %s\n", backupidstr);
- if (fflush(fp) || ferror(fp) || FreeFile(fp))
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not write file \"%s\": %m",
- BACKUP_LABEL_FILE)));
+ *labelfile = labelfbuf.data;
}
- PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) 0);
+ PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
/*
* We're done. As a convenience, return the starting WAL location.
*/
- snprintf(xlogfilename, sizeof(xlogfilename), "%X/%X",
- startpoint.xlogid, startpoint.xrecoff);
- PG_RETURN_TEXT_P(cstring_to_text(xlogfilename));
+ return startpoint;
}
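
To make the exclusive/non-exclusive contract described in the function's header comment concrete, a non-exclusive caller (such as the streaming base backup code) is expected to follow roughly this pattern; this is a sketch only, and the error-cleanup callback name is invented:

/*
 *     char       *labelfile;
 *     XLogRecPtr  startptr;
 *
 *     startptr = do_pg_start_backup("streamed base backup", false, &labelfile);
 *     PG_ENSURE_ERROR_CLEANUP(my_abort_callback, (Datum) 0);  /* calls do_pg_abort_backup() */
 *     {
 *         ... send the data directory, emitting labelfile as "backup_label" ...
 *     }
 *     PG_END_ENSURE_ERROR_CLEANUP(my_abort_callback, (Datum) 0);
 *     do_pg_stop_backup(labelfile, true);
 */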
/* Error cleanup callback for pg_start_backup */
static void
pg_start_backup_callback(int code, Datum arg)
{
- /* Turn off forcePageWrites on failure */
+ bool exclusive = DatumGetBool(arg);
+
+ /* Update backup counters and forcePageWrites on failure */
LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
- XLogCtl->Insert.forcePageWrites = false;
+ if (exclusive)
+ {
+ Assert(XLogCtl->Insert.exclusiveBackup);
+ XLogCtl->Insert.exclusiveBackup = false;
+ }
+ else
+ {
+ Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
+ XLogCtl->Insert.nonExclusiveBackups--;
+ }
+
+ if (!XLogCtl->Insert.exclusiveBackup &&
+ XLogCtl->Insert.nonExclusiveBackups == 0)
+ {
+ XLogCtl->Insert.forcePageWrites = false;
+ }
LWLockRelease(WALInsertLock);
}
@@ -8565,6 +9168,27 @@ pg_start_backup_callback(int code, Datum arg)
Datum
pg_stop_backup(PG_FUNCTION_ARGS)
{
+ XLogRecPtr stoppoint;
+ char stopxlogstr[MAXFNAMELEN];
+
+ stoppoint = do_pg_stop_backup(NULL, true);
+
+ snprintf(stopxlogstr, sizeof(stopxlogstr), "%X/%X",
+ stoppoint.xlogid, stoppoint.xrecoff);
+ PG_RETURN_TEXT_P(cstring_to_text(stopxlogstr));
+}
+
+/*
+ * do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup()
+ * function.
+ *
+ * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
+ * the non-exclusive backup specified by 'labelfile'.
+ */
+XLogRecPtr
+do_pg_stop_backup(char *labelfile, bool waitforarchive)
+{
+ bool exclusive = (labelfile == NULL);
XLogRecPtr startpoint;
XLogRecPtr stoppoint;
XLogRecData rdata;
@@ -8580,15 +9204,15 @@ pg_stop_backup(PG_FUNCTION_ARGS)
FILE *lfp;
FILE *fp;
char ch;
- int ich;
int seconds_before_warning;
int waits = 0;
bool reported_waiting = false;
+ char *remaining;
- if (!superuser())
+ if (!superuser() && !is_authenticated_user_replication_role())
ereport(ERROR,
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
- (errmsg("must be superuser to run a backup"))));
+ (errmsg("must be superuser or replication role to run a backup"))));
if (RecoveryInProgress())
ereport(ERROR,
@@ -8603,38 +9227,88 @@ pg_stop_backup(PG_FUNCTION_ARGS)
errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));
/*
- * OK to clear forcePageWrites
+ * OK to update backup counters and forcePageWrites
*/
LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
- XLogCtl->Insert.forcePageWrites = false;
+ if (exclusive)
+ XLogCtl->Insert.exclusiveBackup = false;
+ else
+ {
+ /*
+ * The user-visible pg_start/stop_backup() functions that operate on
+ * exclusive backups can be called at any time, but for non-exclusive
+ * backups, it is expected that each do_pg_start_backup() call is
+ * matched by exactly one do_pg_stop_backup() call.
+ */
+ Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
+ XLogCtl->Insert.nonExclusiveBackups--;
+ }
+
+ if (!XLogCtl->Insert.exclusiveBackup &&
+ XLogCtl->Insert.nonExclusiveBackups == 0)
+ {
+ XLogCtl->Insert.forcePageWrites = false;
+ }
LWLockRelease(WALInsertLock);
- /*
- * Open the existing label file
- */
- lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
- if (!lfp)
+ if (exclusive)
{
- if (errno != ENOENT)
+ /*
+ * Read the existing label file into memory.
+ */
+ struct stat statbuf;
+ int r;
+
+ if (stat(BACKUP_LABEL_FILE, &statbuf))
+ {
+ if (errno != ENOENT)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not stat file \"%s\": %m",
+ BACKUP_LABEL_FILE)));
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("a backup is not in progress")));
+ }
+
+ lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
+ if (!lfp)
+ {
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not read file \"%s\": %m",
BACKUP_LABEL_FILE)));
- ereport(ERROR,
- (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
- errmsg("a backup is not in progress")));
+ }
+ labelfile = palloc(statbuf.st_size + 1);
+ r = fread(labelfile, statbuf.st_size, 1, lfp);
+ labelfile[statbuf.st_size] = '\0';
+
+ /*
+ * Close and remove the backup label file
+ */
+ if (r != 1 || ferror(lfp) || FreeFile(lfp))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\": %m",
+ BACKUP_LABEL_FILE)));
+ if (unlink(BACKUP_LABEL_FILE) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not remove file \"%s\": %m",
+ BACKUP_LABEL_FILE)));
}
/*
* Read and parse the START WAL LOCATION line (this code is pretty crude,
* but we are not expecting any variability in the file format).
*/
- if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %24s)%c",
+ if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
&startpoint.xlogid, &startpoint.xrecoff, startxlogfilename,
&ch) != 4 || ch != '\n')
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
+ remaining = strchr(labelfile, '\n') + 1; /* %n is not portable enough */
/*
* Write the backup-end xlog record
@@ -8677,8 +9351,7 @@ pg_stop_backup(PG_FUNCTION_ARGS)
fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
stoppoint.xlogid, stoppoint.xrecoff, stopxlogfilename);
/* transfer remaining lines from label to history file */
- while ((ich = fgetc(lfp)) != EOF)
- fputc(ich, fp);
+ fprintf(fp, "%s", remaining);
fprintf(fp, "STOP TIME: %s\n", strfbuf);
if (fflush(fp) || ferror(fp) || FreeFile(fp))
ereport(ERROR,
@@ -8687,20 +9360,6 @@ pg_stop_backup(PG_FUNCTION_ARGS)
histfilepath)));
/*
- * Close and remove the backup label file
- */
- if (ferror(lfp) || FreeFile(lfp))
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not read file \"%s\": %m",
- BACKUP_LABEL_FILE)));
- if (unlink(BACKUP_LABEL_FILE) != 0)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not remove file \"%s\": %m",
- BACKUP_LABEL_FILE)));
-
- /*
* Clean out any no-longer-needed history files. As a side effect, this
* will post a .ready file for the newly created history file, notifying
* the archiver that history file may be archived immediately.
@@ -8726,7 +9385,7 @@ pg_stop_backup(PG_FUNCTION_ARGS)
* wish to wait, you can set statement_timeout. Also, some notices are
* issued to clue in anyone who might be doing this interactively.
*/
- if (XLogArchivingActive())
+ if (waitforarchive && XLogArchivingActive())
{
XLByteToPrevSeg(stoppoint, _logId, _logSeg);
XLogFileName(lastxlogfilename, ThisTimeLineID, _logId, _logSeg);
@@ -8767,16 +9426,41 @@ pg_stop_backup(PG_FUNCTION_ARGS)
ereport(NOTICE,
(errmsg("pg_stop_backup complete, all required WAL segments have been archived")));
}
- else
+ else if (waitforarchive)
ereport(NOTICE,
(errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
/*
* We're done. As a convenience, return the ending WAL location.
*/
- snprintf(stopxlogfilename, sizeof(stopxlogfilename), "%X/%X",
- stoppoint.xlogid, stoppoint.xrecoff);
- PG_RETURN_TEXT_P(cstring_to_text(stopxlogfilename));
+ return stoppoint;
+}
+
+
+/*
+ * do_pg_abort_backup: abort a running backup
+ *
+ * This does just the most basic steps of do_pg_stop_backup(), by taking the
+ * system out of backup mode, thus making it a lot safer to call from
+ * an error handler.
+ *
+ * NB: This is only for aborting a non-exclusive backup that doesn't write
+ * backup_label. A backup started with pg_start_backup() needs to be finished
+ * with pg_stop_backup().
+ */
+void
+do_pg_abort_backup(void)
+{
+ LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
+ Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
+ XLogCtl->Insert.nonExclusiveBackups--;
+
+ if (!XLogCtl->Insert.exclusiveBackup &&
+ XLogCtl->Insert.nonExclusiveBackups == 0)
+ {
+ XLogCtl->Insert.forcePageWrites = false;
+ }
+ LWLockRelease(WALInsertLock);
}
/*
@@ -8810,6 +9494,51 @@ pg_switch_xlog(PG_FUNCTION_ARGS)
}
/*
+ * pg_create_restore_point: a named point for restore
+ */
+Datum
+pg_create_restore_point(PG_FUNCTION_ARGS)
+{
+ text *restore_name = PG_GETARG_TEXT_P(0);
+ char *restore_name_str;
+ XLogRecPtr restorepoint;
+ char location[MAXFNAMELEN];
+
+ if (!superuser())
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ (errmsg("must be superuser to create a restore point"))));
+
+ if (RecoveryInProgress())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ (errmsg("recovery is in progress"),
+ errhint("WAL control functions cannot be executed during recovery."))));
+
+ if (!XLogIsNeeded())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("WAL level not sufficient for creating a restore point"),
+ errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));
+
+ restore_name_str = text_to_cstring(restore_name);
+
+ if (strlen(restore_name_str) >= MAXFNAMELEN)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("value too long for restore point (maximum %d characters)", MAXFNAMELEN - 1)));
+
+ restorepoint = XLogRestorePoint(restore_name_str);
+
+ /*
+ * As a convenience, return the WAL location of the restore point record
+ */
+ snprintf(location, sizeof(location), "%X/%X",
+ restorepoint.xlogid, restorepoint.xrecoff);
+ PG_RETURN_TEXT_P(cstring_to_text(location));
+}
+
+/*
* Report the current WAL write location (same format as pg_start_backup etc)
*
* This is useful for determining how much of WAL is visible to an external
@@ -8895,6 +9624,25 @@ pg_last_xlog_receive_location(PG_FUNCTION_ARGS)
}
/*
+ * Get latest redo apply position.
+ *
+ * Exported to allow WALReceiver to read the pointer directly.
+ */
+XLogRecPtr
+GetXLogReplayRecPtr(void)
+{
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+ XLogRecPtr recptr;
+
+ SpinLockAcquire(&xlogctl->info_lck);
+ recptr = xlogctl->recoveryLastRecPtr;
+ SpinLockRelease(&xlogctl->info_lck);
+
+ return recptr;
+}
+
+/*
* Report the last WAL replay location (same format as pg_start_backup etc)
*
* This is useful for determining how much of WAL is visible to read-only
@@ -8903,14 +9651,10 @@ pg_last_xlog_receive_location(PG_FUNCTION_ARGS)
Datum
pg_last_xlog_replay_location(PG_FUNCTION_ARGS)
{
- /* use volatile pointer to prevent code rearrangement */
- volatile XLogCtlData *xlogctl = XLogCtl;
XLogRecPtr recptr;
char location[MAXFNAMELEN];
- SpinLockAcquire(&xlogctl->info_lck);
- recptr = xlogctl->recoveryLastRecPtr;
- SpinLockRelease(&xlogctl->info_lck);
+ recptr = GetXLogReplayRecPtr();
if (recptr.xlogid == 0 && recptr.xrecoff == 0)
PG_RETURN_NULL();
@@ -9214,11 +9958,27 @@ startupproc_quickdie(SIGNAL_ARGS)
}
+/* SIGUSR1: let latch facility handle the signal */
+static void
+StartupProcSigUsr1Handler(SIGNAL_ARGS)
+{
+ latch_sigusr1_handler();
+}
+
+/* SIGUSR2: set flag to finish recovery */
+static void
+StartupProcTriggerHandler(SIGNAL_ARGS)
+{
+ promote_triggered = true;
+ WakeupRecovery();
+}
+
/* SIGHUP: set flag to re-read config file at next convenient time */
static void
StartupProcSigHupHandler(SIGNAL_ARGS)
{
got_SIGHUP = true;
+ WakeupRecovery();
}
/* SIGTERM: set flag to abort redo and exit */
@@ -9229,6 +9989,7 @@ StartupProcShutdownHandler(SIGNAL_ARGS)
proc_exit(1);
else
shutdown_requested = true;
+ WakeupRecovery();
}
/* Handle SIGHUP and SIGTERM signals of startup process */
@@ -9288,8 +10049,8 @@ StartupProcessMain(void)
else
pqsignal(SIGALRM, SIG_IGN);
pqsignal(SIGPIPE, SIG_IGN);
- pqsignal(SIGUSR1, SIG_IGN);
- pqsignal(SIGUSR2, SIG_IGN);
+ pqsignal(SIGUSR1, StartupProcSigUsr1Handler);
+ pqsignal(SIGUSR2, StartupProcTriggerHandler);
/*
* Reset some signals that are accepted by postmaster but not here
@@ -9472,16 +10233,17 @@ retry:
}
/*
- * Data not here yet, so check for trigger then sleep.
+ * Data not here yet, so check for the trigger, then sleep for
+ * five seconds, as in the WAL file polling case below.
*/
if (CheckForStandbyTrigger())
- goto triggered;
+ goto retry;
/*
- * When streaming is active, we want to react quickly when
- * the next WAL record arrives, so sleep only a bit.
+ * Wait for more WAL to arrive, or timeout to be reached
*/
- pg_usleep(100000L); /* 100ms */
+ WaitLatch(&XLogCtl->recoveryWakeupLatch, 5000000L);
+ ResetLatch(&XLogCtl->recoveryWakeupLatch);
}
else
{
@@ -9510,13 +10272,24 @@ retry:
{
/*
* We've exhausted all options for retrieving the
- * file. Retry ...
+ * file. Retry.
*/
failedSources = 0;
/*
- * ... but sleep first if it hasn't been long since
- * last attempt.
+ * Before we sleep, re-scan for possible new timelines
+ * if we were requested to recover to the latest
+ * timeline.
+ */
+ if (recoveryTargetIsLatest)
+ {
+ if (rescanLatestTimeLine())
+ continue;
+ }
+
+ /*
+ * If it hasn't been long since the last attempt, sleep to
+ * avoid busy-waiting.
*/
now = (pg_time_t) time(NULL);
if ((now - last_fail_time) < 5)
@@ -9734,14 +10507,28 @@ emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
}
/*
- * Check to see if the trigger file exists. If it does, request postmaster
- * to shut down walreceiver, wait for it to exit, remove the trigger
- * file, and return true.
+ * Check to see whether the user-specified trigger file exists and whether a
+ * promote request has arrived. If either condition holds, request postmaster
+ * to shut down walreceiver, wait for it to exit, and return true.
*/
static bool
CheckForStandbyTrigger(void)
{
struct stat stat_buf;
+ static bool triggered = false;
+
+ if (triggered)
+ return true;
+
+ if (promote_triggered)
+ {
+ ereport(LOG,
+ (errmsg("received promote request")));
+ ShutdownWalRcv();
+ promote_triggered = false;
+ triggered = true;
+ return true;
+ }
if (TriggerFile == NULL)
return false;
@@ -9752,7 +10539,39 @@ CheckForStandbyTrigger(void)
(errmsg("trigger file found: %s", TriggerFile)));
ShutdownWalRcv();
unlink(TriggerFile);
+ triggered = true;
return true;
}
return false;
}
+
+/*
+ * Check to see if a promote request has arrived. Should be
+ * called by postmaster after receiving SIGUSR1.
+ */
+bool
+CheckPromoteSignal(void)
+{
+ struct stat stat_buf;
+
+ if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
+ {
+ /*
+ * Since we are in a signal handler, it's not safe to elog. We
+ * silently ignore any error from unlink.
+ */
+ unlink(PROMOTE_SIGNAL_FILE);
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Wake up startup process to replay newly arrived WAL, or to notice that
+ * failover has been requested.
+ */
+void
+WakeupRecovery(void)
+{
+ SetLatch(&XLogCtl->recoveryWakeupLatch);
+}
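The hunks above wire the startup process into the new latch facility: WakeupRecovery() calls SetLatch() (including from the signal handlers), the retry loop blocks in WaitLatch() with a five-second timeout instead of polling every 100ms, and ResetLatch() clears the latch before the next check. As a rough, self-contained analogue of that wait/wake pattern -- not PostgreSQL code, with purely illustrative names (demo_wait_latch, demo_set_latch) -- a condition-variable sketch might look like the following; the real latch additionally has to be safe to set from a signal handler, which this sketch does not attempt.

/* Minimal userland analogue of the SetLatch/WaitLatch/ResetLatch pattern
 * used above.  Illustrative only; names are hypothetical.
 * Build (assumed): cc -pthread latch_demo.c -o latch_demo
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static bool latch_is_set = false;	/* analogue of the latch's is_set flag */

/* Analogue of SetLatch(): post a wakeup for the waiter. */
static void
demo_set_latch(void)
{
	pthread_mutex_lock(&lock);
	latch_is_set = true;
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
}

/*
 * Analogue of WaitLatch(latch, timeout_usec) followed by ResetLatch():
 * sleep until woken or until the timeout expires, then clear the latch.
 * Returns true if a wakeup was posted, false on timeout.
 */
static bool
demo_wait_latch(long timeout_usec)
{
	struct timespec deadline;
	bool		woken;

	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += timeout_usec / 1000000L;
	deadline.tv_nsec += (timeout_usec % 1000000L) * 1000L;
	if (deadline.tv_nsec >= 1000000000L)
	{
		deadline.tv_sec++;
		deadline.tv_nsec -= 1000000000L;
	}

	pthread_mutex_lock(&lock);
	while (!latch_is_set)
	{
		if (pthread_cond_timedwait(&cond, &lock, &deadline) != 0)
			break;				/* timed out */
	}
	woken = latch_is_set;
	latch_is_set = false;		/* ResetLatch() */
	pthread_mutex_unlock(&lock);
	return woken;
}

/* Stand-in for whatever decides to wake the waiter (new WAL, promotion). */
static void *
setter(void *arg)
{
	struct timespec pause = {1, 0};

	nanosleep(&pause, NULL);
	demo_set_latch();
	return NULL;
}

int
main(void)
{
	pthread_t	tid;

	pthread_create(&tid, NULL, setter, NULL);

	/* Mirrors the 5-second WaitLatch in the retry loop above. */
	if (demo_wait_latch(5000000L))
		printf("woken: re-check for new WAL or a promote request\n");
	else
		printf("timed out: poll as before\n");

	pthread_join(tid, NULL);
	return 0;
}

For the other user-visible addition above, pg_create_restore_point() is invoked as an ordinary superuser-only SQL function, e.g. SELECT pg_create_restore_point('before_reload'); the label here is purely illustrative, and the function returns the WAL location of the restore-point record.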
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index 3d7c7cf69e..cbb61bb899 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -8,10 +8,10 @@
* None of this code is used during normal system operation.
*
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/backend/access/transam/xlogutils.c,v 1.71 2010/07/08 16:08:30 tgl Exp $
+ * src/backend/access/transam/xlogutils.c
*
*-------------------------------------------------------------------------
*/
@@ -68,7 +68,7 @@ log_invalid_page(RelFileNode node, ForkNumber forkno, BlockNumber blkno,
*/
if (log_min_messages <= DEBUG1 || client_min_messages <= DEBUG1)
{
- char *path = relpath(node, forkno);
+ char *path = relpathperm(node, forkno);
if (present)
elog(DEBUG1, "page %u of relation %s is uninitialized",
@@ -133,7 +133,7 @@ forget_invalid_pages(RelFileNode node, ForkNumber forkno, BlockNumber minblkno)
{
if (log_min_messages <= DEBUG2 || client_min_messages <= DEBUG2)
{
- char *path = relpath(hentry->key.node, forkno);
+ char *path = relpathperm(hentry->key.node, forkno);
elog(DEBUG2, "page %u of relation %s has been dropped",
hentry->key.blkno, path);
@@ -166,7 +166,7 @@ forget_invalid_pages_db(Oid dbid)
{
if (log_min_messages <= DEBUG2 || client_min_messages <= DEBUG2)
{
- char *path = relpath(hentry->key.node, hentry->key.forkno);
+ char *path = relpathperm(hentry->key.node, hentry->key.forkno);
elog(DEBUG2, "page %u of relation %s has been dropped",
hentry->key.blkno, path);
@@ -200,7 +200,7 @@ XLogCheckInvalidPages(void)
*/
while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
{
- char *path = relpath(hentry->key.node, hentry->key.forkno);
+ char *path = relpathperm(hentry->key.node, hentry->key.forkno);
if (hentry->present)
elog(WARNING, "page %u of relation %s was uninitialized",
@@ -276,7 +276,7 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
Assert(blkno != P_NEW);
/* Open the relation at smgr level */
- smgr = smgropen(rnode);
+ smgr = smgropen(rnode, InvalidBackendId);
/*
* Create the target file if it doesn't already exist. This lets us cope
@@ -293,7 +293,7 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
if (blkno < lastblock)
{
/* page exists in file */
- buffer = ReadBufferWithoutRelcache(rnode, false, forknum, blkno,
+ buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno,
mode, NULL);
}
else
@@ -312,7 +312,7 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
{
if (buffer != InvalidBuffer)
ReleaseBuffer(buffer);
- buffer = ReadBufferWithoutRelcache(rnode, false, forknum,
+ buffer = ReadBufferWithoutRelcache(rnode, forknum,
P_NEW, mode, NULL);
lastblock++;
}
@@ -377,6 +377,8 @@ CreateFakeRelcacheEntry(RelFileNode rnode)
rel->rd_rel = &fakeentry->pgc;
rel->rd_node = rnode;
+ /* We will never be working with temp rels during recovery */
+ rel->rd_backend = InvalidBackendId;
/* We don't know the name of the relation; use relfilenode instead */
sprintf(RelationGetRelationName(rel), "%u", rnode.relNode);
diff --git a/src/backend/bootstrap/.cvsignore b/src/backend/bootstrap/.cvsignore
deleted file mode 100644
index 5201be9886..0000000000
--- a/src/backend/bootstrap/.cvsignore
+++ /dev/null
@@ -1,2 +0,0 @@
-bootparse.c
-bootscanner.c
diff --git a/src/backend/bootstrap/.gitignore b/src/backend/bootstrap/.gitignore
new file mode 100644
index 0000000000..1ffe8ca39e
--- /dev/null
+++ b/src/backend/bootstrap/.gitignore
@@ -0,0 +1,2 @@
+/bootparse.c
+/bootscanner.c
diff --git a/src/backend/bootstrap/Makefile b/src/backend/bootstrap/Makefile
index f1dd7e12c3..a77d864800 100644
--- a/src/backend/bootstrap/Makefile
+++ b/src/backend/bootstrap/Makefile
@@ -2,7 +2,7 @@
#
# Makefile for the bootstrap module
#
-# $PostgreSQL: pgsql/src/backend/bootstrap/Makefile,v 1.39 2010/01/05 03:56:52 tgl Exp $
+# src/backend/bootstrap/Makefile
#
#-------------------------------------------------------------------------
@@ -12,7 +12,7 @@ include $(top_builddir)/src/Makefile.global
override CPPFLAGS := -I. -I$(srcdir) $(CPPFLAGS)
-OBJS= bootparse.o bootstrap.o
+OBJS= bootparse.o bootstrap.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/bootstrap/bootparse.y b/src/backend/bootstrap/bootparse.y
index 9cc68501ff..a9d2428698 100644
--- a/src/backend/bootstrap/bootparse.y
+++ b/src/backend/bootstrap/bootparse.y
@@ -4,12 +4,12 @@
* bootparse.y
* yacc grammar for the "bootstrap" mode (BKI file format)
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/bootstrap/bootparse.y,v 1.105 2010/02/07 20:48:09 tgl Exp $
+ * src/backend/bootstrap/bootparse.y
*
*-------------------------------------------------------------------------
*/
@@ -219,6 +219,7 @@ Boot_CreateStmt:
$3,
tupdesc,
RELKIND_RELATION,
+ RELPERSISTENCE_PERMANENT,
shared_relation,
mapped_relation,
true);
@@ -238,6 +239,7 @@ Boot_CreateStmt:
tupdesc,
NIL,
RELKIND_RELATION,
+ RELPERSISTENCE_PERMANENT,
shared_relation,
mapped_relation,
true,
@@ -342,6 +344,7 @@ boot_index_param:
n->name = $1;
n->expr = NULL;
n->indexcolname = NULL;
+ n->collation = NIL;
n->opclass = list_make1(makeString($2));
n->ordering = SORTBY_DEFAULT;
n->nulls_ordering = SORTBY_NULLS_DEFAULT;
diff --git a/src/backend/bootstrap/bootscanner.l b/src/backend/bootstrap/bootscanner.l
index f2c8f824bd..fba020008f 100644
--- a/src/backend/bootstrap/bootscanner.l
+++ b/src/backend/bootstrap/bootscanner.l
@@ -4,12 +4,12 @@
* bootscanner.l
* a lexical scanner for the bootstrap parser
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/bootstrap/bootscanner.l,v 1.51 2010/01/02 16:57:36 momjian Exp $
+ * src/backend/bootstrap/bootscanner.l
*
*-------------------------------------------------------------------------
*/
diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c
index 058542fecc..27e0e482c9 100644
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -4,12 +4,12 @@
* routines to support running postgres in 'bootstrap' mode
* bootstrap mode is used to create the initial template database
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2011 Nippon Telegraph and Telephone Corporation
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.261 2010/04/20 01:38:52 tgl Exp $
+ * src/backend/bootstrap/bootstrap.c
*
*-------------------------------------------------------------------------
*/
@@ -27,6 +27,7 @@
#include "access/xact.h"
#include "bootstrap/bootstrap.h"
#include "catalog/index.h"
+#include "catalog/pg_collation.h"
#include "catalog/pg_type.h"
#include "libpq/pqsignal.h"
#include "miscadmin.h"
@@ -77,7 +78,8 @@ int numattr; /* number of attributes for cur. rel */
/*
* Basic information associated with each type. This is used before
- * pg_type is created.
+ * pg_type is filled, so it has to cover the datatypes used as column types
+ * in the core "bootstrapped" catalogs.
*
* XXX several of these input/output functions do catalog scans
* (e.g., F_REGPROCIN scans pg_proc). This obviously creates some
@@ -92,54 +94,57 @@ struct typinfo
bool byval;
char align;
char storage;
+ Oid collation;
Oid inproc;
Oid outproc;
};
static const struct typinfo TypInfo[] = {
- {"bool", BOOLOID, 0, 1, true, 'c', 'p',
+ {"bool", BOOLOID, 0, 1, true, 'c', 'p', InvalidOid,
F_BOOLIN, F_BOOLOUT},
- {"bytea", BYTEAOID, 0, -1, false, 'i', 'x',
+ {"bytea", BYTEAOID, 0, -1, false, 'i', 'x', InvalidOid,
F_BYTEAIN, F_BYTEAOUT},
- {"char", CHAROID, 0, 1, true, 'c', 'p',
+ {"char", CHAROID, 0, 1, true, 'c', 'p', InvalidOid,
F_CHARIN, F_CHAROUT},
- {"int2", INT2OID, 0, 2, true, 's', 'p',
+ {"int2", INT2OID, 0, 2, true, 's', 'p', InvalidOid,
F_INT2IN, F_INT2OUT},
- {"int4", INT4OID, 0, 4, true, 'i', 'p',
+ {"int4", INT4OID, 0, 4, true, 'i', 'p', InvalidOid,
F_INT4IN, F_INT4OUT},
- {"float4", FLOAT4OID, 0, 4, FLOAT4PASSBYVAL, 'i', 'p',
+ {"float4", FLOAT4OID, 0, 4, FLOAT4PASSBYVAL, 'i', 'p', InvalidOid,
F_FLOAT4IN, F_FLOAT4OUT},
- {"name", NAMEOID, CHAROID, NAMEDATALEN, false, 'c', 'p',
+ {"name", NAMEOID, CHAROID, NAMEDATALEN, false, 'c', 'p', InvalidOid,
F_NAMEIN, F_NAMEOUT},
- {"regclass", REGCLASSOID, 0, 4, true, 'i', 'p',
+ {"regclass", REGCLASSOID, 0, 4, true, 'i', 'p', InvalidOid,
F_REGCLASSIN, F_REGCLASSOUT},
- {"regproc", REGPROCOID, 0, 4, true, 'i', 'p',
+ {"regproc", REGPROCOID, 0, 4, true, 'i', 'p', InvalidOid,
F_REGPROCIN, F_REGPROCOUT},
- {"regtype", REGTYPEOID, 0, 4, true, 'i', 'p',
+ {"regtype", REGTYPEOID, 0, 4, true, 'i', 'p', InvalidOid,
F_REGTYPEIN, F_REGTYPEOUT},
- {"text", TEXTOID, 0, -1, false, 'i', 'x',
+ {"text", TEXTOID, 0, -1, false, 'i', 'x', DEFAULT_COLLATION_OID,
F_TEXTIN, F_TEXTOUT},
- {"oid", OIDOID, 0, 4, true, 'i', 'p',
+ {"oid", OIDOID, 0, 4, true, 'i', 'p', InvalidOid,
F_OIDIN, F_OIDOUT},
- {"tid", TIDOID, 0, 6, false, 's', 'p',
+ {"tid", TIDOID, 0, 6, false, 's', 'p', InvalidOid,
F_TIDIN, F_TIDOUT},
- {"xid", XIDOID, 0, 4, true, 'i', 'p',
+ {"xid", XIDOID, 0, 4, true, 'i', 'p', InvalidOid,
F_XIDIN, F_XIDOUT},
- {"cid", CIDOID, 0, 4, true, 'i', 'p',
+ {"cid", CIDOID, 0, 4, true, 'i', 'p', InvalidOid,
F_CIDIN, F_CIDOUT},
- {"int2vector", INT2VECTOROID, INT2OID, -1, false, 'i', 'p',
+ {"pg_node_tree", PGNODETREEOID, 0, -1, false, 'i', 'x', DEFAULT_COLLATION_OID,
+ F_PG_NODE_TREE_IN, F_PG_NODE_TREE_OUT},
+ {"int2vector", INT2VECTOROID, INT2OID, -1, false, 'i', 'p', InvalidOid,
F_INT2VECTORIN, F_INT2VECTOROUT},
- {"oidvector", OIDVECTOROID, OIDOID, -1, false, 'i', 'p',
+ {"oidvector", OIDVECTOROID, OIDOID, -1, false, 'i', 'p', InvalidOid,
F_OIDVECTORIN, F_OIDVECTOROUT},
- {"_int4", INT4ARRAYOID, INT4OID, -1, false, 'i', 'x',
+ {"_int4", INT4ARRAYOID, INT4OID, -1, false, 'i', 'x', InvalidOid,
F_ARRAY_IN, F_ARRAY_OUT},
- {"_text", 1009, TEXTOID, -1, false, 'i', 'x',
+ {"_text", 1009, TEXTOID, -1, false, 'i', 'x', DEFAULT_COLLATION_OID,
F_ARRAY_IN, F_ARRAY_OUT},
- {"_oid", 1028, OIDOID, -1, false, 'i', 'x',
+ {"_oid", 1028, OIDOID, -1, false, 'i', 'x', InvalidOid,
F_ARRAY_IN, F_ARRAY_OUT},
- {"_char", 1002, CHAROID, -1, false, 'i', 'x',
+ {"_char", 1002, CHAROID, -1, false, 'i', 'x', InvalidOid,
F_ARRAY_IN, F_ARRAY_OUT},
- {"_aclitem", 1034, ACLITEMOID, -1, false, 'i', 'x',
+ {"_aclitem", 1034, ACLITEMOID, -1, false, 'i', 'x', InvalidOid,
F_ARRAY_IN, F_ARRAY_OUT}
};
@@ -724,6 +729,7 @@ DefineAttr(char *name, char *type, int attnum)
attrtypes[attnum]->attbyval = Ap->am_typ.typbyval;
attrtypes[attnum]->attstorage = Ap->am_typ.typstorage;
attrtypes[attnum]->attalign = Ap->am_typ.typalign;
+ attrtypes[attnum]->attcollation = Ap->am_typ.typcollation;
/* if an array type, assume 1-dimensional attribute */
if (Ap->am_typ.typelem != InvalidOid && Ap->am_typ.typlen < 0)
attrtypes[attnum]->attndims = 1;
@@ -737,6 +743,7 @@ DefineAttr(char *name, char *type, int attnum)
attrtypes[attnum]->attbyval = TypInfo[typeoid].byval;
attrtypes[attnum]->attstorage = TypInfo[typeoid].storage;
attrtypes[attnum]->attalign = TypInfo[typeoid].align;
+ attrtypes[attnum]->attcollation = TypInfo[typeoid].collation;
/* if an array type, assume 1-dimensional attribute */
if (TypInfo[typeoid].elem != InvalidOid &&
attrtypes[attnum]->attlen < 0)
@@ -1144,7 +1151,7 @@ build_indices(void)
heap = heap_open(ILHead->il_heap, NoLock);
ind = index_open(ILHead->il_ind, NoLock);
- index_build(heap, ind, ILHead->il_info, false);
+ index_build(heap, ind, ILHead->il_info, false, false);
index_close(ind, NoLock);
heap_close(heap, NoLock);
diff --git a/src/backend/catalog/.cvsignore b/src/backend/catalog/.cvsignore
deleted file mode 100644
index 242e32155f..0000000000
--- a/src/backend/catalog/.cvsignore
+++ /dev/null
@@ -1,4 +0,0 @@
-postgres.bki
-postgres.description
-postgres.shdescription
-schemapg.h
diff --git a/src/backend/catalog/.gitignore b/src/backend/catalog/.gitignore
new file mode 100644
index 0000000000..557af3c0e5
--- /dev/null
+++ b/src/backend/catalog/.gitignore
@@ -0,0 +1,4 @@
+/postgres.bki
+/postgres.description
+/postgres.shdescription
+/schemapg.h
diff --git a/src/backend/catalog/Catalog.pm b/src/backend/catalog/Catalog.pm
index bcd4e31fb9..8526d7d555 100644
--- a/src/backend/catalog/Catalog.pm
+++ b/src/backend/catalog/Catalog.pm
@@ -4,10 +4,10 @@
# Perl module that extracts info from catalog headers into Perl
# data structures
#
-# Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+# Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
# Portions Copyright (c) 1994, Regents of the University of California
#
-# $PostgreSQL: pgsql/src/backend/catalog/Catalog.pm,v 1.3 2010/01/05 20:23:32 tgl Exp $
+# src/backend/catalog/Catalog.pm
#
#----------------------------------------------------------------------
diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile
index a84d4563f5..82adb5be89 100644
--- a/src/backend/catalog/Makefile
+++ b/src/backend/catalog/Makefile
@@ -2,7 +2,7 @@
#
# Makefile for backend/catalog
#
-# $PostgreSQL: pgsql/src/backend/catalog/Makefile,v 1.78 2010/05/13 11:49:48 petere Exp $
+# src/backend/catalog/Makefile
#
#-------------------------------------------------------------------------
@@ -11,9 +11,10 @@ top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
OBJS = catalog.o dependency.o heap.o index.o indexing.o namespace.o aclchk.o \
- pg_aggregate.o pg_constraint.o pg_conversion.o pg_depend.o pg_enum.o \
- pg_inherits.o pg_largeobject.o pg_namespace.o pg_operator.o pg_proc.o \
- pg_db_role_setting.o pg_shdepend.o pg_type.o pgxc_class.o storage.o toasting.o
+ objectaddress.o pg_aggregate.o pg_collation.o pg_constraint.o pg_conversion.o \
+ pg_depend.o pg_enum.o pg_inherits.o pg_largeobject.o pg_namespace.o \
+ pg_operator.o pg_proc.o pg_db_role_setting.o pg_shdepend.o pg_type.o \
+ pgxc_class.o storage.o toasting.o
BKIFILES = postgres.bki postgres.description postgres.shdescription
@@ -35,10 +36,11 @@ POSTGRES_BKI_SRCS = $(addprefix $(top_srcdir)/src/include/catalog/,\
pg_database.h pg_db_role_setting.h pg_tablespace.h pg_pltemplate.h \
pg_authid.h pg_auth_members.h pg_shdepend.h pg_shdescription.h \
pg_ts_config.h pg_ts_config_map.h pg_ts_dict.h \
- pg_ts_parser.h pg_ts_template.h \
+ pg_ts_parser.h pg_ts_template.h pg_extension.h \
pg_foreign_data_wrapper.h pg_foreign_server.h pg_user_mapping.h \
pgxc_class.h \
- pg_default_acl.h \
+ pg_foreign_table.h \
+ pg_default_acl.h pg_seclabel.h pg_collation.h \
toasting.h indexing.h \
)
diff --git a/src/backend/catalog/README b/src/backend/catalog/README
index 6d7e0e23c1..fce01ea431 100644
--- a/src/backend/catalog/README
+++ b/src/backend/catalog/README
@@ -1,4 +1,4 @@
-$PostgreSQL: pgsql/src/backend/catalog/README,v 1.14 2010/01/05 01:06:56 tgl Exp $
+src/backend/catalog/README
System Catalog
==============
diff --git a/src/backend/catalog/aclchk.c b/src/backend/catalog/aclchk.c
index 0470a70273..df32731b87 100644
--- a/src/backend/catalog/aclchk.c
+++ b/src/backend/catalog/aclchk.c
@@ -3,12 +3,12 @@
* aclchk.c
* Routines to check access control permissions.
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyr