summaryrefslogtreecommitdiff
path: root/src/backend
diff options
context:
space:
mode:
authorPavan Deolasee2015-05-05 09:19:18 +0000
committerPavan Deolasee2015-05-05 09:19:18 +0000
commit73fa25c67cbfa24c03e28c96bf356f2592671730 (patch)
tree10ded7e26abd78d93658cb72fc5cb9d4672eff2a /src/backend
parentda4d108859bcd7a308ca75aba54281e32968822c (diff)
parent4a9ab6d8619817f9e3989c99b65140e19041dab7 (diff)
Merge branch 'XL_MASTER_MERGE_9_4' into XL_NEW_MASTER
Conflicts: src/test/regress/expected/aggregates.out src/test/regress/expected/create_index.out src/test/regress/expected/inherit.out src/test/regress/expected/join.out src/test/regress/expected/window.out src/test/regress/expected/with.out
Diffstat (limited to 'src/backend')
-rw-r--r--src/backend/Makefile50
-rw-r--r--src/backend/access/Makefile2
-rw-r--r--src/backend/access/common/heaptuple.c97
-rw-r--r--src/backend/access/common/indextuple.c18
-rw-r--r--src/backend/access/common/printtup.c127
-rw-r--r--src/backend/access/common/reloptions.c66
-rw-r--r--src/backend/access/common/scankey.c2
-rw-r--r--src/backend/access/common/tupconvert.c5
-rw-r--r--src/backend/access/common/tupdesc.c56
-rw-r--r--src/backend/access/gin/Makefile2
-rw-r--r--src/backend/access/gin/README166
-rw-r--r--src/backend/access/gin/ginarrayproc.c89
-rw-r--r--src/backend/access/gin/ginbtree.c755
-rw-r--r--src/backend/access/gin/ginbulk.c4
-rw-r--r--src/backend/access/gin/gindatapage.c1921
-rw-r--r--src/backend/access/gin/ginentrypage.c326
-rw-r--r--src/backend/access/gin/ginfast.c30
-rw-r--r--src/backend/access/gin/ginget.c936
-rw-r--r--src/backend/access/gin/gininsert.c207
-rw-r--r--src/backend/access/gin/ginlogic.c248
-rw-r--r--src/backend/access/gin/ginpostinglist.c424
-rw-r--r--src/backend/access/gin/ginscan.c6
-rw-r--r--src/backend/access/gin/ginutil.c34
-rw-r--r--src/backend/access/gin/ginvacuum.c354
-rw-r--r--src/backend/access/gin/ginxlog.c996
-rw-r--r--src/backend/access/gist/README2
-rw-r--r--src/backend/access/gist/gist.c109
-rw-r--r--src/backend/access/gist/gistbuild.c39
-rw-r--r--src/backend/access/gist/gistbuildbuffers.c91
-rw-r--r--src/backend/access/gist/gistget.c21
-rw-r--r--src/backend/access/gist/gistproc.c83
-rw-r--r--src/backend/access/gist/gistscan.c55
-rw-r--r--src/backend/access/gist/gistsplit.c515
-rw-r--r--src/backend/access/gist/gistutil.c247
-rw-r--r--src/backend/access/gist/gistvacuum.c9
-rw-r--r--src/backend/access/gist/gistxlog.c264
-rw-r--r--src/backend/access/hash/README133
-rw-r--r--src/backend/access/hash/hash.c23
-rw-r--r--src/backend/access/hash/hashfunc.c16
-rw-r--r--src/backend/access/hash/hashinsert.c61
-rw-r--r--src/backend/access/hash/hashovfl.c18
-rw-r--r--src/backend/access/hash/hashpage.c46
-rw-r--r--src/backend/access/hash/hashscan.c2
-rw-r--r--src/backend/access/hash/hashsearch.c65
-rw-r--r--src/backend/access/hash/hashsort.c11
-rw-r--r--src/backend/access/hash/hashutil.c6
-rw-r--r--src/backend/access/heap/README.HOT28
-rw-r--r--src/backend/access/heap/README.tuplock139
-rw-r--r--src/backend/access/heap/heapam.c4675
-rw-r--r--src/backend/access/heap/hio.c30
-rw-r--r--src/backend/access/heap/pruneheap.c88
-rw-r--r--src/backend/access/heap/rewriteheap.c662
-rw-r--r--src/backend/access/heap/syncscan.c8
-rw-r--r--src/backend/access/heap/tuptoaster.c483
-rw-r--r--src/backend/access/heap/visibilitymap.c83
-rw-r--r--src/backend/access/index/genam.c65
-rw-r--r--src/backend/access/index/indexam.c41
-rw-r--r--src/backend/access/nbtree/README184
-rw-r--r--src/backend/access/nbtree/nbtcompare.c4
-rw-r--r--src/backend/access/nbtree/nbtinsert.c443
-rw-r--r--src/backend/access/nbtree/nbtpage.c1082
-rw-r--r--src/backend/access/nbtree/nbtree.c111
-rw-r--r--src/backend/access/nbtree/nbtsearch.c98
-rw-r--r--src/backend/access/nbtree/nbtsort.c54
-rw-r--r--src/backend/access/nbtree/nbtutils.c119
-rw-r--r--src/backend/access/nbtree/nbtxlog.c1104
-rw-r--r--src/backend/access/rmgrdesc/Makefile15
-rw-r--r--src/backend/access/rmgrdesc/clogdesc.c41
-rw-r--r--src/backend/access/rmgrdesc/dbasedesc.c43
-rw-r--r--src/backend/access/rmgrdesc/gindesc.c188
-rw-r--r--src/backend/access/rmgrdesc/gistdesc.c68
-rw-r--r--src/backend/access/rmgrdesc/hashdesc.c22
-rw-r--r--src/backend/access/rmgrdesc/heapdesc.c202
-rw-r--r--src/backend/access/rmgrdesc/mxactdesc.c80
-rw-r--r--src/backend/access/rmgrdesc/nbtdesc.c173
-rw-r--r--src/backend/access/rmgrdesc/relmapdesc.c33
-rw-r--r--src/backend/access/rmgrdesc/seqdesc.c36
-rw-r--r--src/backend/access/rmgrdesc/smgrdesc.c45
-rw-r--r--src/backend/access/rmgrdesc/spgdesc.c89
-rw-r--r--src/backend/access/rmgrdesc/standbydesc.c65
-rw-r--r--src/backend/access/rmgrdesc/tblspcdesc.c40
-rw-r--r--src/backend/access/rmgrdesc/xactdesc.c197
-rw-r--r--src/backend/access/rmgrdesc/xlogdesc.c145
-rw-r--r--src/backend/access/spgist/README50
-rw-r--r--src/backend/access/spgist/spgdoinsert.c119
-rw-r--r--src/backend/access/spgist/spginsert.c42
-rw-r--r--src/backend/access/spgist/spgkdtreeproc.c2
-rw-r--r--src/backend/access/spgist/spgquadtreeproc.c4
-rw-r--r--src/backend/access/spgist/spgscan.c7
-rw-r--r--src/backend/access/spgist/spgtextproc.c142
-rw-r--r--src/backend/access/spgist/spgutils.c53
-rw-r--r--src/backend/access/spgist/spgvacuum.c34
-rw-r--r--src/backend/access/spgist/spgxlog.c491
-rw-r--r--src/backend/access/transam/Makefile3
-rw-r--r--src/backend/access/transam/README125
-rw-r--r--src/backend/access/transam/clog.c39
-rw-r--r--src/backend/access/transam/multixact.c1599
-rw-r--r--src/backend/access/transam/recovery.conf.sample23
-rw-r--r--src/backend/access/transam/rmgr.c29
-rw-r--r--src/backend/access/transam/slru.c137
-rw-r--r--src/backend/access/transam/subtrans.c6
-rw-r--r--src/backend/access/transam/timeline.c597
-rw-r--r--src/backend/access/transam/transam.c9
-rw-r--r--src/backend/access/transam/twophase.c369
-rw-r--r--src/backend/access/transam/twophase_rmgr.c2
-rw-r--r--src/backend/access/transam/varsup.c43
-rw-r--r--src/backend/access/transam/xact.c550
-rw-r--r--src/backend/access/transam/xlog.c7331
-rw-r--r--src/backend/access/transam/xlogarchive.c715
-rw-r--r--src/backend/access/transam/xlogfuncs.c253
-rw-r--r--src/backend/access/transam/xlogreader.c987
-rw-r--r--src/backend/access/transam/xlogutils.c28
-rw-r--r--src/backend/bootstrap/Makefile16
-rw-r--r--src/backend/bootstrap/bootparse.y86
-rw-r--r--src/backend/bootstrap/bootscanner.l10
-rw-r--r--src/backend/bootstrap/bootstrap.c94
-rw-r--r--src/backend/catalog/Catalog.pm310
-rw-r--r--src/backend/catalog/Makefile8
-rw-r--r--src/backend/catalog/README2
-rw-r--r--src/backend/catalog/aclchk.c150
-rw-r--r--src/backend/catalog/catalog.c347
-rw-r--r--src/backend/catalog/dependency.c1060
-rw-r--r--src/backend/catalog/genbki.pl585
-rw-r--r--src/backend/catalog/heap.c237
-rw-r--r--src/backend/catalog/index.c619
-rw-r--r--src/backend/catalog/indexing.c5
-rw-r--r--src/backend/catalog/information_schema.sql228
-rw-r--r--src/backend/catalog/namespace.c244
-rw-r--r--src/backend/catalog/objectaccess.c128
-rw-r--r--src/backend/catalog/objectaddress.c2360
-rw-r--r--src/backend/catalog/pg_aggregate.c435
-rw-r--r--src/backend/catalog/pg_collation.c10
-rw-r--r--src/backend/catalog/pg_constraint.c67
-rw-r--r--src/backend/catalog/pg_conversion.c10
-rw-r--r--src/backend/catalog/pg_db_role_setting.c18
-rw-r--r--src/backend/catalog/pg_depend.c29
-rw-r--r--src/backend/catalog/pg_enum.c57
-rw-r--r--src/backend/catalog/pg_inherits.c7
-rw-r--r--src/backend/catalog/pg_largeobject.c123
-rw-r--r--src/backend/catalog/pg_namespace.c6
-rw-r--r--src/backend/catalog/pg_operator.c27
-rw-r--r--src/backend/catalog/pg_proc.c72
-rw-r--r--src/backend/catalog/pg_range.c5
-rw-r--r--src/backend/catalog/pg_shdepend.c136
-rw-r--r--src/backend/catalog/pg_type.c16
-rw-r--r--src/backend/catalog/pgxc_class.c1
-rw-r--r--src/backend/catalog/sql_features.txt14
-rw-r--r--src/backend/catalog/storage.c104
-rw-r--r--src/backend/catalog/system_views.sql155
-rw-r--r--src/backend/catalog/toasting.c68
-rw-r--r--src/backend/commands/Makefile6
-rw-r--r--src/backend/commands/aggregatecmds.c333
-rw-r--r--src/backend/commands/alter.c843
-rw-r--r--src/backend/commands/analyze.c96
-rw-r--r--src/backend/commands/async.c179
-rw-r--r--src/backend/commands/cluster.c295
-rw-r--r--src/backend/commands/collationcmds.c236
-rw-r--r--src/backend/commands/comment.c35
-rw-r--r--src/backend/commands/constraint.c6
-rw-r--r--src/backend/commands/conversioncmds.c213
-rw-r--r--src/backend/commands/copy.c565
-rw-r--r--src/backend/commands/createas.c101
-rw-r--r--src/backend/commands/dbcommands.c159
-rw-r--r--src/backend/commands/define.c35
-rw-r--r--src/backend/commands/discard.c10
-rw-r--r--src/backend/commands/dropcmds.c266
-rw-r--r--src/backend/commands/event_trigger.c1301
-rw-r--r--src/backend/commands/explain.c475
-rw-r--r--src/backend/commands/extension.c368
-rw-r--r--src/backend/commands/foreigncmds.c163
-rw-r--r--src/backend/commands/functioncmds.c479
-rw-r--r--src/backend/commands/indexcmds.c374
-rw-r--r--src/backend/commands/lockcmds.c14
-rw-r--r--src/backend/commands/matview.c790
-rw-r--r--src/backend/commands/opclasscmds.c588
-rw-r--r--src/backend/commands/operatorcmds.c186
-rw-r--r--src/backend/commands/portalcmds.c45
-rw-r--r--src/backend/commands/prepare.c33
-rw-r--r--src/backend/commands/proclang.c184
-rw-r--r--src/backend/commands/schemacmds.c50
-rw-r--r--src/backend/commands/seclabel.c26
-rw-r--r--src/backend/commands/sequence.c368
-rw-r--r--src/backend/commands/tablecmds.c1740
-rw-r--r--src/backend/commands/tablespace.c487
-rw-r--r--src/backend/commands/trigger.c698
-rw-r--r--src/backend/commands/tsearchcmds.c555
-rw-r--r--src/backend/commands/typecmds.c365
-rw-r--r--src/backend/commands/user.c149
-rw-r--r--src/backend/commands/vacuum.c297
-rw-r--r--src/backend/commands/vacuumlazy.c575
-rw-r--r--src/backend/commands/variable.c122
-rw-r--r--src/backend/commands/view.c128
-rw-r--r--src/backend/common.mk2
-rw-r--r--src/backend/executor/execAmi.c7
-rw-r--r--src/backend/executor/execCurrent.c4
-rw-r--r--src/backend/executor/execGrouping.c2
-rw-r--r--src/backend/executor/execJunk.c4
-rw-r--r--src/backend/executor/execMain.c298
-rw-r--r--src/backend/executor/execProcnode.c7
-rw-r--r--src/backend/executor/execQual.c628
-rw-r--r--src/backend/executor/execScan.c6
-rw-r--r--src/backend/executor/execTuples.c92
-rw-r--r--src/backend/executor/execUtils.c70
-rw-r--r--src/backend/executor/functions.c140
-rw-r--r--src/backend/executor/instrument.c26
-rw-r--r--src/backend/executor/nodeAgg.c473
-rw-r--r--src/backend/executor/nodeAppend.c4
-rw-r--r--src/backend/executor/nodeBitmapAnd.c2
-rw-r--r--src/backend/executor/nodeBitmapHeapscan.c27
-rw-r--r--src/backend/executor/nodeBitmapIndexscan.c2
-rw-r--r--src/backend/executor/nodeBitmapOr.c2
-rw-r--r--src/backend/executor/nodeCtescan.c39
-rw-r--r--src/backend/executor/nodeForeignscan.c9
-rw-r--r--src/backend/executor/nodeFunctionscan.c509
-rw-r--r--src/backend/executor/nodeGroup.c5
-rw-r--r--src/backend/executor/nodeHash.c23
-rw-r--r--src/backend/executor/nodeHashjoin.c9
-rw-r--r--src/backend/executor/nodeIndexonlyscan.c58
-rw-r--r--src/backend/executor/nodeIndexscan.c8
-rw-r--r--src/backend/executor/nodeLimit.c4
-rw-r--r--src/backend/executor/nodeLockRows.c62
-rw-r--r--src/backend/executor/nodeMaterial.c4
-rw-r--r--src/backend/executor/nodeMergeAppend.c119
-rw-r--r--src/backend/executor/nodeMergejoin.c14
-rw-r--r--src/backend/executor/nodeModifyTable.c395
-rw-r--r--src/backend/executor/nodeNestloop.c2
-rw-r--r--src/backend/executor/nodeRecursiveunion.c4
-rw-r--r--src/backend/executor/nodeResult.c2
-rw-r--r--src/backend/executor/nodeSeqscan.c16
-rw-r--r--src/backend/executor/nodeSetOp.c9
-rw-r--r--src/backend/executor/nodeSort.c2
-rw-r--r--src/backend/executor/nodeSubplan.c49
-rw-r--r--src/backend/executor/nodeSubqueryscan.c4
-rw-r--r--src/backend/executor/nodeTidscan.c4
-rw-r--r--src/backend/executor/nodeUnique.c4
-rw-r--r--src/backend/executor/nodeValuesscan.c4
-rw-r--r--src/backend/executor/nodeWindowAgg.c804
-rw-r--r--src/backend/executor/nodeWorktablescan.c4
-rw-r--r--src/backend/executor/spi.c417
-rw-r--r--src/backend/executor/tstoreReceiver.c4
-rw-r--r--src/backend/foreign/foreign.c58
-rw-r--r--src/backend/lib/Makefile2
-rw-r--r--src/backend/lib/binaryheap.c307
-rw-r--r--src/backend/lib/dllist.c214
-rw-r--r--src/backend/lib/ilist.c114
-rw-r--r--src/backend/lib/stringinfo.c70
-rw-r--r--src/backend/libpq/auth.c379
-rw-r--r--src/backend/libpq/be-fsstubs.c349
-rw-r--r--src/backend/libpq/be-secure.c157
-rw-r--r--src/backend/libpq/crypt.c16
-rw-r--r--src/backend/libpq/hba.c537
-rw-r--r--src/backend/libpq/ip.c17
-rw-r--r--src/backend/libpq/md5.c4
-rw-r--r--src/backend/libpq/pg_hba.conf.sample2
-rw-r--r--src/backend/libpq/pqcomm.c112
-rw-r--r--src/backend/libpq/pqformat.c4
-rw-r--r--src/backend/libpq/pqsignal.c69
-rw-r--r--src/backend/main/main.c122
-rw-r--r--src/backend/nls.mk9
-rw-r--r--src/backend/nodes/bitmapset.c25
-rw-r--r--src/backend/nodes/copyfuncs.c203
-rw-r--r--src/backend/nodes/equalfuncs.c280
-rw-r--r--src/backend/nodes/list.c34
-rw-r--r--src/backend/nodes/makefuncs.c55
-rw-r--r--src/backend/nodes/nodeFuncs.c143
-rw-r--r--src/backend/nodes/nodes.c2
-rw-r--r--src/backend/nodes/outfuncs.c241
-rw-r--r--src/backend/nodes/params.c4
-rw-r--r--src/backend/nodes/print.c3
-rw-r--r--src/backend/nodes/read.c12
-rw-r--r--src/backend/nodes/readfuncs.c98
-rw-r--r--src/backend/nodes/tidbitmap.c26
-rw-r--r--src/backend/nodes/value.c2
-rw-r--r--src/backend/optimizer/README64
-rw-r--r--src/backend/optimizer/geqo/geqo_copy.c2
-rw-r--r--src/backend/optimizer/geqo/geqo_cx.c1
-rw-r--r--src/backend/optimizer/geqo/geqo_eval.c19
-rw-r--r--src/backend/optimizer/geqo/geqo_main.c2
-rw-r--r--src/backend/optimizer/geqo/geqo_misc.c2
-rw-r--r--src/backend/optimizer/geqo/geqo_pool.c2
-rw-r--r--src/backend/optimizer/geqo/geqo_px.c1
-rw-r--r--src/backend/optimizer/geqo/geqo_random.c2
-rw-r--r--src/backend/optimizer/geqo/geqo_selection.c2
-rw-r--r--src/backend/optimizer/path/Makefile2
-rw-r--r--src/backend/optimizer/path/allpaths.c602
-rw-r--r--src/backend/optimizer/path/clausesel.c32
-rw-r--r--src/backend/optimizer/path/costsize.c216
-rw-r--r--src/backend/optimizer/path/equivclass.c250
-rw-r--r--src/backend/optimizer/path/indxpath.c775
-rw-r--r--src/backend/optimizer/path/joinpath.c234
-rw-r--r--src/backend/optimizer/path/joinrels.c163
-rw-r--r--src/backend/optimizer/path/orindxpath.c187
-rw-r--r--src/backend/optimizer/path/pathkeys.c274
-rw-r--r--src/backend/optimizer/path/tidpath.c17
-rw-r--r--src/backend/optimizer/plan/analyzejoins.c57
-rw-r--r--src/backend/optimizer/plan/createplan.c436
-rw-r--r--src/backend/optimizer/plan/initsplan.c743
-rw-r--r--src/backend/optimizer/plan/planagg.c154
-rw-r--r--src/backend/optimizer/plan/planmain.c309
-rw-r--r--src/backend/optimizer/plan/planner.c1032
-rw-r--r--src/backend/optimizer/plan/setrefs.c273
-rw-r--r--src/backend/optimizer/plan/subselect.c395
-rw-r--r--src/backend/optimizer/prep/Makefile2
-rw-r--r--src/backend/optimizer/prep/prepjointree.c565
-rw-r--r--src/backend/optimizer/prep/prepqual.c91
-rw-r--r--src/backend/optimizer/prep/prepsecurity.c470
-rw-r--r--src/backend/optimizer/prep/preptlist.c17
-rw-r--r--src/backend/optimizer/prep/prepunion.c96
-rw-r--r--src/backend/optimizer/util/Makefile4
-rw-r--r--src/backend/optimizer/util/clauses.c594
-rw-r--r--src/backend/optimizer/util/joininfo.c4
-rw-r--r--src/backend/optimizer/util/orclauses.c343
-rw-r--r--src/backend/optimizer/util/pathnode.c398
-rw-r--r--src/backend/optimizer/util/placeholder.c214
-rw-r--r--src/backend/optimizer/util/plancat.c74
-rw-r--r--src/backend/optimizer/util/predtest.c24
-rw-r--r--src/backend/optimizer/util/relnode.c86
-rw-r--r--src/backend/optimizer/util/restrictinfo.c214
-rw-r--r--src/backend/optimizer/util/tlist.c70
-rw-r--r--src/backend/optimizer/util/var.c377
-rw-r--r--src/backend/parser/Makefile23
-rw-r--r--src/backend/parser/analyze.c674
-rw-r--r--src/backend/parser/check_keywords.pl235
-rw-r--r--src/backend/parser/gram.y2251
-rw-r--r--src/backend/parser/keywords.c2
-rw-r--r--src/backend/parser/kwlookup.c4
-rw-r--r--src/backend/parser/parse_agg.c1006
-rw-r--r--src/backend/parser/parse_clause.c840
-rw-r--r--src/backend/parser/parse_coerce.c57
-rw-r--r--src/backend/parser/parse_collate.c462
-rw-r--r--src/backend/parser/parse_cte.c25
-rw-r--r--src/backend/parser/parse_expr.c422
-rw-r--r--src/backend/parser/parse_func.c500
-rw-r--r--src/backend/parser/parse_node.c40
-rw-r--r--src/backend/parser/parse_oper.c19
-rw-r--r--src/backend/parser/parse_param.c40
-rw-r--r--src/backend/parser/parse_relation.c1120
-rw-r--r--src/backend/parser/parse_target.c158
-rw-r--r--src/backend/parser/parse_type.c139
-rw-r--r--src/backend/parser/parse_utilcmd.c364
-rw-r--r--src/backend/parser/parser.c9
-rw-r--r--src/backend/parser/scan.l112
-rw-r--r--src/backend/parser/scansup.c12
-rw-r--r--src/backend/pgxc/cluster/pause.c2
-rw-r--r--src/backend/pgxc/copy/remotecopy.c1
-rw-r--r--src/backend/pgxc/locator/locator.c2
-rw-r--r--src/backend/pgxc/nodemgr/groupmgr.c2
-rw-r--r--src/backend/pgxc/nodemgr/nodemgr.c3
-rw-r--r--src/backend/pgxc/pool/execRemote.c4
-rw-r--r--src/backend/pgxc/pool/poolcomm.c2
-rw-r--r--src/backend/pgxc/pool/poolmgr.c121
-rw-r--r--src/backend/pgxc/pool/poolutils.c1
-rw-r--r--src/backend/pgxc/squeue/squeue.c1
-rw-r--r--src/backend/po/de.po20682
-rw-r--r--src/backend/po/es.po20193
-rw-r--r--src/backend/po/fr.po22948
-rw-r--r--src/backend/po/it.po19668
-rw-r--r--src/backend/po/ja.po27624
-rw-r--r--src/backend/po/pl.po20997
-rw-r--r--src/backend/po/pt_BR.po26658
-rw-r--r--src/backend/po/ru.po22189
-rw-r--r--src/backend/po/tr.po16230
-rw-r--r--src/backend/po/zh_CN.po16292
-rw-r--r--src/backend/po/zh_TW.po5
-rw-r--r--src/backend/port/Makefile6
-rw-r--r--src/backend/port/darwin/system.c2
-rw-r--r--src/backend/port/dynloader/aix.h2
-rw-r--r--src/backend/port/dynloader/cygwin.h2
-rw-r--r--src/backend/port/dynloader/darwin.c2
-rw-r--r--src/backend/port/dynloader/freebsd.c4
-rw-r--r--src/backend/port/dynloader/freebsd.h2
-rw-r--r--src/backend/port/dynloader/hpux.c2
-rw-r--r--src/backend/port/dynloader/hpux.h2
-rw-r--r--src/backend/port/dynloader/irix.c6
-rw-r--r--src/backend/port/dynloader/irix.h46
-rw-r--r--src/backend/port/dynloader/linux.c2
-rw-r--r--src/backend/port/dynloader/linux.h2
-rw-r--r--src/backend/port/dynloader/netbsd.c4
-rw-r--r--src/backend/port/dynloader/netbsd.h2
-rw-r--r--src/backend/port/dynloader/openbsd.c4
-rw-r--r--src/backend/port/dynloader/openbsd.h2
-rw-r--r--src/backend/port/dynloader/osf.h2
-rw-r--r--src/backend/port/dynloader/sco.h2
-rw-r--r--src/backend/port/dynloader/solaris.h2
-rw-r--r--src/backend/port/dynloader/unixware.h2
-rw-r--r--src/backend/port/ipc_test.c335
-rw-r--r--src/backend/port/posix_sema.c4
-rw-r--r--src/backend/port/sysv_sema.c32
-rw-r--r--src/backend/port/sysv_shmem.c245
-rw-r--r--src/backend/port/tas/sunstudio_sparc.s2
-rw-r--r--src/backend/port/tas/sunstudio_x86.s2
-rw-r--r--src/backend/port/unix_latch.c283
-rw-r--r--src/backend/port/win32/crashdump.c2
-rw-r--r--src/backend/port/win32/mingwcompat.c2
-rw-r--r--src/backend/port/win32/security.c2
-rw-r--r--src/backend/port/win32/signal.c10
-rw-r--r--src/backend/port/win32/socket.c8
-rw-r--r--src/backend/port/win32/timer.c4
-rw-r--r--src/backend/port/win32_latch.c54
-rw-r--r--src/backend/port/win32_sema.c2
-rw-r--r--src/backend/port/win32_shmem.c20
-rw-r--r--src/backend/postmaster/Makefile4
-rw-r--r--src/backend/postmaster/autovacuum.c401
-rw-r--r--src/backend/postmaster/bgworker.c1014
-rw-r--r--src/backend/postmaster/bgwriter.c99
-rw-r--r--src/backend/postmaster/checkpointer.c112
-rw-r--r--src/backend/postmaster/fork_process.c18
-rw-r--r--src/backend/postmaster/pgarch.c46
-rw-r--r--src/backend/postmaster/pgstat.c1230
-rw-r--r--src/backend/postmaster/postmaster.c1663
-rw-r--r--src/backend/postmaster/startup.c28
-rw-r--r--src/backend/postmaster/syslogger.c133
-rw-r--r--src/backend/postmaster/walwriter.c16
-rw-r--r--src/backend/regex/Makefile2
-rw-r--r--src/backend/regex/README11
-rw-r--r--src/backend/regex/regc_color.c21
-rw-r--r--src/backend/regex/regc_cvec.c2
-rw-r--r--src/backend/regex/regc_lex.c6
-rw-r--r--src/backend/regex/regc_locale.c6
-rw-r--r--src/backend/regex/regc_nfa.c362
-rw-r--r--src/backend/regex/regc_pg_locale.c14
-rw-r--r--src/backend/regex/regcomp.c70
-rw-r--r--src/backend/regex/rege_dfa.c13
-rw-r--r--src/backend/regex/regerror.c8
-rw-r--r--src/backend/regex/regexec.c43
-rw-r--r--src/backend/regex/regexport.c292
-rw-r--r--src/backend/regex/regfree.c2
-rw-r--r--src/backend/regex/regprefix.c259
-rw-r--r--src/backend/replication/.gitignore1
-rw-r--r--src/backend/replication/Makefile20
-rw-r--r--src/backend/replication/README11
-rw-r--r--src/backend/replication/basebackup.c780
-rw-r--r--src/backend/replication/libpqwalreceiver/libpqwalreceiver.c241
-rw-r--r--src/backend/replication/logical/Makefile19
-rw-r--r--src/backend/replication/logical/decode.c846
-rw-r--r--src/backend/replication/logical/logical.c934
-rw-r--r--src/backend/replication/logical/logicalfuncs.c514
-rw-r--r--src/backend/replication/logical/reorderbuffer.c3047
-rw-r--r--src/backend/replication/logical/snapbuild.c1887
-rw-r--r--src/backend/replication/repl_gram.y191
-rw-r--r--src/backend/replication/repl_scanner.l86
-rw-r--r--src/backend/replication/slot.c1233
-rw-r--r--src/backend/replication/slotfuncs.c272
-rw-r--r--src/backend/replication/syncrep.c50
-rw-r--r--src/backend/replication/walreceiver.c748
-rw-r--r--src/backend/replication/walreceiverfuncs.c117
-rw-r--r--src/backend/replication/walsender.c2358
-rw-r--r--src/backend/rewrite/rewriteDefine.c288
-rw-r--r--src/backend/rewrite/rewriteHandler.c1463
-rw-r--r--src/backend/rewrite/rewriteManip.c307
-rw-r--r--src/backend/rewrite/rewriteRemove.c5
-rw-r--r--src/backend/rewrite/rewriteSupport.c18
-rw-r--r--src/backend/snowball/Makefile4
-rw-r--r--src/backend/snowball/dict_snowball.c12
-rw-r--r--src/backend/storage/buffer/README17
-rw-r--r--src/backend/storage/buffer/buf_init.c6
-rw-r--r--src/backend/storage/buffer/buf_table.c6
-rw-r--r--src/backend/storage/buffer/bufmgr.c461
-rw-r--r--src/backend/storage/buffer/freelist.c17
-rw-r--r--src/backend/storage/buffer/localbuf.c37
-rw-r--r--src/backend/storage/file/buffile.c31
-rw-r--r--src/backend/storage/file/copydir.c90
-rw-r--r--src/backend/storage/file/fd.c569
-rw-r--r--src/backend/storage/file/reinit.c4
-rw-r--r--src/backend/storage/freespace/README4
-rw-r--r--src/backend/storage/freespace/freespace.c25
-rw-r--r--src/backend/storage/freespace/fsmpage.c8
-rw-r--r--src/backend/storage/freespace/indexfsm.c2
-rw-r--r--src/backend/storage/ipc/Makefile4
-rw-r--r--src/backend/storage/ipc/dsm.c1010
-rw-r--r--src/backend/storage/ipc/dsm_impl.c1036
-rw-r--r--src/backend/storage/ipc/ipc.c127
-rw-r--r--src/backend/storage/ipc/ipci.c29
-rw-r--r--src/backend/storage/ipc/pmsignal.c8
-rw-r--r--src/backend/storage/ipc/procarray.c563
-rw-r--r--src/backend/storage/ipc/procsignal.c23
-rw-r--r--src/backend/storage/ipc/shm_mq.c1001
-rw-r--r--src/backend/storage/ipc/shm_toc.c246
-rw-r--r--src/backend/storage/ipc/shmem.c53
-rw-r--r--src/backend/storage/ipc/shmqueue.c4
-rw-r--r--src/backend/storage/ipc/sinval.c36
-rw-r--r--src/backend/storage/ipc/sinvaladt.c58
-rw-r--r--src/backend/storage/ipc/standby.c230
-rw-r--r--src/backend/storage/large_object/inv_api.c250
-rw-r--r--src/backend/storage/lmgr/README61
-rw-r--r--src/backend/storage/lmgr/deadlock.c68
-rw-r--r--src/backend/storage/lmgr/lmgr.c197
-rw-r--r--src/backend/storage/lmgr/lock.c621
-rw-r--r--src/backend/storage/lmgr/lwlock.c688
-rw-r--r--src/backend/storage/lmgr/predicate.c141
-rw-r--r--src/backend/storage/lmgr/proc.c746
-rw-r--r--src/backend/storage/lmgr/s_lock.c59
-rw-r--r--src/backend/storage/lmgr/spin.c49
-rw-r--r--src/backend/storage/page/Makefile5
-rw-r--r--src/backend/storage/page/README63
-rw-r--r--src/backend/storage/page/bufpage.c186
-rw-r--r--src/backend/storage/page/checksum.c23
-rw-r--r--src/backend/storage/page/itemptr.c2
-rw-r--r--src/backend/storage/smgr/README39
-rw-r--r--src/backend/storage/smgr/md.c592
-rw-r--r--src/backend/storage/smgr/smgr.c221
-rw-r--r--src/backend/storage/smgr/smgrtype.c2
-rw-r--r--src/backend/tcop/dest.c9
-rw-r--r--src/backend/tcop/fastpath.c16
-rw-r--r--src/backend/tcop/postgres.c272
-rw-r--r--src/backend/tcop/pquery.c60
-rw-r--r--src/backend/tcop/utility.c2743
-rw-r--r--src/backend/tsearch/Makefile2
-rw-r--r--src/backend/tsearch/dict.c2
-rw-r--r--src/backend/tsearch/dict_ispell.c11
-rw-r--r--src/backend/tsearch/dict_simple.c2
-rw-r--r--src/backend/tsearch/dict_synonym.c2
-rw-r--r--src/backend/tsearch/dict_thesaurus.c4
-rw-r--r--src/backend/tsearch/regis.c2
-rw-r--r--src/backend/tsearch/spell.c6
-rw-r--r--src/backend/tsearch/to_tsany.c23
-rw-r--r--src/backend/tsearch/ts_locale.c11
-rw-r--r--src/backend/tsearch/ts_parse.c4
-rw-r--r--src/backend/tsearch/ts_selfuncs.c47
-rw-r--r--src/backend/tsearch/ts_typanalyze.c16
-rw-r--r--src/backend/tsearch/ts_utils.c20
-rw-r--r--src/backend/tsearch/wparser.c2
-rw-r--r--src/backend/tsearch/wparser_def.c50
-rw-r--r--src/backend/utils/Gen_dummy_probes.sed2
-rw-r--r--src/backend/utils/Gen_fmgrtab.pl115
-rw-r--r--src/backend/utils/adt/Makefile28
-rw-r--r--src/backend/utils/adt/acl.c77
-rw-r--r--src/backend/utils/adt/array_selfuncs.c40
-rw-r--r--src/backend/utils/adt/array_typanalyze.c28
-rw-r--r--src/backend/utils/adt/array_userfuncs.c6
-rw-r--r--src/backend/utils/adt/arrayfuncs.c364
-rw-r--r--src/backend/utils/adt/arrayutils.c4
-rw-r--r--src/backend/utils/adt/ascii.c2
-rw-r--r--src/backend/utils/adt/bool.c111
-rw-r--r--src/backend/utils/adt/cash.c114
-rw-r--r--src/backend/utils/adt/char.c4
-rw-r--r--src/backend/utils/adt/date.c100
-rw-r--r--src/backend/utils/adt/datetime.c152
-rw-r--r--src/backend/utils/adt/datum.c4
-rw-r--r--src/backend/utils/adt/dbsize.c96
-rw-r--r--src/backend/utils/adt/domains.c61
-rw-r--r--src/backend/utils/adt/encode.c2
-rw-r--r--src/backend/utils/adt/enum.c3
-rw-r--r--src/backend/utils/adt/float.c163
-rw-r--r--src/backend/utils/adt/format_type.c64
-rw-r--r--src/backend/utils/adt/formatting.c422
-rw-r--r--src/backend/utils/adt/genfile.c3
-rw-r--r--src/backend/utils/adt/geo_ops.c312
-rw-r--r--src/backend/utils/adt/geo_selfuncs.c6
-rw-r--r--src/backend/utils/adt/inet_cidr_ntop.c2
-rw-r--r--src/backend/utils/adt/int.c165
-rw-r--r--src/backend/utils/adt/int8.c205
-rw-r--r--src/backend/utils/adt/json.c2086
-rw-r--r--src/backend/utils/adt/jsonb.c464
-rw-r--r--src/backend/utils/adt/jsonb_gin.c626
-rw-r--r--src/backend/utils/adt/jsonb_op.c294
-rw-r--r--src/backend/utils/adt/jsonb_util.c1587
-rw-r--r--src/backend/utils/adt/jsonfuncs.c2987
-rw-r--r--src/backend/utils/adt/like.c6
-rw-r--r--src/backend/utils/adt/like_match.c4
-rw-r--r--src/backend/utils/adt/lockfuncs.c3
-rw-r--r--src/backend/utils/adt/misc.c162
-rw-r--r--src/backend/utils/adt/nabstime.c58
-rw-r--r--src/backend/utils/adt/name.c2
-rw-r--r--src/backend/utils/adt/network.c147
-rw-r--r--src/backend/utils/adt/network_gist.c789
-rw-r--r--src/backend/utils/adt/network_selfuncs.c32
-rw-r--r--src/backend/utils/adt/numeric.c1012
-rw-r--r--src/backend/utils/adt/numutils.c2
-rw-r--r--src/backend/utils/adt/oid.c4
-rw-r--r--src/backend/utils/adt/oracle_compat.c27
-rw-r--r--src/backend/utils/adt/orderedsetaggs.c1380
-rw-r--r--src/backend/utils/adt/pg_locale.c150
-rw-r--r--src/backend/utils/adt/pg_lsn.c206
-rw-r--r--src/backend/utils/adt/pg_lzcompress.c105
-rw-r--r--src/backend/utils/adt/pgstatfuncs.c139
-rw-r--r--src/backend/utils/adt/pseudotypes.c36
-rw-r--r--src/backend/utils/adt/quote.c2
-rw-r--r--src/backend/utils/adt/rangetypes.c379
-rw-r--r--src/backend/utils/adt/rangetypes_gist.c192
-rw-r--r--src/backend/utils/adt/rangetypes_selfuncs.c1146
-rw-r--r--src/backend/utils/adt/rangetypes_spgist.c896
-rw-r--r--src/backend/utils/adt/rangetypes_typanalyze.c355
-rw-r--r--src/backend/utils/adt/regexp.c99
-rw-r--r--src/backend/utils/adt/regproc.c268
-rw-r--r--src/backend/utils/adt/ri_triggers.c2222
-rw-r--r--src/backend/utils/adt/rowtypes.c602
-rw-r--r--src/backend/utils/adt/ruleutils.c3256
-rw-r--r--src/backend/utils/adt/selfuncs.c1073
-rw-r--r--src/backend/utils/adt/tid.c14
-rw-r--r--src/backend/utils/adt/timestamp.c602
-rw-r--r--src/backend/utils/adt/trigfuncs.c3
-rw-r--r--src/backend/utils/adt/tsginidx.c115
-rw-r--r--src/backend/utils/adt/tsgistidx.c80
-rw-r--r--src/backend/utils/adt/tsquery.c12
-rw-r--r--src/backend/utils/adt/tsquery_cleanup.c2
-rw-r--r--src/backend/utils/adt/tsquery_gist.c10
-rw-r--r--src/backend/utils/adt/tsquery_op.c2
-rw-r--r--src/backend/utils/adt/tsquery_rewrite.c4
-rw-r--r--src/backend/utils/adt/tsquery_util.c7
-rw-r--r--src/backend/utils/adt/tsrank.c41
-rw-r--r--src/backend/utils/adt/tsvector.c7
-rw-r--r--src/backend/utils/adt/tsvector_op.c12
-rw-r--r--src/backend/utils/adt/tsvector_parser.c4
-rw-r--r--src/backend/utils/adt/txid.c84
-rw-r--r--src/backend/utils/adt/uuid.c2
-rw-r--r--src/backend/utils/adt/varbit.c36
-rw-r--r--src/backend/utils/adt/varchar.c6
-rw-r--r--src/backend/utils/adt/varlena.c736
-rw-r--r--src/backend/utils/adt/version.c2
-rw-r--r--src/backend/utils/adt/windowfuncs.c4
-rw-r--r--src/backend/utils/adt/xid.c2
-rw-r--r--src/backend/utils/adt/xml.c233
-rw-r--r--src/backend/utils/cache/Makefile5
-rw-r--r--src/backend/utils/cache/attoptcache.c5
-rw-r--r--src/backend/utils/cache/catcache.c220
-rw-r--r--src/backend/utils/cache/evtcache.c270
-rw-r--r--src/backend/utils/cache/inval.c144
-rw-r--r--src/backend/utils/cache/lsyscache.c33
-rw-r--r--src/backend/utils/cache/plancache.c400
-rw-r--r--src/backend/utils/cache/relcache.c1003
-rw-r--r--src/backend/utils/cache/relfilenodemap.c261
-rw-r--r--src/backend/utils/cache/relmapper.c110
-rw-r--r--src/backend/utils/cache/spccache.c9
-rw-r--r--src/backend/utils/cache/syscache.c233
-rw-r--r--src/backend/utils/cache/ts_cache.c15
-rw-r--r--src/backend/utils/cache/typcache.c26
-rw-r--r--src/backend/utils/errcodes.txt2
-rw-r--r--src/backend/utils/error/assert.c2
-rw-r--r--src/backend/utils/error/elog.c924
-rw-r--r--src/backend/utils/fmgr/README2
-rw-r--r--src/backend/utils/fmgr/dfmgr.c22
-rw-r--r--src/backend/utils/fmgr/fmgr.c141
-rw-r--r--src/backend/utils/fmgr/funcapi.c16
-rw-r--r--src/backend/utils/generate-errcodes.pl37
-rw-r--r--src/backend/utils/hash/dynahash.c276
-rw-r--r--src/backend/utils/hash/hashfn.c2
-rw-r--r--src/backend/utils/init/globals.c25
-rw-r--r--src/backend/utils/init/miscinit.c283
-rw-r--r--src/backend/utils/init/postinit.c220
-rw-r--r--src/backend/utils/mb/Unicode/Makefile2
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_BIG5.pl134
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl78
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl281
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl174
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_EUC_KR.pl78
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_EUC_TW.pl99
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_GB18030.pl78
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl213
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_SJIS.pl125
-rw-r--r--src/backend/utils/mb/Unicode/UCS_to_most.pl144
-rw-r--r--src/backend/utils/mb/Unicode/ucs2utf.pl40
-rw-r--r--src/backend/utils/mb/conv.c17
-rw-r--r--src/backend/utils/mb/conversion_procs/Makefile4
-rw-r--r--src/backend/utils/mb/conversion_procs/ascii_and_mic/ascii_and_mic.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/euc2004_sjis2004/euc2004_sjis2004.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/euc_cn_and_mic/euc_cn_and_mic.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/euc_tw_and_big5/big5.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c19
-rw-r--r--src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/latin_and_mic/latin_and_mic.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_ascii/utf8_and_ascii.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_euc2004/utf8_and_euc2004.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_sjis2004/utf8_and_sjis2004.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c2
-rw-r--r--src/backend/utils/mb/conversion_procs/utf8_and_win/utf8_and_win.c2
-rw-r--r--src/backend/utils/mb/encnames.c66
-rw-r--r--src/backend/utils/mb/mbutils.c372
-rw-r--r--src/backend/utils/mb/wchar.c282
-rw-r--r--src/backend/utils/mb/wstrcmp.c2
-rw-r--r--src/backend/utils/mb/wstrncmp.c2
-rw-r--r--src/backend/utils/misc/Makefile11
-rw-r--r--src/backend/utils/misc/guc-file.l243
-rw-r--r--src/backend/utils/misc/guc.c1105
-rw-r--r--src/backend/utils/misc/help_config.c6
-rw-r--r--src/backend/utils/misc/pg_rusage.c2
-rw-r--r--src/backend/utils/misc/postgresql.conf.sample68
-rw-r--r--src/backend/utils/misc/ps_status.c8
-rw-r--r--src/backend/utils/misc/rbtree.c14
-rw-r--r--src/backend/utils/misc/superuser.c3
-rw-r--r--src/backend/utils/misc/timeout.c690
-rw-r--r--src/backend/utils/misc/tzparser.c6
-rw-r--r--src/backend/utils/mmgr/README6
-rw-r--r--src/backend/utils/mmgr/aset.c281
-rw-r--r--src/backend/utils/mmgr/mcxt.c266
-rw-r--r--src/backend/utils/mmgr/portalmem.c39
-rw-r--r--src/backend/utils/probes.d19
-rw-r--r--src/backend/utils/resowner/resowner.c221
-rw-r--r--src/backend/utils/sort/gen_qsort_tuple.pl12
-rw-r--r--src/backend/utils/sort/logtape.c32
-rw-r--r--src/backend/utils/sort/sortsupport.c49
-rw-r--r--src/backend/utils/sort/tuplesort.c306
-rw-r--r--src/backend/utils/sort/tuplestore.c251
-rw-r--r--src/backend/utils/time/combocid.c27
-rw-r--r--src/backend/utils/time/snapmgr.c205
-rw-r--r--src/backend/utils/time/tqual.c977
711 files changed, 252214 insertions, 117889 deletions
diff --git a/src/backend/Makefile b/src/backend/Makefile
index 611d29fa87..3d5d3eae30 100644
--- a/src/backend/Makefile
+++ b/src/backend/Makefile
@@ -2,7 +2,7 @@
#
# Makefile for the postgres backend
#
-# Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+# Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
# Portions Copyright (c) 1994, Regents of the University of California
#
# src/backend/Makefile
@@ -51,10 +51,12 @@ OBJS = $(SUBDIROBJS) $(LOCALOBJS) \
$(top_builddir)/src/port/libpgport_srv.a \
$(top_builddir)/src/gtm/client/libgtmclient.a \
$(top_builddir)/src/gtm/common/libgtm.a \
- $(top_builddir)/src/gtm/libpq/libpqcomm.a
+ $(top_builddir)/src/gtm/libpq/libpqcomm.a \
+ $(top_builddir)/src/common/libpgcommon_srv.a
-# We put libpgport into OBJS, so remove it from LIBS; also add libldap
-LIBS := $(filter-out -lpgport, $(LIBS)) $(LDAP_LIBS_BE)
+# We put libpgport and libpgcommon into OBJS, so remove it from LIBS; also add
+# libldap
+LIBS := $(filter-out -lpgport -lpgcommon, $(LIBS)) $(LDAP_LIBS_BE)
# The backend doesn't need everything that's in LIBS, however
LIBS := $(filter-out -lz -lreadline -ledit -ltermcap -lncurses -lcurses, $(LIBS))
@@ -78,36 +80,26 @@ endif
ifeq ($(PORTNAME), cygwin)
-postgres: $(OBJS) postgres.def libpostgres.a
- $(DLLTOOL) --dllname $@$(X) --output-exp $@.exp --def postgres.def
- $(CC) $(CFLAGS) $(LDFLAGS) $(LDFLAGS_EX) -o $@$(X) -Wl,--base-file,$@.base $@.exp $(call expand_subsys,$(OBJS)) $(LIBS)
- $(DLLTOOL) --dllname $@$(X) --base-file $@.base --output-exp $@.exp --def postgres.def
- $(CC) $(CFLAGS) $(LDFLAGS) $(LDFLAGS_EX) -Wl,--stack,$(WIN32_STACK_RLIMIT) -o $@$(X) $@.exp $(call expand_subsys,$(OBJS)) $(LIBS)
- rm -f $@.exp $@.base
+postgres: $(OBJS)
+ $(CC) $(CFLAGS) $(LDFLAGS) $(LDFLAGS_EX) $(export_dynamic) -Wl,--stack,$(WIN32_STACK_RLIMIT) -Wl,--export-all-symbols -Wl,--out-implib=libpostgres.a $(call expand_subsys,$^) $(LIBS) -o $@
-postgres.def: $(OBJS)
- $(DLLTOOL) --export-all --output-def $@ $(call expand_subsys,$^)
+# There is no correct way to write a rule that generates two files.
+# Rules with two targets don't have that meaning, they are merely
+# shorthand for two otherwise separate rules. To be safe for parallel
+# make, we must chain the dependencies like this. The semicolon is
+# important, otherwise make will choose some built-in rule.
-libpostgres.a: postgres.def
- $(DLLTOOL) --dllname postgres.exe --def postgres.def --output-lib $@
+libpostgres.a: postgres ;
endif # cygwin
ifeq ($(PORTNAME), win32)
LIBS += -lsecur32
-postgres: $(OBJS) postgres.def libpostgres.a $(WIN32RES)
- $(DLLTOOL) --dllname $@$(X) --output-exp $@.exp --def postgres.def
- $(CC) $(CFLAGS) $(LDFLAGS) $(LDFLAGS_EX) -o $@$(X) -Wl,--base-file,$@.base $@.exp $(call expand_subsys,$(OBJS)) $(WIN32RES) $(LIBS)
- $(DLLTOOL) --dllname $@$(X) --base-file $@.base --output-exp $@.exp --def postgres.def
- $(CC) $(CFLAGS) $(LDFLAGS) $(LDFLAGS_EX) -Wl,--stack=$(WIN32_STACK_RLIMIT) -o $@$(X) $@.exp $(call expand_subsys,$(OBJS)) $(WIN32RES) $(LIBS)
- rm -f $@.exp $@.base
-
-postgres.def: $(OBJS)
- $(DLLTOOL) --export-all --output-def $@ $(call expand_subsys,$^)
+postgres: $(OBJS) $(WIN32RES)
+ $(CC) $(CFLAGS) $(LDFLAGS) $(LDFLAGS_EX) -Wl,--stack=$(WIN32_STACK_RLIMIT) -Wl,--export-all-symbols -Wl,--out-implib=libpostgres.a $(call expand_subsys,$(OBJS)) $(WIN32RES) $(LIBS) -o $@$(X)
-libpostgres.a: postgres.def
- $(DLLTOOL) --dllname postgres.exe --def postgres.def --output-lib $@
+libpostgres.a: postgres ;
endif # win32
@@ -253,6 +245,7 @@ else
endif
ifeq ($(MAKE_EXPORTS), true)
$(INSTALL_DATA) $(POSTGRES_IMP) '$(DESTDIR)$(pkglibdir)/$(POSTGRES_IMP)'
+ $(INSTALL_PROGRAM) $(MKLDEXPORT) '$(DESTDIR)$(pgxsdir)/$(MKLDEXPORT_DIR)/mkldexport.sh'
endif
.PHONY: install-bin
@@ -271,6 +264,7 @@ endif
endif
ifeq ($(MAKE_EXPORTS), true)
$(MKDIR_P) '$(DESTDIR)$(pkglibdir)'
+ $(MKDIR_P) '$(DESTDIR)$(pgxsdir)/$(MKLDEXPORT_DIR)'
endif
@@ -280,6 +274,7 @@ uninstall:
rm -f '$(DESTDIR)$(bindir)/postgres$(X)' '$(DESTDIR)$(bindir)/postmaster'
ifeq ($(MAKE_EXPORTS), true)
rm -f '$(DESTDIR)$(pkglibdir)/$(POSTGRES_IMP)'
+ rm -f '$(DESTDIR)$(pgxsdir)/$(MKLDEXPORT_DIR)/mkldexport.sh'
endif
ifeq ($(PORTNAME), cygwin)
ifeq ($(MAKE_DLL), true)
@@ -308,10 +303,10 @@ clean:
$(top_builddir)/src/include/utils/fmgroids.h \
$(top_builddir)/src/include/utils/probes.h
ifeq ($(PORTNAME), cygwin)
- rm -f postgres.dll postgres.def libpostgres.a
+ rm -f postgres.dll libpostgres.a
endif
ifeq ($(PORTNAME), win32)
- rm -f postgres.dll postgres.def libpostgres.a $(WIN32RES)
+ rm -f postgres.dll libpostgres.a $(WIN32RES)
endif
distclean: clean
@@ -329,7 +324,6 @@ maintainer-clean: distclean
catalog/postgres.description \
catalog/postgres.shdescription \
replication/repl_gram.c \
- replication/repl_gram.h \
replication/repl_scanner.c \
utils/fmgroids.h \
utils/fmgrtab.c \
diff --git a/src/backend/access/Makefile b/src/backend/access/Makefile
index 0366d59624..c32088f81d 100644
--- a/src/backend/access/Makefile
+++ b/src/backend/access/Makefile
@@ -8,6 +8,6 @@ subdir = src/backend/access
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
-SUBDIRS = common gist hash heap index nbtree transam gin spgist
+SUBDIRS = common gin gist hash heap index nbtree rmgrdesc spgist transam
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c
index 62479c04bb..b81bddf3c1 100644
--- a/src/backend/access/common/heaptuple.c
+++ b/src/backend/access/common/heaptuple.c
@@ -21,7 +21,7 @@
* tuptoaster.c.
*
* This change will break any code that assumes it needn't detoast values
- * that have been put into a tuple but never sent to disk. Hopefully there
+ * that have been put into a tuple but never sent to disk. Hopefully there
* are few such places.
*
* Varlenas still have alignment 'i' (or 'd') in pg_type/pg_attribute, since
@@ -50,7 +50,7 @@
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*
* Portions Copyright (c) 2012-2014, TransLattice, Inc.
- * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@@ -69,6 +69,7 @@
#include "access/tuptoaster.h"
#include "executor/tuptable.h"
#ifdef XCP
+#include "lib/stringinfo.h"
#include "utils/memutils.h"
#endif
@@ -400,7 +401,7 @@ nocachegetattr(HeapTuple tuple,
/*
* Otherwise, check for non-fixed-length attrs up to and including
- * target. If there aren't any, it's safe to cheaply initialize the
+ * target. If there aren't any, it's safe to cheaply initialize the
* cached offsets for these attrs.
*/
if (HeapTupleHasVarWidth(tuple))
@@ -467,7 +468,7 @@ nocachegetattr(HeapTuple tuple,
*
* Note - This loop is a little tricky. For each non-null attribute,
* we have to first account for alignment padding before the attr,
- * then advance over the attr based on its length. Nulls have no
+ * then advance over the attr based on its length. Nulls have no
* storage and no alignment padding either. We can use/set
* attcacheoff until we reach either a null or a var-width attribute.
*/
@@ -552,17 +553,17 @@ heap_getsysattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
result = ObjectIdGetDatum(HeapTupleGetOid(tup));
break;
case MinTransactionIdAttributeNumber:
- result = TransactionIdGetDatum(HeapTupleHeaderGetXmin(tup->t_data));
+ result = TransactionIdGetDatum(HeapTupleHeaderGetRawXmin(tup->t_data));
break;
case MaxTransactionIdAttributeNumber:
- result = TransactionIdGetDatum(HeapTupleHeaderGetXmax(tup->t_data));
+ result = TransactionIdGetDatum(HeapTupleHeaderGetRawXmax(tup->t_data));
break;
case MinCommandIdAttributeNumber:
case MaxCommandIdAttributeNumber:
/*
* cmin and cmax are now both aliases for the same field, which
- * can in fact also be a combo command id. XXX perhaps we should
+ * can in fact also be a combo command id. XXX perhaps we should
* return the "real" cmin or cmax if possible, that is if we are
* inside the originating transaction?
*/
@@ -641,6 +642,41 @@ heap_copytuple_with_tuple(HeapTuple src, HeapTuple dest)
memcpy((char *) dest->t_data, (char *) src->t_data, src->t_len);
}
+/* ----------------
+ * heap_copy_tuple_as_datum
+ *
+ * copy a tuple as a composite-type Datum
+ * ----------------
+ */
+Datum
+heap_copy_tuple_as_datum(HeapTuple tuple, TupleDesc tupleDesc)
+{
+ HeapTupleHeader td;
+
+ /*
+ * If the tuple contains any external TOAST pointers, we have to inline
+ * those fields to meet the conventions for composite-type Datums.
+ */
+ if (HeapTupleHasExternal(tuple))
+ return toast_flatten_tuple_to_datum(tuple->t_data,
+ tuple->t_len,
+ tupleDesc);
+
+ /*
+ * Fast path for easy case: just make a palloc'd copy and insert the
+ * correct composite-Datum header fields (since those may not be set if
+ * the given tuple came from disk, rather than from heap_form_tuple).
+ */
+ td = (HeapTupleHeader) palloc(tuple->t_len);
+ memcpy((char *) td, (char *) tuple->t_data, tuple->t_len);
+
+ HeapTupleHeaderSetDatumLength(td, tuple->t_len);
+ HeapTupleHeaderSetTypeId(td, tupleDesc->tdtypeid);
+ HeapTupleHeaderSetTypMod(td, tupleDesc->tdtypmod);
+
+ return PointerGetDatum(td);
+}
+
/*
* heap_form_tuple
* construct a tuple from the given values[] and isnull[] arrays,
@@ -659,7 +695,6 @@ heap_form_tuple(TupleDesc tupleDescriptor,
data_len;
int hoff;
bool hasnull = false;
- Form_pg_attribute *att = tupleDescriptor->attrs;
int numberOfAttributes = tupleDescriptor->natts;
int i;
@@ -670,28 +705,14 @@ heap_form_tuple(TupleDesc tupleDescriptor,
numberOfAttributes, MaxTupleAttributeNumber)));
/*
- * Check for nulls and embedded tuples; expand any toasted attributes in
- * embedded tuples. This preserves the invariant that toasting can only
- * go one level deep.
- *
- * We can skip calling toast_flatten_tuple_attribute() if the attribute
- * couldn't possibly be of composite type. All composite datums are
- * varlena and have alignment 'd'; furthermore they aren't arrays. Also,
- * if an attribute is already toasted, it must have been sent to disk
- * already and so cannot contain toasted attributes.
+ * Check for nulls
*/
for (i = 0; i < numberOfAttributes; i++)
{
if (isnull[i])
- hasnull = true;
- else if (att[i]->attlen == -1 &&
- att[i]->attalign == 'd' &&
- att[i]->attndims == 0 &&
- !VARATT_IS_EXTENDED(DatumGetPointer(values[i])))
{
- values[i] = toast_flatten_tuple_attribute(values[i],
- att[i]->atttypid,
- att[i]->atttypmod);
+ hasnull = true;
+ break;
}
}
@@ -713,7 +734,7 @@ heap_form_tuple(TupleDesc tupleDescriptor,
len += data_len;
/*
- * Allocate and zero the space needed. Note that the tuple body and
+ * Allocate and zero the space needed. Note that the tuple body and
* HeapTupleData management structure are allocated in one chunk.
*/
tuple = (HeapTuple) palloc0(HEAPTUPLESIZE + len);
@@ -721,7 +742,8 @@ heap_form_tuple(TupleDesc tupleDescriptor,
/*
* And fill in the information. Note we fill the Datum fields even though
- * this tuple may never become a Datum.
+ * this tuple may never become a Datum. This lets HeapTupleHeaderGetDatum
+ * identify the tuple type if needed.
*/
tuple->t_len = len;
ItemPointerSetInvalid(&(tuple->t_self));
@@ -1641,7 +1663,6 @@ heap_form_minimal_tuple(TupleDesc tupleDescriptor,
data_len;
int hoff;
bool hasnull = false;
- Form_pg_attribute *att = tupleDescriptor->attrs;
int numberOfAttributes = tupleDescriptor->natts;
int i;
@@ -1652,28 +1673,14 @@ heap_form_minimal_tuple(TupleDesc tupleDescriptor,
numberOfAttributes, MaxTupleAttributeNumber)));
/*
- * Check for nulls and embedded tuples; expand any toasted attributes in
- * embedded tuples. This preserves the invariant that toasting can only
- * go one level deep.
- *
- * We can skip calling toast_flatten_tuple_attribute() if the attribute
- * couldn't possibly be of composite type. All composite datums are
- * varlena and have alignment 'd'; furthermore they aren't arrays. Also,
- * if an attribute is already toasted, it must have been sent to disk
- * already and so cannot contain toasted attributes.
+ * Check for nulls
*/
for (i = 0; i < numberOfAttributes; i++)
{
if (isnull[i])
- hasnull = true;
- else if (att[i]->attlen == -1 &&
- att[i]->attalign == 'd' &&
- att[i]->attndims == 0 &&
- !VARATT_IS_EXTENDED(values[i]))
{
- values[i] = toast_flatten_tuple_attribute(values[i],
- att[i]->atttypid,
- att[i]->atttypmod);
+ hasnull = true;
+ break;
}
}
diff --git a/src/backend/access/common/indextuple.c b/src/backend/access/common/indextuple.c
index 76c76e9e89..5fd400990b 100644
--- a/src/backend/access/common/indextuple.c
+++ b/src/backend/access/common/indextuple.c
@@ -4,7 +4,7 @@
* This file contains index tuple accessor and mutator routines,
* as well as various tuple utilities.
*
- * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@@ -71,7 +71,7 @@ index_form_tuple(TupleDesc tupleDescriptor,
/*
* If value is stored EXTERNAL, must fetch it so we are not depending
- * on outside storage. This should be improved someday.
+ * on outside storage. This should be improved someday.
*/
if (VARATT_IS_EXTERNAL(DatumGetPointer(values[i])))
{
@@ -158,6 +158,11 @@ index_form_tuple(TupleDesc tupleDescriptor,
if (tupmask & HEAP_HASVARWIDTH)
infomask |= INDEX_VAR_MASK;
+ /* Also assert we got rid of external attributes */
+#ifdef TOAST_INDEX_HACK
+ Assert((tupmask & HEAP_HASEXTERNAL) == 0);
+#endif
+
/*
* Here we make sure that the size will fit in the field reserved for it
* in t_info.
@@ -165,9 +170,8 @@ index_form_tuple(TupleDesc tupleDescriptor,
if ((size & INDEX_SIZE_MASK) != size)
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
- errmsg("index row requires %lu bytes, maximum size is %lu",
- (unsigned long) size,
- (unsigned long) INDEX_SIZE_MASK)));
+ errmsg("index row requires %zu bytes, maximum size is %zu",
+ size, (Size) INDEX_SIZE_MASK)));
infomask |= size;
@@ -276,7 +280,7 @@ nocache_index_getattr(IndexTuple tup,
/*
* Otherwise, check for non-fixed-length attrs up to and including
- * target. If there aren't any, it's safe to cheaply initialize the
+ * target. If there aren't any, it's safe to cheaply initialize the
* cached offsets for these attrs.
*/
if (IndexTupleHasVarwidths(tup))
@@ -343,7 +347,7 @@ nocache_index_getattr(IndexTuple tup,
*
* Note - This loop is a little tricky. For each non-null attribute,
* we have to first account for alignment padding before the attr,
- * then advance over the attr based on its length. Nulls have no
+ * then advance over the attr based on its length. Nulls have no
* storage and no alignment padding either. We can use/set
* attcacheoff until we reach either a null or a var-width attribute.
*/
diff --git a/src/backend/access/common/printtup.c b/src/backend/access/common/printtup.c
index 10a45a3146..d033cdd09a 100644
--- a/src/backend/access/common/printtup.c
+++ b/src/backend/access/common/printtup.c
@@ -10,7 +10,7 @@
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*
* Portions Copyright (c) 2012-2014, TransLattice, Inc.
- * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -28,6 +28,9 @@
#ifdef PGXC
#include "pgxc/pgxc.h"
#endif
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+
static void printtup_startup(DestReceiver *self, int operation,
TupleDesc typeinfo);
@@ -67,6 +70,7 @@ typedef struct
TupleDesc attrinfo; /* The attr info we are set up for */
int nattrs;
PrinttupAttrInfo *myinfo; /* Cached info about each attr */
+ MemoryContext tmpcontext; /* Memory context for per-row workspace */
} DR_printtup;
/* ----------------
@@ -93,6 +97,7 @@ printtup_create_DR(CommandDest dest)
self->attrinfo = NULL;
self->nattrs = 0;
self->myinfo = NULL;
+ self->tmpcontext = NULL;
return (DestReceiver *) self;
}
@@ -130,6 +135,18 @@ printtup_startup(DestReceiver *self, int operation, TupleDesc typeinfo)
DR_printtup *myState = (DR_printtup *) self;
Portal portal = myState->portal;
+ /*
+ * Create a temporary memory context that we can reset once per row to
+ * recover palloc'd memory. This avoids any problems with leaks inside
+ * datatype output routines, and should be faster than retail pfree's
+ * anyway.
+ */
+ myState->tmpcontext = AllocSetContextCreate(CurrentMemoryContext,
+ "printtup",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+
if (PG_PROTOCOL_MAJOR(FrontendProtocol) < 3)
{
/*
@@ -173,7 +190,7 @@ printtup_startup(DestReceiver *self, int operation, TupleDesc typeinfo)
* or some similar function; it does not contain a full set of fields.
* The targetlist will be NIL when executing a utility function that does
* not have a plan. If the targetlist isn't NIL then it is a Query node's
- * targetlist; it is up to us to ignore resjunk columns in it. The formats[]
+ * targetlist; it is up to us to ignore resjunk columns in it. The formats[]
* array pointer might be NULL (if we are doing Describe on a prepared stmt);
* send zeroes for the format codes in that case.
*/
@@ -309,6 +326,7 @@ printtup(TupleTableSlot *slot, DestReceiver *self)
{
TupleDesc typeinfo = slot->tts_tupleDescriptor;
DR_printtup *myState = (DR_printtup *) self;
+ MemoryContext oldcontext;
StringInfoData buf;
int natts = typeinfo->natts;
int i;
@@ -341,8 +359,11 @@ printtup(TupleTableSlot *slot, DestReceiver *self)
/* Make sure the tuple is fully deconstructed */
slot_getallattrs(slot);
+ /* Switch into per-row context so we can recover memory below */
+ oldcontext = MemoryContextSwitchTo(myState->tmpcontext);
+
/*
- * Prepare a DataRow message
+ * Prepare a DataRow message (note buffer is in per-row context)
*/
pq_beginmessage(&buf, 'D');
@@ -354,8 +375,7 @@ printtup(TupleTableSlot *slot, DestReceiver *self)
for (i = 0; i < natts; ++i)
{
PrinttupAttrInfo *thisState = myState->myinfo + i;
- Datum origattr = slot->tts_values[i],
- attr;
+ Datum attr = slot->tts_values[i];
if (slot->tts_isnull[i])
{
@@ -364,13 +384,15 @@ printtup(TupleTableSlot *slot, DestReceiver *self)
}
/*
- * If we have a toasted datum, forcibly detoast it here to avoid
- * memory leakage inside the type's output routine.
+ * Here we catch undefined bytes in datums that are returned to the
+ * client without hitting disk; see comments at the related check in
+ * PageAddItem(). This test is most useful for uncompressed,
+ * non-external datums, but we're quite likely to see such here when
+ * testing new C functions.
*/
if (thisState->typisvarlena)
- attr = PointerGetDatum(PG_DETOAST_DATUM(origattr));
- else
- attr = origattr;
+ VALGRIND_CHECK_MEM_IS_DEFINED(DatumGetPointer(attr),
+ VARSIZE_ANY(attr));
if (thisState->format == 0)
{
@@ -379,7 +401,6 @@ printtup(TupleTableSlot *slot, DestReceiver *self)
outputstr = OutputFunctionCall(&thisState->finfo, attr);
pq_sendcountedtext(&buf, outputstr, strlen(outputstr), false);
- pfree(outputstr);
}
else
{
@@ -390,15 +411,14 @@ printtup(TupleTableSlot *slot, DestReceiver *self)
pq_sendint(&buf, VARSIZE(outputbytes) - VARHDRSZ, 4);
pq_sendbytes(&buf, VARDATA(outputbytes),
VARSIZE(outputbytes) - VARHDRSZ);
- pfree(outputbytes);
}
-
- /* Clean up detoasted copy, if any */
- if (DatumGetPointer(attr) != DatumGetPointer(origattr))
- pfree(DatumGetPointer(attr));
}
pq_endmessage(&buf);
+
+ /* Return to caller's context, and flush row's temporary memory */
+ MemoryContextSwitchTo(oldcontext);
+ MemoryContextReset(myState->tmpcontext);
}
/* ----------------
@@ -410,6 +430,7 @@ printtup_20(TupleTableSlot *slot, DestReceiver *self)
{
TupleDesc typeinfo = slot->tts_tupleDescriptor;
DR_printtup *myState = (DR_printtup *) self;
+ MemoryContext oldcontext;
StringInfoData buf;
int natts = typeinfo->natts;
int i,
@@ -423,6 +444,9 @@ printtup_20(TupleTableSlot *slot, DestReceiver *self)
/* Make sure the tuple is fully deconstructed */
slot_getallattrs(slot);
+ /* Switch into per-row context so we can recover memory below */
+ oldcontext = MemoryContextSwitchTo(myState->tmpcontext);
+
/*
* tell the frontend to expect new tuple data (in ASCII style)
*/
@@ -454,8 +478,7 @@ printtup_20(TupleTableSlot *slot, DestReceiver *self)
for (i = 0; i < natts; ++i)
{
PrinttupAttrInfo *thisState = myState->myinfo + i;
- Datum origattr = slot->tts_values[i],
- attr;
+ Datum attr = slot->tts_values[i];
char *outputstr;
if (slot->tts_isnull[i])
@@ -463,25 +486,15 @@ printtup_20(TupleTableSlot *slot, DestReceiver *self)
Assert(thisState->format == 0);
- /*
- * If we have a toasted datum, forcibly detoast it here to avoid
- * memory leakage inside the type's output routine.
- */
- if (thisState->typisvarlena)
- attr = PointerGetDatum(PG_DETOAST_DATUM(origattr));
- else
- attr = origattr;
-
outputstr = OutputFunctionCall(&thisState->finfo, attr);
pq_sendcountedtext(&buf, outputstr, strlen(outputstr), true);
- pfree(outputstr);
-
- /* Clean up detoasted copy, if any */
- if (DatumGetPointer(attr) != DatumGetPointer(origattr))
- pfree(DatumGetPointer(attr));
}
pq_endmessage(&buf);
+
+ /* Return to caller's context, and flush row's temporary memory */
+ MemoryContextSwitchTo(oldcontext);
+ MemoryContextReset(myState->tmpcontext);
}
/* ----------------
@@ -498,6 +511,10 @@ printtup_shutdown(DestReceiver *self)
myState->myinfo = NULL;
myState->attrinfo = NULL;
+
+ if (myState->tmpcontext)
+ MemoryContextDelete(myState->tmpcontext);
+ myState->tmpcontext = NULL;
}
/* ----------------
@@ -560,8 +577,7 @@ debugtup(TupleTableSlot *slot, DestReceiver *self)
TupleDesc typeinfo = slot->tts_tupleDescriptor;
int natts = typeinfo->natts;
int i;
- Datum origattr,
- attr;
+ Datum attr;
char *value;
bool isnull;
Oid typoutput;
@@ -569,30 +585,15 @@ debugtup(TupleTableSlot *slot, DestReceiver *self)
for (i = 0; i < natts; ++i)
{
- origattr = slot_getattr(slot, i + 1, &isnull);
+ attr = slot_getattr(slot, i + 1, &isnull);
if (isnull)
continue;
getTypeOutputInfo(typeinfo->attrs[i]->atttypid,
&typoutput, &typisvarlena);
- /*
- * If we have a toasted datum, forcibly detoast it here to avoid
- * memory leakage inside the type's output routine.
- */
- if (typisvarlena)
- attr = PointerGetDatum(PG_DETOAST_DATUM(origattr));
- else
- attr = origattr;
-
value = OidOutputFunctionCall(typoutput, attr);
printatt((unsigned) i + 1, typeinfo->attrs[i], value);
-
- pfree(value);
-
- /* Clean up detoasted copy, if any */
- if (DatumGetPointer(attr) != DatumGetPointer(origattr))
- pfree(DatumGetPointer(attr));
}
printf("\t----\n");
}
@@ -611,6 +612,7 @@ printtup_internal_20(TupleTableSlot *slot, DestReceiver *self)
{
TupleDesc typeinfo = slot->tts_tupleDescriptor;
DR_printtup *myState = (DR_printtup *) self;
+ MemoryContext oldcontext;
StringInfoData buf;
int natts = typeinfo->natts;
int i,
@@ -624,6 +626,9 @@ printtup_internal_20(TupleTableSlot *slot, DestReceiver *self)
/* Make sure the tuple is fully deconstructed */
slot_getallattrs(slot);
+ /* Switch into per-row context so we can recover memory below */
+ oldcontext = MemoryContextSwitchTo(myState->tmpcontext);
+
/*
* tell the frontend to expect new tuple data (in binary style)
*/
@@ -655,8 +660,7 @@ printtup_internal_20(TupleTableSlot *slot, DestReceiver *self)
for (i = 0; i < natts; ++i)
{
PrinttupAttrInfo *thisState = myState->myinfo + i;
- Datum origattr = slot->tts_values[i],
- attr;
+ Datum attr = slot->tts_values[i];
bytea *outputbytes;
if (slot->tts_isnull[i])
@@ -664,26 +668,15 @@ printtup_internal_20(TupleTableSlot *slot, DestReceiver *self)
Assert(thisState->format == 1);
- /*
- * If we have a toasted datum, forcibly detoast it here to avoid
- * memory leakage inside the type's output routine.
- */
- if (thisState->typisvarlena)
- attr = PointerGetDatum(PG_DETOAST_DATUM(origattr));
- else
- attr = origattr;
-
outputbytes = SendFunctionCall(&thisState->finfo, attr);
- /* We assume the result will not have been toasted */
pq_sendint(&buf, VARSIZE(outputbytes) - VARHDRSZ, 4);
pq_sendbytes(&buf, VARDATA(outputbytes),
VARSIZE(outputbytes) - VARHDRSZ);
- pfree(outputbytes);
-
- /* Clean up detoasted copy, if any */
- if (DatumGetPointer(attr) != DatumGetPointer(origattr))
- pfree(DatumGetPointer(attr));
}
pq_endmessage(&buf);
+
+ /* Return to caller's context, and flush row's temporary memory */
+ MemoryContextSwitchTo(oldcontext);
+ MemoryContextReset(myState->tmpcontext);
}
diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c
index 607a80d7d4..522b671993 100644
--- a/src/backend/access/common/reloptions.c
+++ b/src/backend/access/common/reloptions.c
@@ -3,7 +3,7 @@
* reloptions.c
* Core support for relation options (pg_class.reloptions)
*
- * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@@ -17,12 +17,14 @@
#include "access/gist_private.h"
#include "access/hash.h"
+#include "access/htup_details.h"
#include "access/nbtree.h"
#include "access/reloptions.h"
#include "access/spgist.h"
#include "catalog/pg_type.h"
#include "commands/defrem.h"
#include "commands/tablespace.h"
+#include "commands/view.h"
#include "nodes/makefuncs.h"
#include "utils/array.h"
#include "utils/attoptcache.h"
@@ -61,6 +63,14 @@ static relopt_bool boolRelOpts[] =
},
{
{
+ "user_catalog_table",
+ "Declare a table as an additional catalog table, e.g. for the purpose of logical replication",
+ RELOPT_KIND_HEAP
+ },
+ false
+ },
+ {
+ {
"fastupdate",
"Enables \"fast update\" feature for this GIN index",
RELOPT_KIND_GIN
@@ -163,6 +173,14 @@ static relopt_int intRelOpts[] =
},
{
{
+ "autovacuum_multixact_freeze_min_age",
+ "Minimum multixact age at which VACUUM should freeze a row multixact's, for autovacuum",
+ RELOPT_KIND_HEAP | RELOPT_KIND_TOAST
+ },
+ -1, 0, 1000000000
+ },
+ {
+ {
"autovacuum_freeze_max_age",
"Age at which to autovacuum a table to prevent transaction ID wraparound",
RELOPT_KIND_HEAP | RELOPT_KIND_TOAST
@@ -171,11 +189,27 @@ static relopt_int intRelOpts[] =
},
{
{
+ "autovacuum_multixact_freeze_max_age",
+ "Multixact age at which to autovacuum a table to prevent multixact wraparound",
+ RELOPT_KIND_HEAP | RELOPT_KIND_TOAST
+ },
+ -1, 100000000, 2000000000
+ },
+ {
+ {
"autovacuum_freeze_table_age",
- "Age at which VACUUM should perform a full table sweep to replace old Xid values with FrozenXID",
+ "Age at which VACUUM should perform a full table sweep to freeze row versions",
+ RELOPT_KIND_HEAP | RELOPT_KIND_TOAST
+ }, -1, 0, 2000000000
+ },
+ {
+ {
+ "autovacuum_multixact_freeze_table_age",
+ "Age of multixact at which VACUUM should perform a full table sweep to freeze row versions",
RELOPT_KIND_HEAP | RELOPT_KIND_TOAST
}, -1, 0, 2000000000
},
+
/* list terminator */
{{NULL}}
};
@@ -247,6 +281,17 @@ static relopt_string stringRelOpts[] =
gistValidateBufferingOption,
"auto"
},
+ {
+ {
+ "check_option",
+ "View has WITH CHECK OPTION defined (local or cascaded).",
+ RELOPT_KIND_VIEW
+ },
+ 0,
+ true,
+ validateWithCheckOption,
+ NULL
+ },
/* list terminator */
{{NULL}}
};
@@ -495,7 +540,7 @@ add_real_reloption(bits32 kinds, char *name, char *desc, double default_val,
* Add a new string reloption
*
* "validator" is an optional function pointer that can be used to test the
- * validity of the values. It must elog(ERROR) when the argument string is
+ * validity of the values. It must elog(ERROR) when the argument string is
* not acceptable for the variable. Note that the default value must pass
* the validation.
*/
@@ -790,7 +835,7 @@ extractRelOptions(HeapTuple tuple, TupleDesc tupdesc, Oid amoptions)
case RELKIND_RELATION:
case RELKIND_TOASTVALUE:
case RELKIND_VIEW:
- case RELKIND_UNCATALOGED:
+ case RELKIND_MATVIEW:
options = heap_reloptions(classForm->relkind, datum, false);
break;
case RELKIND_INDEX:
@@ -823,7 +868,7 @@ extractRelOptions(HeapTuple tuple, TupleDesc tupdesc, Oid amoptions)
* is returned.
*
* Note: values of type int, bool and real are allocated as part of the
- * returned array. Values of type string are allocated separately and must
+ * returned array. Values of type string are allocated separately and must
* be freed by the caller.
*/
relopt_value *
@@ -1145,12 +1190,22 @@ default_reloptions(Datum reloptions, bool validate, relopt_kind kind)
offsetof(StdRdOptions, autovacuum) +offsetof(AutoVacOpts, freeze_max_age)},
{"autovacuum_freeze_table_age", RELOPT_TYPE_INT,
offsetof(StdRdOptions, autovacuum) +offsetof(AutoVacOpts, freeze_table_age)},
+ {"autovacuum_multixact_freeze_min_age", RELOPT_TYPE_INT,
+ offsetof(StdRdOptions, autovacuum) +offsetof(AutoVacOpts, multixact_freeze_min_age)},
+ {"autovacuum_multixact_freeze_max_age", RELOPT_TYPE_INT,
+ offsetof(StdRdOptions, autovacuum) +offsetof(AutoVacOpts, multixact_freeze_max_age)},
+ {"autovacuum_multixact_freeze_table_age", RELOPT_TYPE_INT,
+ offsetof(StdRdOptions, autovacuum) +offsetof(AutoVacOpts, multixact_freeze_table_age)},
{"autovacuum_vacuum_scale_factor", RELOPT_TYPE_REAL,
offsetof(StdRdOptions, autovacuum) +offsetof(AutoVacOpts, vacuum_scale_factor)},
{"autovacuum_analyze_scale_factor", RELOPT_TYPE_REAL,
offsetof(StdRdOptions, autovacuum) +offsetof(AutoVacOpts, analyze_scale_factor)},
{"security_barrier", RELOPT_TYPE_BOOL,
offsetof(StdRdOptions, security_barrier)},
+ {"check_option", RELOPT_TYPE_STRING,
+ offsetof(StdRdOptions, check_option_offset)},
+ {"user_catalog_table", RELOPT_TYPE_BOOL,
+ offsetof(StdRdOptions, user_catalog_table)}
};
options = parseRelOptions(reloptions, validate, kind, &numoptions);
@@ -1191,6 +1246,7 @@ heap_reloptions(char relkind, Datum reloptions, bool validate)
}
return (bytea *) rdopts;
case RELKIND_RELATION:
+ case RELKIND_MATVIEW:
return default_reloptions(reloptions, validate, RELOPT_KIND_HEAP);
case RELKIND_VIEW:
return default_reloptions(reloptions, validate, RELOPT_KIND_VIEW);
diff --git a/src/backend/access/common/scankey.c b/src/backend/access/common/scankey.c
index 37d77ed7d5..85c31c49c1 100644
--- a/src/backend/access/common/scankey.c
+++ b/src/backend/access/common/scankey.c
@@ -3,7 +3,7 @@
* scankey.c
* scan key support code
*
- * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
diff --git a/src/backend/access/common/tupconvert.c b/src/backend/access/common/tupconvert.c
index f5a43d47b8..2e48b32ba3 100644
--- a/src/backend/access/common/tupconvert.c
+++ b/src/backend/access/common/tupconvert.c
@@ -5,11 +5,11 @@
*
* These functions provide conversion between rowtypes that are logically
* equivalent but might have columns in a different order or different sets
- * of dropped columns. There is some overlap of functionality with the
+ * of dropped columns. There is some overlap of functionality with the
* executor's "junkfilter" routines, but these functions work on bare
* HeapTuples rather than TupleTableSlots.
*
- * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@@ -20,6 +20,7 @@
*/
#include "postgres.h"
+#include "access/htup_details.h"
#include "access/tupconvert.h"
#include "utils/builtins.h"
diff --git a/src/backend/access/common/tupdesc.c b/src/backend/access/common/tupdesc.c
index daac277f03..f3b36893f7 100644
--- a/src/backend/access/common/tupdesc.c
+++ b/src/backend/access/common/tupdesc.c
@@ -3,7 +3,7 @@
* tupdesc.c
* POSTGRES tuple descriptor support code
*
- * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@@ -19,12 +19,13 @@
#include "postgres.h"
+#include "access/htup_details.h"
#include "catalog/pg_type.h"
#include "miscadmin.h"
#include "parser/parse_type.h"
#include "utils/acl.h"
#include "utils/builtins.h"
-#include "utils/resowner.h"
+#include "utils/resowner_private.h"
#include "utils/syscache.h"
@@ -203,6 +204,7 @@ CreateTupleDescCopyConstr(TupleDesc tupdesc)
if (constr->check[i].ccbin)
cpy->check[i].ccbin = pstrdup(constr->check[i].ccbin);
cpy->check[i].ccvalid = constr->check[i].ccvalid;
+ cpy->check[i].ccnoinherit = constr->check[i].ccnoinherit;
}
}
@@ -216,6 +218,47 @@ CreateTupleDescCopyConstr(TupleDesc tupdesc)
}
/*
+ * TupleDescCopyEntry
+ * This function copies a single attribute structure from one tuple
+ * descriptor to another.
+ *
+ * !!! Constraints and defaults are not copied !!!
+ */
+void
+TupleDescCopyEntry(TupleDesc dst, AttrNumber dstAttno,
+ TupleDesc src, AttrNumber srcAttno)
+{
+ /*
+ * sanity checks
+ */
+ AssertArg(PointerIsValid(src));
+ AssertArg(PointerIsValid(dst));
+ AssertArg(srcAttno >= 1);
+ AssertArg(srcAttno <= src->natts);
+ AssertArg(dstAttno >= 1);
+ AssertArg(dstAttno <= dst->natts);
+
+ memcpy(dst->attrs[dstAttno - 1], src->attrs[srcAttno - 1],
+ ATTRIBUTE_FIXED_PART_SIZE);
+
+ /*
+ * Aside from updating the attno, we'd better reset attcacheoff.
+ *
+ * XXX Actually, to be entirely safe we'd need to reset the attcacheoff of
+ * all following columns in dst as well. Current usage scenarios don't
+ * require that though, because all following columns will get initialized
+ * by other uses of this function or TupleDescInitEntry. So we cheat a
+ * bit to avoid a useless O(N^2) penalty.
+ */
+ dst->attrs[dstAttno - 1]->attnum = dstAttno;
+ dst->attrs[dstAttno - 1]->attcacheoff = -1;
+
+ /* since we're not copying constraints or defaults, clear these */
+ dst->attrs[dstAttno - 1]->attnotnull = false;
+ dst->attrs[dstAttno - 1]->atthasdef = false;
+}
+
+/*
* Free a TupleDesc including all substructure
*/
void
@@ -416,7 +459,9 @@ equalTupleDescs(TupleDesc tupdesc1, TupleDesc tupdesc2)
for (j = 0; j < n; check2++, j++)
{
if (strcmp(check1->ccname, check2->ccname) == 0 &&
- strcmp(check1->ccbin, check2->ccbin) == 0)
+ strcmp(check1->ccbin, check2->ccbin) == 0 &&
+ check1->ccvalid == check2->ccvalid &&
+ check1->ccnoinherit == check2->ccnoinherit)
break;
}
if (j >= n)
@@ -536,7 +581,7 @@ TupleDescInitEntryCollation(TupleDesc desc,
* Given a relation schema (list of ColumnDef nodes), build a TupleDesc.
*
* Note: the default assumption is no OIDs; caller may modify the returned
- * TupleDesc if it wants OIDs. Also, tdtypeid will need to be filled in
+ * TupleDesc if it wants OIDs. Also, tdtypeid will need to be filled in
* later on.
*/
TupleDesc
@@ -579,8 +624,7 @@ BuildDescForRelation(List *schema)
aclresult = pg_type_aclcheck(atttypid, GetUserId(), ACL_USAGE);
if (aclresult != ACLCHECK_OK)
- aclcheck_error(aclresult, ACL_KIND_TYPE,
- format_type_be(atttypid));
+ aclcheck_error_type(aclresult, atttypid);
attcollation = GetColumnDefCollation(NULL, entry, atttypid);
attdim = list_length(entry->typeName->arrayBounds);
diff --git a/src/backend/access/gin/Makefile b/src/backend/access/gin/Makefile
index 889dde6a27..db4f496a4d 100644
--- a/src/backend/access/gin/Makefile
+++ b/src/backend/access/gin/Makefile
@@ -14,6 +14,6 @@ include $(top_builddir)/src/Makefile.global
OBJS = ginutil.o gininsert.o ginxlog.o ginentrypage.o gindatapage.o \
ginbtree.o ginscan.o ginget.o ginvacuum.o ginarrayproc.o \
- ginbulk.o ginfast.o
+ ginbulk.o ginfast.o ginpostinglist.o ginlogic.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/gin/README b/src/backend/access/gin/README
index 67159d8529..fade0cbb61 100644
--- a/src/backend/access/gin/README
+++ b/src/backend/access/gin/README
@@ -104,7 +104,7 @@ a few thousand entries can be much faster than retail insertion. (The win
comes mainly from not having to do multiple searches/insertions when the
same key appears in multiple new heap tuples.)
-Key entries are nominally of the same IndexEntry format as used in other
+Key entries are nominally of the same IndexTuple format as used in other
index types, but since a leaf key entry typically refers to multiple heap
tuples, there are significant differences. (See GinFormTuple, which works
by building a "normal" index tuple and then modifying it.) The points to
@@ -135,15 +135,15 @@ same category of null entry are merged into one index entry just as happens
with ordinary key entries.
* In a key entry at the btree leaf level, at the next SHORTALIGN boundary,
-there is an array of zero or more ItemPointers, which store the heap tuple
-TIDs for which the indexable items contain this key. This is called the
-"posting list". The TIDs in a posting list must appear in sorted order.
-If the list would be too big for the index tuple to fit on an index page,
-the ItemPointers are pushed out to a separate posting page or pages, and
-none appear in the key entry itself. The separate pages are called a
-"posting tree"; they are organized as a btree of ItemPointer values.
-Note that in either case, the ItemPointers associated with a key can
-easily be read out in sorted order; this is relied on by the scan
+there is a list of item pointers, in compressed format (see Posting List
+Compression section), pointing to the heap tuples for which the indexable
+items contain this key. This is called the "posting list".
+
+If the list would be too big for the index tuple to fit on an index page, the
+ItemPointers are pushed out to a separate posting page or pages, and none
+appear in the key entry itself. The separate pages are called a "posting
+tree" (see below). Note that in either case, the ItemPointers associated with
+a key can easily be read out in sorted order; this is relied on by the scan
algorithms.
* The index tuple header fields of a leaf key entry are abused as follows:
@@ -163,6 +163,11 @@ algorithms.
* The posting list can be accessed with GinGetPosting(itup)
+* If GinITupIsCompressed(itup), the posting list is stored in compressed
+ format. Otherwise it is just an array of ItemPointers. New tuples are always
+ stored in compressed format, uncompressed items can be present if the
+ database was migrated from 9.3 or earlier version.
+
2) Posting tree case:
* ItemPointerGetBlockNumber(&itup->t_tid) contains the index block number
@@ -176,7 +181,7 @@ algorithms.
GinSetPostingTree macro.
* If IndexTupleHasNulls(itup) is true, the null category byte can be
- accessed/set with GinGetNullCategory(itup) / GinSetNullCategory(itup,c)
+ accessed/set with GinGetNullCategory(itup,gs) / GinSetNullCategory(itup,gs,c)
* The posting list is not present and must not be accessed.
@@ -210,6 +215,145 @@ fit on one pending-list page must have those pages to itself, even if this
results in wasting much of the space on the preceding page and the last
page for the tuple.)
+Posting tree
+------------
+
+If a posting list is too large to store in-line in a key entry, a posting tree
+is created. A posting tree is a B-tree structure, where the ItemPointer is
+used as the key.
+
+Internal posting tree pages use the standard PageHeader and the same "opaque"
+struct as other GIN page, but do not contain regular index tuples. Instead,
+the contents of the page is an array of PostingItem structs. Each PostingItem
+consists of the block number of the child page, and the right bound of that
+child page, as an ItemPointer. The right bound of the page is stored right
+after the page header, before the PostingItem array.
+
+Posting tree leaf pages also use the standard PageHeader and opaque struct,
+and the right bound of the page is stored right after the page header, but
+the page content consists of a number of compressed posting lists. The
+compressed posting lists are stored one after each other, between page header
+and pd_lower. The space between pd_lower and pd_upper is unused, which allows
+full-page images of posting tree leaf pages to skip the unused space in middle
+(buffer_std = true in XLogRecData).
+
+The item pointers are stored in a number of independent compressed posting
+lists (also called segments), instead of one big one, to make random access
+to a given item pointer faster: to find an item in a compressed list, you
+have to read the list from the beginning, but when the items are split into
+multiple lists, you can first skip over to the list containing the item you're
+looking for, and read only that segment. Also, an update only needs to
+re-encode the affected segment.
+
+Posting List Compression
+------------------------
+
+To fit as many item pointers on a page as possible, posting tree leaf pages
+and posting lists stored inline in entry tree leaf tuples use a lightweight
+form of compression. We take advantage of the fact that the item pointers
+are stored in sorted order. Instead of storing the block and offset number of
+each item pointer separately, we store the difference from the previous item.
+That in itself doesn't do much, but it allows us to use so-called varbyte
+encoding to compress them.
+
+Varbyte encoding is a method to encode integers, allowing smaller numbers to
+take less space at the cost of larger numbers. Each integer is represented by
+a variable number of bytes. High bit of each byte in varbyte encoding determines
+whether the next byte is still part of this number. Therefore, to read a single
+varbyte encoded number, you have to read bytes until you find a byte with the
+high bit not set.
+
+When encoding, the block and offset number forming the item pointer are
+combined into a single integer. The offset number is stored in the 11 low
+bits (see MaxHeapTuplesPerPageBits in ginpostinglist.c), and the block number
+is stored in the higher bits. That requires 43 bits in total, which
+conveniently fits in at most 6 bytes.
+
+A compressed posting list is passed around and stored on disk in a
+PackedPostingList struct. The first item in the list is stored uncompressed
+as a regular ItemPointerData, followed by the length of the list in bytes,
+followed by the packed items.
+
+Concurrency
+-----------
+
+The entry tree and each posting tree is a B-tree, with right-links connecting
+sibling pages at the same level. This is the same structure that is used in
+the regular B-tree indexam (invented by Lehman & Yao), but we don't support
+scanning GIN trees backwards, so we don't need left-links.
+
+To avoid deadlocks, B-tree pages must always be locked in the same order:
+left to right, and bottom to top. When searching, the tree is traversed from
+top to bottom, so the lock on the parent page must be released before
+descending to the next level. Concurrent page splits move the keyspace to
+right, so after following a downlink, the page actually containing the key
+we're looking for might be somewhere to the right of the page we landed on.
+In that case, we follow the right-links until we find the page we're looking
+for.
+
+To delete a page, the page's left sibling, the target page, and its parent,
+are locked in that order, and the page is marked as deleted. However, a
+concurrent search might already have read a pointer to the page, and might be
+just about to follow it. A page can be reached via the right-link of its left
+sibling, or via its downlink in the parent.
+
+To prevent a backend from reaching a deleted page via a right-link, when
+following a right-link the lock on the previous page is not released until
+the lock on next page has been acquired.
+
+The downlink is more tricky. A search descending the tree must release the
+lock on the parent page before locking the child, or it could deadlock with
+a concurrent split of the child page; a page split locks the parent, while
+already holding a lock on the child page. However, posting trees are only
+fully searched from left to right, starting from the leftmost leaf. (The
+tree-structure is only needed by insertions, to quickly find the correct
+insert location). So as long as we don't delete the leftmost page on each
+level, a search can never follow a downlink to page that's about to be
+deleted.
+
+The previous paragraph's reasoning only applies to searches, and only to
+posting trees. To protect from inserters following a downlink to a deleted
+page, vacuum simply locks out all concurrent insertions to the posting tree,
+by holding a super-exclusive lock on the posting tree root. Inserters hold a
+pin on the root page, but searches do not, so while new searches cannot begin
+while root page is locked, any already-in-progress scans can continue
+concurrently with vacuum. In the entry tree, we never delete pages.
+
+(This is quite different from the mechanism the btree indexam uses to make
+page-deletions safe; it stamps the deleted pages with an XID and keeps the
+deleted pages around with the right-link intact until all concurrent scans
+have finished.)
+
+Compatibility
+-------------
+
+Compression of TIDs was introduced in 9.4. Some GIN indexes could remain in
+uncompressed format because of pg_upgrade from 9.3 or earlier versions.
+For compatibility, the old uncompressed format is also supported. The following
+rules are used to handle it:
+
+* GIN_ITUP_COMPRESSED flag marks index tuples that contain a posting list.
+This flag is stored in high bit of ItemPointerGetBlockNumber(&itup->t_tid).
+Use GinItupIsCompressed(itup) to check the flag.
+
+* Posting tree pages in the new format are marked with the GIN_COMPRESSED flag.
+ Macros GinPageIsCompressed(page) and GinPageSetCompressed(page) are used to
+ check and set this flag.
+
+* All scan operations check format of posting list and use corresponding code
+to read its content.
+
+* When updating an index tuple containing an uncompressed posting list, it
+will be replaced with new index tuple containing a compressed list.
+
+* When updating an uncompressed posting tree leaf page, it's compressed.
+
+* If vacuum finds some dead TIDs in uncompressed posting lists, they are
+converted into compressed posting lists. This assumes that the compressed
+posting list fits in the space occupied by the uncompressed list. IOW, we
+assume that the compressed version of the page, with the dead items removed,
+takes less space than the old uncompressed version.
+
Limitations
-----------
diff --git a/src/backend/access/gin/ginarrayproc.c b/src/backend/access/gin/ginarrayproc.c
index f1f934331c..66cea28113 100644
--- a/src/backend/access/gin/ginarrayproc.c
+++ b/src/backend/access/gin/ginarrayproc.c
@@ -4,7 +4,7 @@
* support functions for GIN's indexing of any array
*
*
- * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -197,7 +197,7 @@ ginarrayconsistent(PG_FUNCTION_ARGS)
/*
* Must have all elements in check[] true; no discrimination
- * against nulls here. This is because array_contain_compare and
+ * against nulls here. This is because array_contain_compare and
* array_eq handle nulls differently ...
*/
res = true;
@@ -218,3 +218,88 @@ ginarrayconsistent(PG_FUNCTION_ARGS)
PG_RETURN_BOOL(res);
}
+
+/*
+ * triconsistent support function
+ */
+Datum
+ginarraytriconsistent(PG_FUNCTION_ARGS)
+{
+ GinTernaryValue *check = (GinTernaryValue *) PG_GETARG_POINTER(0);
+ StrategyNumber strategy = PG_GETARG_UINT16(1);
+
+ /* ArrayType *query = PG_GETARG_ARRAYTYPE_P(2); */
+ int32 nkeys = PG_GETARG_INT32(3);
+
+ /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
+ /* Datum *queryKeys = (Datum *) PG_GETARG_POINTER(5); */
+ bool *nullFlags = (bool *) PG_GETARG_POINTER(6);
+ GinTernaryValue res;
+ int32 i;
+
+ switch (strategy)
+ {
+ case GinOverlapStrategy:
+ /* must have a match for at least one non-null element */
+ res = GIN_FALSE;
+ for (i = 0; i < nkeys; i++)
+ {
+ if (!nullFlags[i])
+ {
+ if (check[i] == GIN_TRUE)
+ {
+ res = GIN_TRUE;
+ break;
+ }
+ else if (check[i] == GIN_MAYBE && res == GIN_FALSE)
+ {
+ res = GIN_MAYBE;
+ }
+ }
+ }
+ break;
+ case GinContainsStrategy:
+ /* must have all elements in check[] true, and no nulls */
+ res = GIN_TRUE;
+ for (i = 0; i < nkeys; i++)
+ {
+ if (check[i] == GIN_FALSE || nullFlags[i])
+ {
+ res = GIN_FALSE;
+ break;
+ }
+ if (check[i] == GIN_MAYBE)
+ {
+ res = GIN_MAYBE;
+ }
+ }
+ break;
+ case GinContainedStrategy:
+ /* can't do anything else useful here */
+ res = GIN_MAYBE;
+ break;
+ case GinEqualStrategy:
+
+ /*
+ * Must have all elements in check[] true; no discrimination
+ * against nulls here. This is because array_contain_compare and
+ * array_eq handle nulls differently ...
+ */
+ res = GIN_MAYBE;
+ for (i = 0; i < nkeys; i++)
+ {
+ if (check[i] == GIN_FALSE)
+ {
+ res = GIN_FALSE;
+ break;
+ }
+ }
+ break;
+ default:
+ elog(ERROR, "ginarrayconsistent: unknown strategy number: %d",
+ strategy);
+ res = false;
+ }
+
+ PG_RETURN_GIN_TERNARY_VALUE(res);
+}
diff --git a/src/backend/access/gin/ginbtree.c b/src/backend/access/gin/ginbtree.c
index b160551b5e..27f88e0eb2 100644
--- a/src/backend/access/gin/ginbtree.c
+++ b/src/backend/access/gin/ginbtree.c
@@ -4,7 +4,7 @@
* page utilities routines for the postgres inverted index access method.
*
*
- * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -18,8 +18,15 @@
#include "miscadmin.h"
#include "utils/rel.h"
+static void ginFindParents(GinBtree btree, GinBtreeStack *stack);
+static bool ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
+ void *insertdata, BlockNumber updateblkno,
+ Buffer childbuf, GinStatsData *buildStats);
+static void ginFinishSplit(GinBtree btree, GinBtreeStack *stack,
+ bool freestack, GinStatsData *buildStats);
+
/*
- * Locks buffer by needed method for search.
+ * Lock buffer by needed method for search.
*/
static int
ginTraverseLock(Buffer buffer, bool searchMode)
@@ -52,58 +59,51 @@ ginTraverseLock(Buffer buffer, bool searchMode)
return access;
}
-GinBtreeStack *
-ginPrepareFindLeafPage(GinBtree btree, BlockNumber blkno)
-{
- GinBtreeStack *stack = (GinBtreeStack *) palloc(sizeof(GinBtreeStack));
-
- stack->blkno = blkno;
- stack->buffer = ReadBuffer(btree->index, stack->blkno);
- stack->parent = NULL;
- stack->predictNumber = 1;
-
- ginTraverseLock(stack->buffer, btree->searchMode);
-
- return stack;
-}
-
/*
- * Locates leaf page contained tuple
+ * Descend the tree to the leaf page that contains or would contain the key
+ * we're searching for. The key should already be filled in 'btree', in
+ * tree-type specific manner. If btree->fullScan is true, descends to the
+ * leftmost leaf page.
+ *
+ * If 'searchmode' is false, on return stack->buffer is exclusively locked,
+ * and the stack represents the full path to the root. Otherwise stack->buffer
+ * is share-locked, and stack->parent is NULL.
*/
GinBtreeStack *
-ginFindLeafPage(GinBtree btree, GinBtreeStack *stack)
+ginFindLeafPage(GinBtree btree, bool searchMode)
{
- bool isfirst = TRUE;
- BlockNumber rootBlkno;
+ GinBtreeStack *stack;
- if (!stack)
- stack = ginPrepareFindLeafPage(btree, GIN_ROOT_BLKNO);
- rootBlkno = stack->blkno;
+ stack = (GinBtreeStack *) palloc(sizeof(GinBtreeStack));
+ stack->blkno = btree->rootBlkno;
+ stack->buffer = ReadBuffer(btree->index, btree->rootBlkno);
+ stack->parent = NULL;
+ stack->predictNumber = 1;
for (;;)
{
Page page;
BlockNumber child;
- int access = GIN_SHARE;
+ int access;
stack->off = InvalidOffsetNumber;
page = BufferGetPage(stack->buffer);
- if (isfirst)
- {
- if (GinPageIsLeaf(page) && !btree->searchMode)
- access = GIN_EXCLUSIVE;
- isfirst = FALSE;
- }
- else
- access = ginTraverseLock(stack->buffer, btree->searchMode);
+ access = ginTraverseLock(stack->buffer, searchMode);
+
+ /*
+ * If we're going to modify the tree, finish any incomplete splits we
+ * encounter on the way.
+ */
+ if (!searchMode && GinPageIsIncompleteSplit(page))
+ ginFinishSplit(btree, stack, false, NULL);
/*
* ok, page is correctly locked, we should check to move right ..,
* root never has a right link, so small optimization
*/
- while (btree->fullScan == FALSE && stack->blkno != rootBlkno &&
+ while (btree->fullScan == FALSE && stack->blkno != btree->rootBlkno &&
btree->isMoveRight(btree, page))
{
BlockNumber rightlink = GinPageGetOpaque(page)->rightlink;
@@ -112,11 +112,12 @@ ginFindLeafPage(GinBtree btree, GinBtreeStack *stack)
/* rightmost page */
break;
+ stack->buffer = ginStepRight(stack->buffer, btree->index, access);
stack->blkno = rightlink;
- LockBuffer(stack->buffer, GIN_UNLOCK);
- stack->buffer = ReleaseAndReadBuffer(stack->buffer, btree->index, stack->blkno);
- LockBuffer(stack->buffer, access);
page = BufferGetPage(stack->buffer);
+
+ if (!searchMode && GinPageIsIncompleteSplit(page))
+ ginFinishSplit(btree, stack, false, NULL);
}
if (GinPageIsLeaf(page)) /* we found, return locked page */
@@ -129,7 +130,7 @@ ginFindLeafPage(GinBtree btree, GinBtreeStack *stack)
Assert(child != InvalidBlockNumber);
Assert(stack->blkno != child);
- if (btree->searchMode)
+ if (searchMode)
{
/* in search mode we may forget path to leaf */
stack->blkno = child;
@@ -146,9 +147,41 @@ ginFindLeafPage(GinBtree btree, GinBtreeStack *stack)
stack->predictNumber = 1;
}
}
+}
- /* keep compiler happy */
- return NULL;
+/*
+ * Step right from current page.
+ *
+ * The next page is locked first, before releasing the current page. This is
+ * crucial to protect from concurrent page deletion (see comment in
+ * ginDeletePage).
+ */
+Buffer
+ginStepRight(Buffer buffer, Relation index, int lockmode)
+{
+ Buffer nextbuffer;
+ Page page = BufferGetPage(buffer);
+ bool isLeaf = GinPageIsLeaf(page);
+ bool isData = GinPageIsData(page);
+ BlockNumber blkno = GinPageGetOpaque(page)->rightlink;
+
+ nextbuffer = ReadBuffer(index, blkno);
+ LockBuffer(nextbuffer, lockmode);
+ UnlockReleaseBuffer(buffer);
+
+ /* Sanity check that the page we stepped to is of similar kind. */
+ page = BufferGetPage(nextbuffer);
+ if (isLeaf != GinPageIsLeaf(page) || isData != GinPageIsData(page))
+ elog(ERROR, "right sibling of GIN page is of different type");
+
+ /*
+ * Given the proper lock sequence above, we should never land on a deleted
+ * page.
+ */
+ if (GinPageIsDeleted(page))
+ elog(ERROR, "right sibling of GIN page was deleted");
+
+ return nextbuffer;
}
void
@@ -167,172 +200,293 @@ freeGinBtreeStack(GinBtreeStack *stack)
}
/*
- * Try to find parent for current stack position, returns correct
- * parent and child's offset in stack->parent.
- * Function should never release root page to prevent conflicts
- * with vacuum process
+ * Try to find parent for current stack position. Returns correct parent and
+ * child's offset in stack->parent. The root page is never released, to
+ * prevent conflict with vacuum process.
*/
-void
-ginFindParents(GinBtree btree, GinBtreeStack *stack,
- BlockNumber rootBlkno)
+static void
+ginFindParents(GinBtree btree, GinBtreeStack *stack)
{
-
Page page;
Buffer buffer;
BlockNumber blkno,
leftmostBlkno;
OffsetNumber offset;
- GinBtreeStack *root = stack->parent;
+ GinBtreeStack *root;
GinBtreeStack *ptr;
- if (!root)
+ /*
+ * Unwind the stack all the way up to the root, leaving only the root
+ * item.
+ *
+ * Be careful not to release the pin on the root page! The pin on root
+ * page is required to lock out concurrent vacuums on the tree.
+ */
+ root = stack->parent;
+ while (root->parent)
{
- /* XLog mode... */
- root = (GinBtreeStack *) palloc(sizeof(GinBtreeStack));
- root->blkno = rootBlkno;
- root->buffer = ReadBuffer(btree->index, rootBlkno);
- LockBuffer(root->buffer, GIN_EXCLUSIVE);
- root->parent = NULL;
+ ReleaseBuffer(root->buffer);
+ root = root->parent;
}
- else
- {
- /*
- * find root, we should not release root page until update is
- * finished!!
- */
- while (root->parent)
- {
- ReleaseBuffer(root->buffer);
- root = root->parent;
- }
- Assert(root->blkno == rootBlkno);
- Assert(BufferGetBlockNumber(root->buffer) == rootBlkno);
- LockBuffer(root->buffer, GIN_EXCLUSIVE);
- }
+ Assert(root->blkno == btree->rootBlkno);
+ Assert(BufferGetBlockNumber(root->buffer) == btree->rootBlkno);
root->off = InvalidOffsetNumber;
- page = BufferGetPage(root->buffer);
- Assert(!GinPageIsLeaf(page));
-
- /* check trivial case */
- if ((root->off = btree->findChildPtr(btree, page, stack->blkno, InvalidOffsetNumber)) != InvalidOffsetNumber)
- {
- stack->parent = root;
- return;
- }
+ blkno = root->blkno;
+ buffer = root->buffer;
+ offset = InvalidOffsetNumber;
- leftmostBlkno = blkno = btree->getLeftMostPage(btree, page);
- LockBuffer(root->buffer, GIN_UNLOCK);
- Assert(blkno != InvalidBlockNumber);
+ ptr = (GinBtreeStack *) palloc(sizeof(GinBtreeStack));
for (;;)
{
- buffer = ReadBuffer(btree->index, blkno);
LockBuffer(buffer, GIN_EXCLUSIVE);
page = BufferGetPage(buffer);
if (GinPageIsLeaf(page))
elog(ERROR, "Lost path");
- leftmostBlkno = btree->getLeftMostPage(btree, page);
+ if (GinPageIsIncompleteSplit(page))
+ {
+ Assert(blkno != btree->rootBlkno);
+ ptr->blkno = blkno;
+ ptr->buffer = buffer;
+
+ /*
+ * parent may be wrong, but if so, the ginFinishSplit call will
+ * recurse to call ginFindParents again to fix it.
+ */
+ ptr->parent = root;
+ ptr->off = InvalidOffsetNumber;
+
+ ginFinishSplit(btree, ptr, false, NULL);
+ }
+
+ leftmostBlkno = btree->getLeftMostChild(btree, page);
while ((offset = btree->findChildPtr(btree, page, stack->blkno, InvalidOffsetNumber)) == InvalidOffsetNumber)
{
blkno = GinPageGetOpaque(page)->rightlink;
- LockBuffer(buffer, GIN_UNLOCK);
- ReleaseBuffer(buffer);
if (blkno == InvalidBlockNumber)
+ {
+ UnlockReleaseBuffer(buffer);
break;
- buffer = ReadBuffer(btree->index, blkno);
- LockBuffer(buffer, GIN_EXCLUSIVE);
+ }
+ buffer = ginStepRight(buffer, btree->index, GIN_EXCLUSIVE);
page = BufferGetPage(buffer);
+
+ /* finish any incomplete splits, as above */
+ if (GinPageIsIncompleteSplit(page))
+ {
+ Assert(blkno != btree->rootBlkno);
+ ptr->blkno = blkno;
+ ptr->buffer = buffer;
+ ptr->parent = root;
+ ptr->off = InvalidOffsetNumber;
+
+ ginFinishSplit(btree, ptr, false, NULL);
+ }
}
if (blkno != InvalidBlockNumber)
{
- ptr = (GinBtreeStack *) palloc(sizeof(GinBtreeStack));
ptr->blkno = blkno;
ptr->buffer = buffer;
- ptr->parent = root; /* it's may be wrong, but in next call we will
+ ptr->parent = root; /* it may be wrong, but in next call we will
* correct */
ptr->off = offset;
stack->parent = ptr;
return;
}
+ /* Descend down to next level */
blkno = leftmostBlkno;
+ buffer = ReadBuffer(btree->index, blkno);
}
}
/*
- * Insert value (stored in GinBtree) to tree described by stack
+ * Insert a new item to a page.
*
- * During an index build, buildStats is non-null and the counters
- * it contains should be incremented as needed.
+ * Returns true if the insertion was finished. On false, the page was split and
+ * the parent needs to be updated. (a root split returns true as it doesn't
+ * need any further action by the caller to complete)
*
- * NB: the passed-in stack is freed, as though by freeGinBtreeStack.
+ * When inserting a downlink to a internal page, 'childbuf' contains the
+ * child page that was split. Its GIN_INCOMPLETE_SPLIT flag will be cleared
+ * atomically with the insert. Also, the existing item at the given location
+ * is updated to point to 'updateblkno'.
+ *
+ * stack->buffer is locked on entry, and is kept locked.
*/
-void
-ginInsertValue(GinBtree btree, GinBtreeStack *stack, GinStatsData *buildStats)
+static bool
+ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
+ void *insertdata, BlockNumber updateblkno,
+ Buffer childbuf, GinStatsData *buildStats)
{
- GinBtreeStack *parent = stack;
- BlockNumber rootBlkno = InvalidBuffer;
- Page page,
- rpage,
- lpage;
-
- /* remember root BlockNumber */
- while (parent)
+ Page page = BufferGetPage(stack->buffer);
+ XLogRecData *payloadrdata;
+ GinPlaceToPageRC rc;
+ uint16 xlflags = 0;
+ Page childpage = NULL;
+ Page newlpage = NULL,
+ newrpage = NULL;
+
+ if (GinPageIsData(page))
+ xlflags |= GIN_INSERT_ISDATA;
+ if (GinPageIsLeaf(page))
+ {
+ xlflags |= GIN_INSERT_ISLEAF;
+ Assert(!BufferIsValid(childbuf));
+ Assert(updateblkno == InvalidBlockNumber);
+ }
+ else
{
- rootBlkno = parent->blkno;
- parent = parent->parent;
+ Assert(BufferIsValid(childbuf));
+ Assert(updateblkno != InvalidBlockNumber);
+ childpage = BufferGetPage(childbuf);
}
- while (stack)
+ /*
+ * Try to put the incoming tuple on the page. placeToPage will decide if
+ * the page needs to be split.
+ */
+ rc = btree->placeToPage(btree, stack->buffer, stack,
+ insertdata, updateblkno,
+ &payloadrdata, &newlpage, &newrpage);
+ if (rc == UNMODIFIED)
+ return true;
+ else if (rc == INSERTED)
{
- XLogRecData *rdata;
- BlockNumber savedRightLink;
+ /* placeToPage did START_CRIT_SECTION() */
+ MarkBufferDirty(stack->buffer);
- page = BufferGetPage(stack->buffer);
- savedRightLink = GinPageGetOpaque(page)->rightlink;
+ /* An insert to an internal page finishes the split of the child. */
+ if (childbuf != InvalidBuffer)
+ {
+ GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT;
+ MarkBufferDirty(childbuf);
+ }
- if (btree->isEnoughSpace(btree, stack->buffer, stack->off))
+ if (RelationNeedsWAL(btree->index))
{
- START_CRIT_SECTION();
- btree->placeToPage(btree, stack->buffer, stack->off, &rdata);
+ XLogRecPtr recptr;
+ XLogRecData rdata[3];
+ ginxlogInsert xlrec;
+ BlockIdData childblknos[2];
- MarkBufferDirty(stack->buffer);
+ xlrec.node = btree->index->rd_node;
+ xlrec.blkno = BufferGetBlockNumber(stack->buffer);
+ xlrec.flags = xlflags;
- if (RelationNeedsWAL(btree->index))
+ rdata[0].buffer = InvalidBuffer;
+ rdata[0].data = (char *) &xlrec;
+ rdata[0].len = sizeof(ginxlogInsert);
+
+ /*
+ * Log information about child if this was an insertion of a
+ * downlink.
+ */
+ if (childbuf != InvalidBuffer)
{
- XLogRecPtr recptr;
+ rdata[0].next = &rdata[1];
- recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT, rdata);
- PageSetLSN(page, recptr);
- PageSetTLI(page, ThisTimeLineID);
+ BlockIdSet(&childblknos[0], BufferGetBlockNumber(childbuf));
+ BlockIdSet(&childblknos[1], GinPageGetOpaque(childpage)->rightlink);
+
+ rdata[1].buffer = InvalidBuffer;
+ rdata[1].data = (char *) childblknos;
+ rdata[1].len = sizeof(BlockIdData) * 2;
+ rdata[1].next = &rdata[2];
+
+ rdata[2].buffer = childbuf;
+ rdata[2].buffer_std = false;
+ rdata[2].data = NULL;
+ rdata[2].len = 0;
+ rdata[2].next = payloadrdata;
}
+ else
+ rdata[0].next = payloadrdata;
- LockBuffer(stack->buffer, GIN_UNLOCK);
- END_CRIT_SECTION();
+ recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT, rdata);
+ PageSetLSN(page, recptr);
+ if (childbuf != InvalidBuffer)
+ PageSetLSN(childpage, recptr);
+ }
- freeGinBtreeStack(stack);
+ END_CRIT_SECTION();
- return;
+ return true;
+ }
+ else if (rc == SPLIT)
+ {
+ /* Didn't fit, have to split */
+ Buffer rbuffer;
+ BlockNumber savedRightLink;
+ XLogRecData rdata[2];
+ ginxlogSplit data;
+ Buffer lbuffer = InvalidBuffer;
+ Page newrootpg = NULL;
+
+ rbuffer = GinNewBuffer(btree->index);
+
+ /* During index build, count the new page */
+ if (buildStats)
+ {
+ if (btree->isData)
+ buildStats->nDataPages++;
+ else
+ buildStats->nEntryPages++;
+ }
+
+ savedRightLink = GinPageGetOpaque(page)->rightlink;
+
+ /*
+ * newlpage and newrpage are pointers to memory pages, not associated
+ * with buffers. stack->buffer is not touched yet.
+ */
+
+ data.node = btree->index->rd_node;
+ data.rblkno = BufferGetBlockNumber(rbuffer);
+ data.flags = xlflags;
+ if (childbuf != InvalidBuffer)
+ {
+ Page childpage = BufferGetPage(childbuf);
+
+ GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT;
+
+ data.leftChildBlkno = BufferGetBlockNumber(childbuf);
+ data.rightChildBlkno = GinPageGetOpaque(childpage)->rightlink;
}
else
+ data.leftChildBlkno = data.rightChildBlkno = InvalidBlockNumber;
+
+ rdata[0].buffer = InvalidBuffer;
+ rdata[0].data = (char *) &data;
+ rdata[0].len = sizeof(ginxlogSplit);
+
+ if (childbuf != InvalidBuffer)
{
- Buffer rbuffer = GinNewBuffer(btree->index);
- Page newlpage;
+ rdata[0].next = &rdata[1];
+
+ rdata[1].buffer = childbuf;
+ rdata[1].buffer_std = false;
+ rdata[1].data = NULL;
+ rdata[1].len = 0;
+ rdata[1].next = payloadrdata;
+ }
+ else
+ rdata[0].next = payloadrdata;
+ if (stack->parent == NULL)
+ {
/*
- * newlpage is a pointer to memory page, it doesn't associate with
- * buffer, stack->buffer should be untouched
+ * split root, so we need to allocate new left page and place
+ * pointer on root to left and right page
*/
- newlpage = btree->splitPage(btree, stack->buffer, rbuffer, stack->off, &rdata);
-
- ((ginxlogSplit *) (rdata->data))->rootBlkno = rootBlkno;
+ lbuffer = GinNewBuffer(btree->index);
- /* During index build, count the newly-split page */
+ /* During index build, count the newly-added root page */
if (buildStats)
{
if (btree->isData)
@@ -341,134 +495,259 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack, GinStatsData *buildStats)
buildStats->nEntryPages++;
}
- parent = stack->parent;
+ /*
+ * root never has a right-link, so we borrow the rrlink field to
+ * store the root block number.
+ */
+ data.rrlink = BufferGetBlockNumber(stack->buffer);
+ data.lblkno = BufferGetBlockNumber(lbuffer);
+ data.flags |= GIN_SPLIT_ROOT;
- if (parent == NULL)
- {
- /*
- * split root, so we need to allocate new left page and place
- * pointer on root to left and right page
- */
- Buffer lbuffer = GinNewBuffer(btree->index);
-
- ((ginxlogSplit *) (rdata->data))->isRootSplit = TRUE;
- ((ginxlogSplit *) (rdata->data))->rrlink = InvalidBlockNumber;
-
- page = BufferGetPage(stack->buffer);
- lpage = BufferGetPage(lbuffer);
- rpage = BufferGetPage(rbuffer);
-
- GinPageGetOpaque(rpage)->rightlink = InvalidBlockNumber;
- GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer);
- ((ginxlogSplit *) (rdata->data))->lblkno = BufferGetBlockNumber(lbuffer);
-
- START_CRIT_SECTION();
-
- GinInitBuffer(stack->buffer, GinPageGetOpaque(newlpage)->flags & ~GIN_LEAF);
- PageRestoreTempPage(newlpage, lpage);
- btree->fillRoot(btree, stack->buffer, lbuffer, rbuffer);
-
- MarkBufferDirty(rbuffer);
- MarkBufferDirty(lbuffer);
- MarkBufferDirty(stack->buffer);
-
- if (RelationNeedsWAL(btree->index))
- {
- XLogRecPtr recptr;
-
- recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT, rdata);
- PageSetLSN(page, recptr);
- PageSetTLI(page, ThisTimeLineID);
- PageSetLSN(lpage, recptr);
- PageSetTLI(lpage, ThisTimeLineID);
- PageSetLSN(rpage, recptr);
- PageSetTLI(rpage, ThisTimeLineID);
- }
-
- UnlockReleaseBuffer(rbuffer);
- UnlockReleaseBuffer(lbuffer);
- LockBuffer(stack->buffer, GIN_UNLOCK);
- END_CRIT_SECTION();
-
- freeGinBtreeStack(stack);
-
- /* During index build, count the newly-added root page */
- if (buildStats)
- {
- if (btree->isData)
- buildStats->nDataPages++;
- else
- buildStats->nEntryPages++;
- }
-
- return;
- }
- else
- {
- /* split non-root page */
- ((ginxlogSplit *) (rdata->data))->isRootSplit = FALSE;
- ((ginxlogSplit *) (rdata->data))->rrlink = savedRightLink;
-
- lpage = BufferGetPage(stack->buffer);
- rpage = BufferGetPage(rbuffer);
-
- GinPageGetOpaque(rpage)->rightlink = savedRightLink;
- GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer);
-
- START_CRIT_SECTION();
- PageRestoreTempPage(newlpage, lpage);
-
- MarkBufferDirty(rbuffer);
- MarkBufferDirty(stack->buffer);
-
- if (RelationNeedsWAL(btree->index))
- {
- XLogRecPtr recptr;
-
- recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT, rdata);
- PageSetLSN(lpage, recptr);
- PageSetTLI(lpage, ThisTimeLineID);
- PageSetLSN(rpage, recptr);
- PageSetTLI(rpage, ThisTimeLineID);
- }
- UnlockReleaseBuffer(rbuffer);
- END_CRIT_SECTION();
- }
+ GinPageGetOpaque(newrpage)->rightlink = InvalidBlockNumber;
+ GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer);
+
+ /*
+ * Construct a new root page containing downlinks to the new left
+ * and right pages. (do this in a temporary copy first rather than
+ * overwriting the original page directly, so that we can still
+ * abort gracefully if this fails.)
+ */
+ newrootpg = PageGetTempPage(newrpage);
+ GinInitPage(newrootpg, GinPageGetOpaque(newlpage)->flags & ~(GIN_LEAF | GIN_COMPRESSED), BLCKSZ);
+
+ btree->fillRoot(btree, newrootpg,
+ BufferGetBlockNumber(lbuffer), newlpage,
+ BufferGetBlockNumber(rbuffer), newrpage);
+ }
+ else
+ {
+ /* split non-root page */
+ data.rrlink = savedRightLink;
+ data.lblkno = BufferGetBlockNumber(stack->buffer);
+
+ GinPageGetOpaque(newrpage)->rightlink = savedRightLink;
+ GinPageGetOpaque(newlpage)->flags |= GIN_INCOMPLETE_SPLIT;
+ GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer);
+ }
+
+ /*
+ * Ok, we have the new contents of the left page in a temporary copy
+ * now (newlpage), and the newly-allocated right block has been filled
+ * in. The original page is still unchanged.
+ *
+ * If this is a root split, we also have a temporary page containing
+ * the new contents of the root. Copy the new left page to a
+ * newly-allocated block, and initialize the (original) root page the
+ * new copy. Otherwise, copy over the temporary copy of the new left
+ * page over the old left page.
+ */
+
+ START_CRIT_SECTION();
+
+ MarkBufferDirty(rbuffer);
+ MarkBufferDirty(stack->buffer);
+ if (BufferIsValid(childbuf))
+ MarkBufferDirty(childbuf);
+
+ /*
+ * Restore the temporary copies over the real buffers. But don't free
+ * the temporary copies yet, WAL record data points to them.
+ */
+ if (stack->parent == NULL)
+ {
+ MarkBufferDirty(lbuffer);
+ memcpy(BufferGetPage(stack->buffer), newrootpg, BLCKSZ);
+ memcpy(BufferGetPage(lbuffer), newlpage, BLCKSZ);
+ memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ);
}
+ else
+ {
+ memcpy(BufferGetPage(stack->buffer), newlpage, BLCKSZ);
+ memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ);
+ }
+
+ /* write WAL record */
+ if (RelationNeedsWAL(btree->index))
+ {
+ XLogRecPtr recptr;
+
+ recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT, rdata);
+ PageSetLSN(BufferGetPage(stack->buffer), recptr);
+ PageSetLSN(BufferGetPage(rbuffer), recptr);
+ if (stack->parent == NULL)
+ PageSetLSN(BufferGetPage(lbuffer), recptr);
+ if (BufferIsValid(childbuf))
+ PageSetLSN(childpage, recptr);
+ }
+ END_CRIT_SECTION();
+
+ /*
+ * We can release the lock on the right page now, but keep the
+ * original buffer locked.
+ */
+ UnlockReleaseBuffer(rbuffer);
+ if (stack->parent == NULL)
+ UnlockReleaseBuffer(lbuffer);
+
+ pfree(newlpage);
+ pfree(newrpage);
+ if (newrootpg)
+ pfree(newrootpg);
+
+ /*
+ * If we split the root, we're done. Otherwise the split is not
+ * complete until the downlink for the new page has been inserted to
+ * the parent.
+ */
+ if (stack->parent == NULL)
+ return true;
+ else
+ return false;
+ }
+ else
+ {
+ elog(ERROR, "unknown return code from GIN placeToPage method: %d", rc);
+ return false; /* keep compiler quiet */
+ }
+}
- btree->isDelete = FALSE;
+/*
+ * Finish a split by inserting the downlink for the new page to parent.
+ *
+ * On entry, stack->buffer is exclusively locked.
+ *
+ * If freestack is true, all the buffers are released and unlocked as we
+ * crawl up the tree, and 'stack' is freed. Otherwise stack->buffer is kept
+ * locked, and stack is unmodified, except for possibly moving right to find
+ * the correct parent of page.
+ */
+static void
+ginFinishSplit(GinBtree btree, GinBtreeStack *stack, bool freestack,
+ GinStatsData *buildStats)
+{
+ Page page;
+ bool done;
+ bool first = true;
+
+ /*
+ * freestack == false when we encounter an incompletely split page during
+ * a scan, while freestack == true is used in the normal scenario that a
+ * split is finished right after the initial insert.
+ */
+ if (!freestack)
+ elog(DEBUG1, "finishing incomplete split of block %u in gin index \"%s\"",
+ stack->blkno, RelationGetRelationName(btree->index));
+
+ /* this loop crawls up the stack until the insertion is complete */
+ do
+ {
+ GinBtreeStack *parent = stack->parent;
+ void *insertdata;
+ BlockNumber updateblkno;
/* search parent to lock */
LockBuffer(parent->buffer, GIN_EXCLUSIVE);
+ /*
+ * If the parent page was incompletely split, finish that split first,
+ * then continue with the current one.
+ *
+ * Note: we have to finish *all* incomplete splits we encounter, even
+ * if we have to move right. Otherwise we might choose as the target a
+ * page that has no downlink in the parent, and splitting it further
+ * would fail.
+ */
+ if (GinPageIsIncompleteSplit(BufferGetPage(parent->buffer)))
+ ginFinishSplit(btree, parent, false, buildStats);
+
/* move right if it's needed */
page = BufferGetPage(parent->buffer);
while ((parent->off = btree->findChildPtr(btree, page, stack->blkno, parent->off)) == InvalidOffsetNumber)
{
- BlockNumber rightlink = GinPageGetOpaque(page)->rightlink;
-
- LockBuffer(parent->buffer, GIN_UNLOCK);
-
- if (rightlink == InvalidBlockNumber)
+ if (GinPageRightMost(page))
{
/*
* rightmost page, but we don't find parent, we should use
* plain search...
*/
- ginFindParents(btree, stack, rootBlkno);
+ LockBuffer(parent->buffer, GIN_UNLOCK);
+ ginFindParents(btree, stack);
parent = stack->parent;
- page = BufferGetPage(parent->buffer);
+ Assert(parent != NULL);
break;
}
- parent->blkno = rightlink;
- parent->buffer = ReleaseAndReadBuffer(parent->buffer, btree->index, parent->blkno);
- LockBuffer(parent->buffer, GIN_EXCLUSIVE);
+ parent->buffer = ginStepRight(parent->buffer, btree->index, GIN_EXCLUSIVE);
+ parent->blkno = BufferGetBlockNumber(parent->buffer);
page = BufferGetPage(parent->buffer);
+
+ if (GinPageIsIncompleteSplit(BufferGetPage(parent->buffer)))
+ ginFinishSplit(btree, parent, false, buildStats);
}
- UnlockReleaseBuffer(stack->buffer);
- pfree(stack);
+ /* insert the downlink */
+ insertdata = btree->prepareDownlink(btree, stack->buffer);
+ updateblkno = GinPageGetOpaque(BufferGetPage(stack->buffer))->rightlink;
+ done = ginPlaceToPage(btree, parent,
+ insertdata, updateblkno,
+ stack->buffer, buildStats);
+ pfree(insertdata);
+
+ /*
+ * If the caller requested to free the stack, unlock and release the
+ * child buffer now. Otherwise keep it pinned and locked, but if we
+ * have to recurse up the tree, we can unlock the upper pages, only
+ * keeping the page at the bottom of the stack locked.
+ */
+ if (!first || freestack)
+ LockBuffer(stack->buffer, GIN_UNLOCK);
+ if (freestack)
+ {
+ ReleaseBuffer(stack->buffer);
+ pfree(stack);
+ }
stack = parent;
+
+ first = false;
+ } while (!done);
+
+ /* unlock the parent */
+ LockBuffer(stack->buffer, GIN_UNLOCK);
+
+ if (freestack)
+ freeGinBtreeStack(stack);
+}
+
+/*
+ * Insert a value to tree described by stack.
+ *
+ * The value to be inserted is given in 'insertdata'. Its format depends
+ * on whether this is an entry or data tree, ginInsertValue just passes it
+ * through to the tree-specific callback function.
+ *
+ * During an index build, buildStats is non-null and the counters it contains
+ * are incremented as needed.
+ *
+ * NB: the passed-in stack is freed, as though by freeGinBtreeStack.
+ */
+void
+ginInsertValue(GinBtree btree, GinBtreeStack *stack, void *insertdata,
+ GinStatsData *buildStats)
+{
+ bool done;
+
+ /* If the leaf page was incompletely split, finish the split first */
+ if (GinPageIsIncompleteSplit(BufferGetPage(stack->buffer)))
+ ginFinishSplit(btree, stack, false, buildStats);
+
+ done = ginPlaceToPage(btree, stack,
+ insertdata, InvalidBlockNumber,
+ InvalidBuffer, buildStats);
+ if (done)
+ {
+ LockBuffer(stack->buffer, GIN_UNLOCK);
+ freeGinBtreeStack(stack);
}
+ else
+ ginFinishSplit(btree, stack, true, buildStats);
}
diff --git a/src/backend/access/gin/ginbulk.c b/src/backend/access/gin/ginbulk.c
index ddae862720..3af027187a 100644
--- a/src/backend/access/gin/ginbulk.c
+++ b/src/backend/access/gin/ginbulk.c
@@ -4,7 +4,7 @@
* routines for fast build of inverted index
*
*
- * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -187,7 +187,7 @@ ginInsertBAEntry(BuildAccumulator *accum,
* Since the entries are being inserted into a balanced binary tree, you
* might think that the order of insertion wouldn't be critical, but it turns
* out that inserting the entries in sorted order results in a lot of
- * rebalancing operations and is slow. To prevent this, we attempt to insert
+ * rebalancing operations and is slow. To prevent this, we attempt to insert
* the nodes in an order that will produce a nearly-balanced tree if the input
* is in fact sorted.
*
diff --git a/src/backend/access/gin/gindatapage.c b/src/backend/access/gin/gindatapage.c
index 13601468ed..272a9ca7c0 100644
--- a/src/backend/access/gin/gindatapage.c
+++ b/src/backend/access/gin/gindatapage.c
@@ -1,10 +1,10 @@
/*-------------------------------------------------------------------------
*
* gindatapage.c
- * page utilities routines for the postgres inverted index access method.
+ * routines for handling GIN posting tree pages.
*
*
- * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -15,69 +15,214 @@
#include "postgres.h"
#include "access/gin_private.h"
+#include "access/heapam_xlog.h"
+#include "lib/ilist.h"
+#include "miscadmin.h"
+#include "utils/memutils.h"
#include "utils/rel.h"
-int
-ginCompareItemPointers(ItemPointer a, ItemPointer b)
+/*
+ * Min, Max and Target size of posting lists stored on leaf pages, in bytes.
+ *
+ * The code can deal with any size, but random access is more efficient when
+ * a number of smaller lists are stored, rather than one big list. If a
+ * posting list would become larger than Max size as a result of insertions,
+ * it is split into two. If a posting list would be smaller than minimum
+ * size, it is merged with the next posting list.
+ */
+#define GinPostingListSegmentMaxSize 384
+#define GinPostingListSegmentTargetSize 256
+#define GinPostingListSegmentMinSize 128
+
+/*
+ * At least this many items fit in a GinPostingListSegmentMaxSize-bytes
+ * long segment. This is used when estimating how much space is required
+ * for N items, at minimum.
+ */
+#define MinTuplesPerSegment ((GinPostingListSegmentMaxSize - 2) / 6)
+
+/*
+ * A working struct for manipulating a posting tree leaf page.
+ */
+typedef struct
{
- BlockNumber ba = GinItemPointerGetBlockNumber(a);
- BlockNumber bb = GinItemPointerGetBlockNumber(b);
+ dlist_head segments; /* a list of leafSegmentInfos */
- if (ba == bb)
- {
- OffsetNumber oa = GinItemPointerGetOffsetNumber(a);
- OffsetNumber ob = GinItemPointerGetOffsetNumber(b);
+ /*
+ * The following fields represent how the segments are split across pages,
+ * if a page split is required. Filled in by leafRepackItems.
+ */
+ dlist_node *lastleft; /* last segment on left page */
+ int lsize; /* total size on left page */
+ int rsize; /* total size on right page */
- if (oa == ob)
- return 0;
- return (oa > ob) ? 1 : -1;
- }
+ bool oldformat; /* page is in pre-9.4 format on disk */
+} disassembledLeaf;
- return (ba > bb) ? 1 : -1;
-}
+typedef struct
+{
+ dlist_node node; /* linked list pointers */
+
+ /*-------------
+ * 'action' indicates the status of this in-memory segment, compared to
+ * what's on disk. It is one of the GIN_SEGMENT_* action codes:
+ *
+ * UNMODIFIED no changes
+ * DELETE the segment is to be removed. 'seg' and 'items' are
+ * ignored
+ * INSERT this is a completely new segment
+ * REPLACE this replaces an existing segment with new content
+ * ADDITEMS like REPLACE, but no items have been removed, and we track
+ * in detail what items have been added to this segment, in
+ * 'modifieditems'
+ *-------------
+ */
+ char action;
+
+ ItemPointerData *modifieditems;
+ int nmodifieditems;
+
+ /*
+ * The following fields represent the items in this segment. If 'items' is
+ * not NULL, it contains a palloc'd array of the itemsin this segment. If
+ * 'seg' is not NULL, it contains the items in an already-compressed
+ * format. It can point to an on-disk page (!modified), or a palloc'd
+ * segment in memory. If both are set, they must represent the same items.
+ */
+ GinPostingList *seg;
+ ItemPointer items;
+ int nitems; /* # of items in 'items', if items != NULL */
+} leafSegmentInfo;
+
+static ItemPointer dataLeafPageGetUncompressed(Page page, int *nitems);
+static void dataSplitPageInternal(GinBtree btree, Buffer origbuf,
+ GinBtreeStack *stack,
+ void *insertdata, BlockNumber updateblkno,
+ XLogRecData **prdata, Page *newlpage, Page *newrpage);
+
+static disassembledLeaf *disassembleLeaf(Page page);
+static bool leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining);
+static bool addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems,
+ int nNewItems);
+
+static XLogRecData *constructLeafRecompressWALData(Buffer buf,
+ disassembledLeaf *leaf);
+static void dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf);
+static void dataPlaceToPageLeafSplit(Buffer buf,
+ disassembledLeaf *leaf,
+ ItemPointerData lbound, ItemPointerData rbound,
+ XLogRecData **prdata, Page lpage, Page rpage);
/*
- * Merge two ordered arrays of itempointers, eliminating any duplicates.
- * Returns the number of items in the result.
- * Caller is responsible that there is enough space at *dst.
+ * Read TIDs from leaf data page to single uncompressed array. The TIDs are
+ * returned in ascending order.
+ *
+ * advancePast is a hint, indicating that the caller is only interested in
+ * TIDs > advancePast. To return all items, use ItemPointerSetMin.
+ *
+ * Note: This function can still return items smaller than advancePast that
+ * are in the same posting list as the items of interest, so the caller must
+ * still check all the returned items. But passing it allows this function to
+ * skip whole posting lists.
*/
-uint32
-ginMergeItemPointers(ItemPointerData *dst,
- ItemPointerData *a, uint32 na,
- ItemPointerData *b, uint32 nb)
+ItemPointer
+GinDataLeafPageGetItems(Page page, int *nitems, ItemPointerData advancePast)
{
- ItemPointerData *dptr = dst;
- ItemPointerData *aptr = a,
- *bptr = b;
+ ItemPointer result;
- while (aptr - a < na && bptr - b < nb)
+ if (GinPageIsCompressed(page))
{
- int cmp = ginCompareItemPointers(aptr, bptr);
+ GinPostingList *seg = GinDataLeafPageGetPostingList(page);
+ Size len = GinDataLeafPageGetPostingListSize(page);
+ Pointer endptr = ((Pointer) seg) + len;
+ GinPostingList *next;
- if (cmp > 0)
- *dptr++ = *bptr++;
- else if (cmp == 0)
+ /* Skip to the segment containing advancePast+1 */
+ if (ItemPointerIsValid(&advancePast))
{
- /* we want only one copy of the identical items */
- *dptr++ = *bptr++;
- aptr++;
+ next = GinNextPostingListSegment(seg);
+ while ((Pointer) next < endptr &&
+ ginCompareItemPointers(&next->first, &advancePast) <= 0)
+ {
+ seg = next;
+ next = GinNextPostingListSegment(seg);
+ }
+ len = endptr - (Pointer) seg;
}
+
+ if (len > 0)
+ result = ginPostingListDecodeAllSegments(seg, len, nitems);
else
- *dptr++ = *aptr++;
+ {
+ result = NULL;
+ *nitems = 0;
+ }
}
+ else
+ {
+ ItemPointer tmp = dataLeafPageGetUncompressed(page, nitems);
- while (aptr - a < na)
- *dptr++ = *aptr++;
+ result = palloc((*nitems) * sizeof(ItemPointerData));
+ memcpy(result, tmp, (*nitems) * sizeof(ItemPointerData));
+ }
- while (bptr - b < nb)
- *dptr++ = *bptr++;
+ return result;
+}
- return dptr - dst;
+/*
+ * Places all TIDs from leaf data page to bitmap.
+ */
+int
+GinDataLeafPageGetItemsToTbm(Page page, TIDBitmap *tbm)
+{
+ ItemPointer uncompressed;
+ int nitems;
+
+ if (GinPageIsCompressed(page))
+ {
+ GinPostingList *segment = GinDataLeafPageGetPostingList(page);
+ Size len = GinDataLeafPageGetPostingListSize(page);
+
+ nitems = ginPostingListDecodeAllSegmentsToTbm(segment, len, tbm);
+ }
+ else
+ {
+ uncompressed = dataLeafPageGetUncompressed(page, &nitems);
+
+ if (nitems > 0)
+ tbm_add_tuples(tbm, uncompressed, nitems, false);
+ }
+
+ return nitems;
}
/*
- * Checks, should we move to right link...
- * Compares inserting itemp pointer with right bound of current page
+ * Get pointer to the uncompressed array of items on a pre-9.4 format
+ * uncompressed leaf page. The number of items in the array is returned in
+ * *nitems.
+ */
+static ItemPointer
+dataLeafPageGetUncompressed(Page page, int *nitems)
+{
+ ItemPointer items;
+
+ Assert(!GinPageIsCompressed(page));
+
+ /*
+ * In the old pre-9.4 page format, the whole page content is used for
+ * uncompressed items, and the number of items is stored in 'maxoff'
+ */
+ items = (ItemPointer) GinDataPageGetData(page);
+ *nitems = GinPageGetOpaque(page)->maxoff;
+
+ return items;
+}
+
+/*
+ * Check if we should follow the right link to find the item we're searching
+ * for.
+ *
+ * Compares inserting item pointer with the right bound of the current page.
*/
static bool
dataIsMoveRight(GinBtree btree, Page page)
@@ -87,12 +232,12 @@ dataIsMoveRight(GinBtree btree, Page page)
if (GinPageRightMost(page))
return FALSE;
- return (ginCompareItemPointers(btree->items + btree->curitem, iptr) > 0) ? TRUE : FALSE;
+ return (ginCompareItemPointers(&btree->itemptr, iptr) > 0) ? TRUE : FALSE;
}
/*
- * Find correct PostingItem in non-leaf page. It supposed that page
- * correctly chosen and searching value SHOULD be on page
+ * Find correct PostingItem in non-leaf page. It is assumed that this is
+ * the correct page, and the searched value SHOULD be on the page.
*/
static BlockNumber
dataLocateItem(GinBtree btree, GinBtreeStack *stack)
@@ -111,7 +256,7 @@ dataLocateItem(GinBtree btree, GinBtreeStack *stack)
{
stack->off = FirstOffsetNumber;
stack->predictNumber *= GinPageGetOpaque(page)->maxoff;
- return btree->getLeftMostPage(btree, page);
+ return btree->getLeftMostChild(btree, page);
}
low = FirstOffsetNumber;
@@ -124,7 +269,7 @@ dataLocateItem(GinBtree btree, GinBtreeStack *stack)
{
OffsetNumber mid = low + ((high - low) / 2);
- pitem = (PostingItem *) GinDataPageGetItem(page, mid);
+ pitem = GinDataPageGetPostingItem(page, mid);
if (mid == maxoff)
{
@@ -136,8 +281,8 @@ dataLocateItem(GinBtree btree, GinBtreeStack *stack)
}
else
{
- pitem = (PostingItem *) GinDataPageGetItem(page, mid);
- result = ginCompareItemPointers(btree->items + btree->curitem, &(pitem->key));
+ pitem = GinDataPageGetPostingItem(page, mid);
+ result = ginCompareItemPointers(&btree->itemptr, &(pitem->key));
}
if (result == 0)
@@ -154,67 +299,12 @@ dataLocateItem(GinBtree btree, GinBtreeStack *stack)
Assert(high >= FirstOffsetNumber && high <= maxoff);
stack->off = high;
- pitem = (PostingItem *) GinDataPageGetItem(page, high);
+ pitem = GinDataPageGetPostingItem(page, high);
return PostingItemGetBlockNumber(pitem);
}
/*
- * Searches correct position for value on leaf page.
- * Page should be correctly chosen.
- * Returns true if value found on page.
- */
-static bool
-dataLocateLeafItem(GinBtree btree, GinBtreeStack *stack)
-{
- Page page = BufferGetPage(stack->buffer);
- OffsetNumber low,
- high;
- int result;
-
- Assert(GinPageIsLeaf(page));
- Assert(GinPageIsData(page));
-
- if (btree->fullScan)
- {
- stack->off = FirstOffsetNumber;
- return TRUE;
- }
-
- low = FirstOffsetNumber;
- high = GinPageGetOpaque(page)->maxoff;
-
- if (high < low)
- {
- stack->off = FirstOffsetNumber;
- return false;
- }
-
- high++;
-
- while (high > low)
- {
- OffsetNumber mid = low + ((high - low) / 2);
-
- result = ginCompareItemPointers(btree->items + btree->curitem, (ItemPointer) GinDataPageGetItem(page, mid));
-
- if (result == 0)
- {
- stack->off = mid;
- return true;
- }
- else if (result > 0)
- low = mid + 1;
- else
- high = mid;
- }
-
- stack->off = high;
- return false;
-}
-
-/*
- * Finds links to blkno on non-leaf page, returns
- * offset of PostingItem
+ * Find link to blkno on non-leaf page, returns offset of PostingItem
*/
static OffsetNumber
dataFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber storedOff)
@@ -229,7 +319,7 @@ dataFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber stor
/* if page isn't changed, we return storedOff */
if (storedOff >= FirstOffsetNumber && storedOff <= maxoff)
{
- pitem = (PostingItem *) GinDataPageGetItem(page, storedOff);
+ pitem = GinDataPageGetPostingItem(page, storedOff);
if (PostingItemGetBlockNumber(pitem) == blkno)
return storedOff;
@@ -239,7 +329,7 @@ dataFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber stor
*/
for (i = storedOff + 1; i <= maxoff; i++)
{
- pitem = (PostingItem *) GinDataPageGetItem(page, i);
+ pitem = GinDataPageGetPostingItem(page, i);
if (PostingItemGetBlockNumber(pitem) == blkno)
return i;
}
@@ -250,7 +340,7 @@ dataFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber stor
/* last chance */
for (i = FirstOffsetNumber; i <= maxoff; i++)
{
- pitem = (PostingItem *) GinDataPageGetItem(page, i);
+ pitem = GinDataPageGetPostingItem(page, i);
if (PostingItemGetBlockNumber(pitem) == blkno)
return i;
}
@@ -259,7 +349,7 @@ dataFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber stor
}
/*
- * returns blkno of leftmost child
+ * Return blkno of leftmost child
*/
static BlockNumber
dataGetLeftMostPage(GinBtree btree, Page page)
@@ -270,39 +360,49 @@ dataGetLeftMostPage(GinBtree btree, Page page)
Assert(GinPageIsData(page));
Assert(GinPageGetOpaque(page)->maxoff >= FirstOffsetNumber);
- pitem = (PostingItem *) GinDataPageGetItem(page, FirstOffsetNumber);
+ pitem = GinDataPageGetPostingItem(page, FirstOffsetNumber);
return PostingItemGetBlockNumber(pitem);
}
/*
- * add ItemPointer or PostingItem to page. data should point to
- * correct value! depending on leaf or non-leaf page
+ * Add PostingItem to a non-leaf page.
*/
void
-GinDataPageAddItem(Page page, void *data, OffsetNumber offset)
+GinDataPageAddPostingItem(Page page, PostingItem *data, OffsetNumber offset)
{
OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff;
char *ptr;
+ Assert(PostingItemGetBlockNumber(data) != InvalidBlockNumber);
+ Assert(!GinPageIsLeaf(page));
+
if (offset == InvalidOffsetNumber)
{
- ptr = GinDataPageGetItem(page, maxoff + 1);
+ ptr = (char *) GinDataPageGetPostingItem(page, maxoff + 1);
}
else
{
- ptr = GinDataPageGetItem(page, offset);
- if (maxoff + 1 - offset != 0)
- memmove(ptr + GinSizeOfDataPageItem(page),
+ ptr = (char *) GinDataPageGetPostingItem(page, offset);
+ if (offset != maxoff + 1)
+ memmove(ptr + sizeof(PostingItem),
ptr,
- (maxoff - offset + 1) * GinSizeOfDataPageItem(page));
+ (maxoff - offset + 1) *sizeof(PostingItem));
}
- memcpy(ptr, data, GinSizeOfDataPageItem(page));
+ memcpy(ptr, data, sizeof(PostingItem));
+
+ maxoff++;
+ GinPageGetOpaque(page)->maxoff = maxoff;
- GinPageGetOpaque(page)->maxoff++;
+ /*
+ * Also set pd_lower to the end of the posting items, to follow the
+ * "standard" page layout, so that we can squeeze out the unused space
+ * from full-page images.
+ */
+ GinDataPageSetDataSize(page, maxoff * sizeof(PostingItem));
}
/*
- * Deletes posting item from non-leaf page
+ * Delete posting item from non-leaf page
*/
void
GinPageDeletePostingItem(Page page, OffsetNumber offset)
@@ -313,271 +413,892 @@ GinPageDeletePostingItem(Page page, OffsetNumber offset)
Assert(offset >= FirstOffsetNumber && offset <= maxoff);
if (offset != maxoff)
- memmove(GinDataPageGetItem(page, offset), GinDataPageGetItem(page, offset + 1),
+ memmove(GinDataPageGetPostingItem(page, offset),
+ GinDataPageGetPostingItem(page, offset + 1),
sizeof(PostingItem) * (maxoff - offset));
- GinPageGetOpaque(page)->maxoff--;
+ maxoff--;
+ GinPageGetOpaque(page)->maxoff = maxoff;
+
+ GinDataPageSetDataSize(page, maxoff * sizeof(PostingItem));
}
/*
- * checks space to install new value,
- * item pointer never deletes!
+ * Places keys to leaf data page and fills WAL record.
*/
-static bool
-dataIsEnoughSpace(GinBtree btree, Buffer buf, OffsetNumber off)
+static GinPlaceToPageRC
+dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack,
+ void *insertdata, XLogRecData **prdata,
+ Page *newlpage, Page *newrpage)
{
+ GinBtreeDataLeafInsertData *items = insertdata;
+ ItemPointer newItems = &items->items[items->curitem];
+ int maxitems = items->nitem - items->curitem;
Page page = BufferGetPage(buf);
+ int i;
+ ItemPointerData rbound;
+ ItemPointerData lbound;
+ bool needsplit;
+ bool append;
+ int segsize;
+ Size freespace;
+ MemoryContext tmpCxt;
+ MemoryContext oldCxt;
+ disassembledLeaf *leaf;
+ leafSegmentInfo *lastleftinfo;
+ ItemPointerData maxOldItem;
+ ItemPointerData remaining;
Assert(GinPageIsData(page));
- Assert(!btree->isDelete);
- if (GinPageIsLeaf(page))
+ rbound = *GinDataPageGetRightBound(page);
+
+ /*
+ * Count how many of the new items belong to this page.
+ */
+ if (!GinPageRightMost(page))
{
- if (GinPageRightMost(page) && off > GinPageGetOpaque(page)->maxoff)
+ for (i = 0; i < maxitems; i++)
{
- if ((btree->nitem - btree->curitem) * sizeof(ItemPointerData) <= GinDataPageGetFreeSpace(page))
- return true;
+ if (ginCompareItemPointers(&newItems[i], &rbound) > 0)
+ {
+ /*
+ * This needs to go to some other location in the tree. (The
+ * caller should've chosen the insert location so that at
+ * least the first item goes here.)
+ */
+ Assert(i > 0);
+ break;
+ }
}
- else if (sizeof(ItemPointerData) <= GinDataPageGetFreeSpace(page))
- return true;
+ maxitems = i;
}
- else if (sizeof(PostingItem) <= GinDataPageGetFreeSpace(page))
- return true;
- return false;
-}
+ /*
+ * The following operations do quite a lot of small memory allocations,
+ * create a temporary memory context so that we don't need to keep track
+ * of them individually.
+ */
+ tmpCxt = AllocSetContextCreate(CurrentMemoryContext,
+ "Gin split temporary context",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+ oldCxt = MemoryContextSwitchTo(tmpCxt);
-/*
- * In case of previous split update old child blkno to
- * new right page
- * item pointer never deletes!
- */
-static BlockNumber
-dataPrepareData(GinBtree btree, Page page, OffsetNumber off)
-{
- BlockNumber ret = InvalidBlockNumber;
+ leaf = disassembleLeaf(page);
- Assert(GinPageIsData(page));
+ /*
+ * Are we appending to the end of the page? IOW, are all the new items
+ * larger than any of the existing items.
+ */
+ if (!dlist_is_empty(&leaf->segments))
+ {
+ lastleftinfo = dlist_container(leafSegmentInfo, node,
+ dlist_tail_node(&leaf->segments));
+ if (!lastleftinfo->items)
+ lastleftinfo->items = ginPostingListDecode(lastleftinfo->seg,
+ &lastleftinfo->nitems);
+ maxOldItem = lastleftinfo->items[lastleftinfo->nitems - 1];
+ if (ginCompareItemPointers(&newItems[0], &maxOldItem) >= 0)
+ append = true;
+ else
+ append = false;
+ }
+ else
+ {
+ ItemPointerSetMin(&maxOldItem);
+ append = true;
+ }
- if (!GinPageIsLeaf(page) && btree->rightblkno != InvalidBlockNumber)
+ /*
+ * If we're appending to the end of the page, we will append as many items
+ * as we can fit (after splitting), and stop when the pages becomes full.
+ * Otherwise we have to limit the number of new items to insert, because
+ * once we start packing we can't just stop when we run out of space,
+ * because we must make sure that all the old items still fit.
+ */
+ if (GinPageIsCompressed(page))
+ freespace = GinDataLeafPageGetFreeSpace(page);
+ else
+ freespace = 0;
+ if (append)
{
- PostingItem *pitem = (PostingItem *) GinDataPageGetItem(page, off);
+ /*
+ * Even when appending, trying to append more items than will fit is
+ * not completely free, because we will merge the new items and old
+ * items into an array below. In the best case, every new item fits in
+ * a single byte, and we can use all the free space on the old page as
+ * well as the new page. For simplicity, ignore segment overhead etc.
+ */
+ maxitems = Min(maxitems, freespace + GinDataPageMaxDataSize);
+ }
+ else
+ {
+ /*
+ * Calculate a conservative estimate of how many new items we can fit
+ * on the two pages after splitting.
+ *
+ * We can use any remaining free space on the old page to store full
+ * segments, as well as the new page. Each full-sized segment can hold
+ * at least MinTuplesPerSegment items
+ */
+ int nnewsegments;
+
+ nnewsegments = freespace / GinPostingListSegmentMaxSize;
+ nnewsegments += GinDataPageMaxDataSize / GinPostingListSegmentMaxSize;
+ maxitems = Min(maxitems, nnewsegments * MinTuplesPerSegment);
+ }
+
+ /* Add the new items to the segments */
+ if (!addItemsToLeaf(leaf, newItems, maxitems))
+ {
+ /* all items were duplicates, we have nothing to do */
+ items->curitem += maxitems;
+
+ MemoryContextSwitchTo(oldCxt);
+ MemoryContextDelete(tmpCxt);
+
+ return UNMODIFIED;
+ }
+
+ /*
+ * Pack the items back to compressed segments, ready for writing to disk.
+ */
+ needsplit = leafRepackItems(leaf, &remaining);
+
+ /*
+ * Did all the new items fit?
+ *
+ * If we're appending, it's OK if they didn't. But as a sanity check,
+ * verify that all the old items fit.
+ */
+ if (ItemPointerIsValid(&remaining))
+ {
+ if (!append || ItemPointerCompare(&maxOldItem, &remaining) >= 0)
+ elog(ERROR, "could not split GIN page; all old items didn't fit");
+
+ /* Count how many of the new items did fit. */
+ for (i = 0; i < maxitems; i++)
+ {
+ if (ginCompareItemPointers(&newItems[i], &remaining) >= 0)
+ break;
+ }
+ if (i == 0)
+ elog(ERROR, "could not split GIN page; no new items fit");
+ maxitems = i;
+ }
+
+ if (!needsplit)
+ {
+ /*
+ * Great, all the items fit on a single page. Construct a WAL record
+ * describing the changes we made, and write the segments back to the
+ * page.
+ *
+ * Once we start modifying the page, there's no turning back. The
+ * caller is responsible for calling END_CRIT_SECTION() after writing
+ * the WAL record.
+ */
+ MemoryContextSwitchTo(oldCxt);
+ if (RelationNeedsWAL(btree->index))
+ *prdata = constructLeafRecompressWALData(buf, leaf);
+ else
+ *prdata = NULL;
+ START_CRIT_SECTION();
+ dataPlaceToPageLeafRecompress(buf, leaf);
+
+ if (append)
+ elog(DEBUG2, "appended %d new items to block %u; %d bytes (%d to go)",
+ maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize,
+ items->nitem - items->curitem - maxitems);
+ else
+ elog(DEBUG2, "inserted %d new items to block %u; %d bytes (%d to go)",
+ maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize,
+ items->nitem - items->curitem - maxitems);
+ }
+ else
+ {
+ /*
+ * Had to split.
+ *
+ * We already divided the segments between the left and the right
+ * page. The left page was filled as full as possible, and the rest
+ * overflowed to the right page. When building a new index, that's
+ * good, because the table is scanned from beginning to end and there
+ * won't be any more insertions to the left page during the build.
+ * This packs the index as tight as possible. But otherwise, split
+ * 50/50, by moving segments from the left page to the right page
+ * until they're balanced.
+ *
+ * As a further heuristic, when appending items to the end of the
+ * page, split 75/25, one the assumption that subsequent insertions
+ * will probably also go to the end. This packs the index somewhat
+ * tighter when appending to a table, which is very common.
+ */
+ if (!btree->isBuild)
+ {
+ while (dlist_has_prev(&leaf->segments, leaf->lastleft))
+ {
+ lastleftinfo = dlist_container(leafSegmentInfo, node, leaf->lastleft);
+
+ segsize = SizeOfGinPostingList(lastleftinfo->seg);
+ if (append)
+ {
+ if ((leaf->lsize - segsize) - (leaf->lsize - segsize) < BLCKSZ / 4)
+ break;
+ }
+ else
+ {
+ if ((leaf->lsize - segsize) - (leaf->rsize + segsize) < 0)
+ break;
+ }
+
+ leaf->lsize -= segsize;
+ leaf->rsize += segsize;
+ leaf->lastleft = dlist_prev_node(&leaf->segments, leaf->lastleft);
+ }
+ }
+ Assert(leaf->lsize <= GinDataPageMaxDataSize);
+ Assert(leaf->rsize <= GinDataPageMaxDataSize);
- PostingItemSetBlockNumber(pitem, btree->rightblkno);
- ret = btree->rightblkno;
+ /*
+ * Fetch the max item in the left page's last segment; it becomes the
+ * right bound of the page.
+ */
+ lastleftinfo = dlist_container(leafSegmentInfo, node, leaf->lastleft);
+ if (!lastleftinfo->items)
+ lastleftinfo->items = ginPostingListDecode(lastleftinfo->seg,
+ &lastleftinfo->nitems);
+ lbound = lastleftinfo->items[lastleftinfo->nitems - 1];
+
+ *newlpage = MemoryContextAlloc(oldCxt, BLCKSZ);
+ *newrpage = MemoryContextAlloc(oldCxt, BLCKSZ);
+
+ dataPlaceToPageLeafSplit(buf, leaf, lbound, rbound,
+ prdata, *newlpage, *newrpage);
+
+ Assert(GinPageRightMost(page) ||
+ ginCompareItemPointers(GinDataPageGetRightBound(*newlpage),
+ GinDataPageGetRightBound(*newrpage)) < 0);
+
+ if (append)
+ elog(DEBUG2, "appended %d items to block %u; split %d/%d (%d to go)",
+ maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, (int) leaf->rsize,
+ items->nitem - items->curitem - maxitems);
+ else
+ elog(DEBUG2, "inserted %d items to block %u; split %d/%d (%d to go)",
+ maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, (int) leaf->rsize,
+ items->nitem - items->curitem - maxitems);
}
- btree->rightblkno = InvalidBlockNumber;
+ MemoryContextSwitchTo(oldCxt);
+ MemoryContextDelete(tmpCxt);
+
+ items->curitem += maxitems;
- return ret;
+ return needsplit ? SPLIT : INSERTED;
}
/*
- * Places keys to page and fills WAL record. In case leaf page and
- * build mode puts all ItemPointers to page.
+ * Vacuum a posting tree leaf page.
*/
-static void
-dataPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prdata)
+void
+ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs)
{
- Page page = BufferGetPage(buf);
- int sizeofitem = GinSizeOfDataPageItem(page);
- int cnt = 0;
+ Page page = BufferGetPage(buffer);
+ disassembledLeaf *leaf;
+ bool removedsomething = false;
+ dlist_iter iter;
- /* these must be static so they can be returned to caller */
- static XLogRecData rdata[3];
- static ginxlogInsert data;
+ leaf = disassembleLeaf(page);
- *prdata = rdata;
- Assert(GinPageIsData(page));
-
- data.updateBlkno = dataPrepareData(btree, page, off);
+ /* Vacuum each segment. */
+ dlist_foreach(iter, &leaf->segments)
+ {
+ leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur);
+ int oldsegsize;
+ ItemPointer cleaned;
+ int ncleaned;
+
+ if (!seginfo->items)
+ seginfo->items = ginPostingListDecode(seginfo->seg,
+ &seginfo->nitems);
+ if (seginfo->seg)
+ oldsegsize = SizeOfGinPostingList(seginfo->seg);
+ else
+ oldsegsize = GinDataPageMaxDataSize;
+
+ cleaned = ginVacuumItemPointers(gvs,
+ seginfo->items,
+ seginfo->nitems,
+ &ncleaned);
+ pfree(seginfo->items);
+ seginfo->items = NULL;
+ seginfo->nitems = 0;
+ if (cleaned)
+ {
+ if (ncleaned > 0)
+ {
+ int npacked;
+
+ seginfo->seg = ginCompressPostingList(cleaned,
+ ncleaned,
+ oldsegsize,
+ &npacked);
+ /* Removing an item never increases the size of the segment */
+ if (npacked != ncleaned)
+ elog(ERROR, "could not fit vacuumed posting list");
+ seginfo->action = GIN_SEGMENT_REPLACE;
+ }
+ else
+ {
+ seginfo->seg = NULL;
+ seginfo->items = NULL;
+ seginfo->action = GIN_SEGMENT_DELETE;
+ }
+ seginfo->nitems = ncleaned;
- data.node = btree->index->rd_node;
- data.blkno = BufferGetBlockNumber(buf);
- data.offset = off;
- data.nitem = 1;
- data.isDelete = FALSE;
- data.isData = TRUE;
- data.isLeaf = GinPageIsLeaf(page) ? TRUE : FALSE;
+ removedsomething = true;
+ }
+ }
/*
- * Prevent full page write if child's split occurs. That is needed to
- * remove incomplete splits while replaying WAL
+ * If we removed any items, reconstruct the page from the pieces.
*
- * data.updateBlkno contains new block number (of newly created right
- * page) for recently splited page.
+ * We don't try to re-encode the segments here, even though some of them
+ * might be really small now that we've removed some items from them. It
+ * seems like a waste of effort, as there isn't really any benefit from
+ * larger segments per se; larger segments only help to pack more items in
+ * the same space. We might as well delay doing that until the next
+ * insertion, which will need to re-encode at least part of the page
+ * anyway.
+ *
+ * Also note if the page was in uncompressed, pre-9.4 format before, it is
+ * now represented as one huge segment that contains all the items. It
+ * might make sense to split that, to speed up random access, but we don't
+ * bother. You'll have to REINDEX anyway if you want the full gain of the
+ * new tighter index format.
*/
- if (data.updateBlkno == InvalidBlockNumber)
+ if (removedsomething)
{
- rdata[0].buffer = buf;
- rdata[0].buffer_std = FALSE;
- rdata[0].data = NULL;
- rdata[0].len = 0;
- rdata[0].next = &rdata[1];
- cnt++;
+ XLogRecData *payloadrdata = NULL;
+ bool modified;
+
+ /*
+ * Make sure we have a palloc'd copy of all segments, after the first
+ * segment that is modified. (dataPlaceToPageLeafRecompress requires
+ * this).
+ */
+ modified = false;
+ dlist_foreach(iter, &leaf->segments)
+ {
+ leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
+ iter.cur);
+
+ if (seginfo->action != GIN_SEGMENT_UNMODIFIED)
+ modified = true;
+ if (modified && seginfo->action != GIN_SEGMENT_DELETE)
+ {
+ int segsize = SizeOfGinPostingList(seginfo->seg);
+ GinPostingList *tmp = (GinPostingList *) palloc(segsize);
+
+ memcpy(tmp, seginfo->seg, segsize);
+ seginfo->seg = tmp;
+ }
+ }
+
+ if (RelationNeedsWAL(indexrel))
+ payloadrdata = constructLeafRecompressWALData(buffer, leaf);
+ START_CRIT_SECTION();
+ dataPlaceToPageLeafRecompress(buffer, leaf);
+
+ MarkBufferDirty(buffer);
+
+ if (RelationNeedsWAL(indexrel))
+ {
+ XLogRecPtr recptr;
+ XLogRecData rdata;
+ ginxlogVacuumDataLeafPage xlrec;
+
+ xlrec.node = indexrel->rd_node;
+ xlrec.blkno = BufferGetBlockNumber(buffer);
+
+ rdata.buffer = InvalidBuffer;
+ rdata.data = (char *) &xlrec;
+ rdata.len = offsetof(ginxlogVacuumDataLeafPage, data);
+ rdata.next = payloadrdata;
+
+ recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_VACUUM_DATA_LEAF_PAGE, &rdata);
+ PageSetLSN(page, recptr);
+ }
+
+ END_CRIT_SECTION();
}
+}
- rdata[cnt].buffer = InvalidBuffer;
- rdata[cnt].data = (char *) &data;
- rdata[cnt].len = sizeof(ginxlogInsert);
- rdata[cnt].next = &rdata[cnt + 1];
- cnt++;
+/*
+ * Construct a ginxlogRecompressDataLeaf record representing the changes
+ * in *leaf.
+ */
+static XLogRecData *
+constructLeafRecompressWALData(Buffer buf, disassembledLeaf *leaf)
+{
+ int nmodified = 0;
+ char *walbufbegin;
+ char *walbufend;
+ XLogRecData *rdata;
+ dlist_iter iter;
+ int segno;
+ ginxlogRecompressDataLeaf *recompress_xlog;
+
+ /* Count the modified segments */
+ dlist_foreach(iter, &leaf->segments)
+ {
+ leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
+ iter.cur);
- rdata[cnt].buffer = InvalidBuffer;
- rdata[cnt].data = (GinPageIsLeaf(page)) ? ((char *) (btree->items + btree->curitem)) : ((char *) &(btree->pitem));
- rdata[cnt].len = sizeofitem;
- rdata[cnt].next = NULL;
+ if (seginfo->action != GIN_SEGMENT_UNMODIFIED)
+ nmodified++;
+ }
- if (GinPageIsLeaf(page))
+ walbufbegin = palloc(
+ sizeof(ginxlogRecompressDataLeaf) +
+ BLCKSZ + /* max size needed to hold the segment
+ * data */
+ nmodified * 2 + /* (segno + action) per action */
+ sizeof(XLogRecData));
+ walbufend = walbufbegin;
+
+ recompress_xlog = (ginxlogRecompressDataLeaf *) walbufend;
+ walbufend += sizeof(ginxlogRecompressDataLeaf);
+
+ recompress_xlog->nactions = nmodified;
+
+ segno = 0;
+ dlist_foreach(iter, &leaf->segments)
{
- if (GinPageRightMost(page) && off > GinPageGetOpaque(page)->maxoff)
+ leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
+ iter.cur);
+ int segsize = 0;
+ int datalen;
+ uint8 action = seginfo->action;
+
+ if (action == GIN_SEGMENT_UNMODIFIED)
{
- /* usually, create index... */
- uint32 savedPos = btree->curitem;
+ segno++;
+ continue;
+ }
- while (btree->curitem < btree->nitem)
- {
- GinDataPageAddItem(page, btree->items + btree->curitem, off);
- off++;
- btree->curitem++;
- }
- data.nitem = btree->curitem - savedPos;
- rdata[cnt].len = sizeofitem * data.nitem;
+ if (action != GIN_SEGMENT_DELETE)
+ segsize = SizeOfGinPostingList(seginfo->seg);
+
+ /*
+ * If storing the uncompressed list of added item pointers would take
+ * more space than storing the compressed segment as is, do that
+ * instead.
+ */
+ if (action == GIN_SEGMENT_ADDITEMS &&
+ seginfo->nmodifieditems * sizeof(ItemPointerData) > segsize)
+ {
+ action = GIN_SEGMENT_REPLACE;
}
- else
+
+ *((uint8 *) (walbufend++)) = segno;
+ *(walbufend++) = action;
+
+ switch (action)
{
- GinDataPageAddItem(page, btree->items + btree->curitem, off);
- btree->curitem++;
+ case GIN_SEGMENT_DELETE:
+ datalen = 0;
+ break;
+
+ case GIN_SEGMENT_ADDITEMS:
+ datalen = seginfo->nmodifieditems * sizeof(ItemPointerData);
+ memcpy(walbufend, &seginfo->nmodifieditems, sizeof(uint16));
+ memcpy(walbufend + sizeof(uint16), seginfo->modifieditems, datalen);
+ datalen += sizeof(uint16);
+ break;
+
+ case GIN_SEGMENT_INSERT:
+ case GIN_SEGMENT_REPLACE:
+ datalen = SHORTALIGN(segsize);
+ memcpy(walbufend, seginfo->seg, segsize);
+ break;
+
+ default:
+ elog(ERROR, "unexpected GIN leaf action %d", action);
}
+ walbufend += datalen;
+
+ if (action != GIN_SEGMENT_INSERT)
+ segno++;
}
- else
- GinDataPageAddItem(page, &(btree->pitem), off);
+
+ rdata = (XLogRecData *) MAXALIGN(walbufend);
+ rdata->buffer = buf;
+ rdata->buffer_std = TRUE;
+ rdata->data = walbufbegin;
+ rdata->len = walbufend - walbufbegin;
+ rdata->next = NULL;
+
+ return rdata;
}
/*
- * split page and fills WAL record. original buffer(lbuf) leaves untouched,
- * returns shadow page of lbuf filled new data. In leaf page and build mode puts all
- * ItemPointers to pages. Also, in build mode splits data by way to full fulled
- * left page
+ * Assemble a disassembled posting tree leaf page back to a buffer.
+ *
+ * *prdata is filled with WAL information about this operation. The caller
+ * is responsible for inserting to the WAL, along with any other information
+ * about the operation that triggered this recompression.
+ *
+ * NOTE: The segment pointers must not point directly to the same buffer,
+ * except for segments that have not been modified and whose preceding
+ * segments have not been modified either.
*/
-static Page
-dataSplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogRecData **prdata)
+static void
+dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf)
{
+ Page page = BufferGetPage(buf);
char *ptr;
- OffsetNumber separator;
- ItemPointer bound;
- Page lpage = PageGetTempPageCopy(BufferGetPage(lbuf));
- ItemPointerData oldbound = *GinDataPageGetRightBound(lpage);
- int sizeofitem = GinSizeOfDataPageItem(lpage);
- OffsetNumber maxoff = GinPageGetOpaque(lpage)->maxoff;
- Page rpage = BufferGetPage(rbuf);
- Size pageSize = PageGetPageSize(lpage);
- Size freeSpace;
- uint32 nCopied = 1;
+ int newsize;
+ bool modified = false;
+ dlist_iter iter;
+ int segsize;
- /* these must be static so they can be returned to caller */
- static ginxlogSplit data;
- static XLogRecData rdata[4];
- static char vector[2 * BLCKSZ];
+ /*
+ * If the page was in pre-9.4 format before, convert the header, and force
+ * all segments to be copied to the page whether they were modified or
+ * not.
+ */
+ if (!GinPageIsCompressed(page))
+ {
+ Assert(leaf->oldformat);
+ GinPageSetCompressed(page);
+ GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber;
+ modified = true;
+ }
+
+ ptr = (char *) GinDataLeafPageGetPostingList(page);
+ newsize = 0;
+ dlist_foreach(iter, &leaf->segments)
+ {
+ leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur);
- GinInitPage(rpage, GinPageGetOpaque(lpage)->flags, pageSize);
- freeSpace = GinDataPageGetFreeSpace(rpage);
+ if (seginfo->action != GIN_SEGMENT_UNMODIFIED)
+ modified = true;
- *prdata = rdata;
- data.leftChildBlkno = (GinPageIsLeaf(lpage)) ?
- InvalidOffsetNumber : PostingItemGetBlockNumber(&(btree->pitem));
- data.updateBlkno = dataPrepareData(btree, lpage, off);
+ if (seginfo->action != GIN_SEGMENT_DELETE)
+ {
+ segsize = SizeOfGinPostingList(seginfo->seg);
+
+ if (modified)
+ memcpy(ptr, seginfo->seg, segsize);
- memcpy(vector, GinDataPageGetItem(lpage, FirstOffsetNumber),
- maxoff * sizeofitem);
+ ptr += segsize;
+ newsize += segsize;
+ }
+ }
+
+ Assert(newsize <= GinDataPageMaxDataSize);
+ GinDataPageSetDataSize(page, newsize);
+}
- if (GinPageIsLeaf(lpage) && GinPageRightMost(lpage) && off > GinPageGetOpaque(lpage)->maxoff)
+/*
+ * Like dataPlaceToPageLeafRecompress, but writes the disassembled leaf
+ * segments to two pages instead of one.
+ *
+ * This is different from the non-split cases in that this does not modify
+ * the original page directly, but to temporary in-memory copies of the new
+ * left and right pages.
+ */
+static void
+dataPlaceToPageLeafSplit(Buffer buf, disassembledLeaf *leaf,
+ ItemPointerData lbound, ItemPointerData rbound,
+ XLogRecData **prdata, Page lpage, Page rpage)
+{
+ char *ptr;
+ int segsize;
+ int lsize;
+ int rsize;
+ dlist_node *node;
+ dlist_node *firstright;
+ leafSegmentInfo *seginfo;
+
+ /* these must be static so they can be returned to caller */
+ static ginxlogSplitDataLeaf split_xlog;
+ static XLogRecData rdata[3];
+
+ /* Initialize temporary pages to hold the new left and right pages */
+ GinInitPage(lpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ);
+ GinInitPage(rpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ);
+
+ /*
+ * Copy the segments that go to the left page.
+ *
+ * XXX: We should skip copying the unmodified part of the left page, like
+ * we do when recompressing.
+ */
+ lsize = 0;
+ ptr = (char *) GinDataLeafPageGetPostingList(lpage);
+ firstright = dlist_next_node(&leaf->segments, leaf->lastleft);
+ for (node = dlist_head_node(&leaf->segments);
+ node != firstright;
+ node = dlist_next_node(&leaf->segments, node))
{
- nCopied = 0;
- while (btree->curitem < btree->nitem &&
- maxoff * sizeof(ItemPointerData) < 2 * (freeSpace - sizeof(ItemPointerData)))
+ seginfo = dlist_container(leafSegmentInfo, node, node);
+
+ if (seginfo->action != GIN_SEGMENT_DELETE)
{
- memcpy(vector + maxoff * sizeof(ItemPointerData),
- btree->items + btree->curitem,
- sizeof(ItemPointerData));
- maxoff++;
- nCopied++;
- btree->curitem++;
+ segsize = SizeOfGinPostingList(seginfo->seg);
+ memcpy(ptr, seginfo->seg, segsize);
+ ptr += segsize;
+ lsize += segsize;
}
}
- else
+ Assert(lsize == leaf->lsize);
+ GinDataPageSetDataSize(lpage, lsize);
+ *GinDataPageGetRightBound(lpage) = lbound;
+
+ /* Copy the segments that go to the right page */
+ ptr = (char *) GinDataLeafPageGetPostingList(rpage);
+ rsize = 0;
+ for (node = firstright;
+ ;
+ node = dlist_next_node(&leaf->segments, node))
{
- ptr = vector + (off - 1) * sizeofitem;
- if (maxoff + 1 - off != 0)
- memmove(ptr + sizeofitem, ptr, (maxoff - off + 1) * sizeofitem);
- if (GinPageIsLeaf(lpage))
+ seginfo = dlist_container(leafSegmentInfo, node, node);
+
+ if (seginfo->action != GIN_SEGMENT_DELETE)
{
- memcpy(ptr, btree->items + btree->curitem, sizeofitem);
- btree->curitem++;
+ segsize = SizeOfGinPostingList(seginfo->seg);
+ memcpy(ptr, seginfo->seg, segsize);
+ ptr += segsize;
+ rsize += segsize;
}
- else
- memcpy(ptr, &(btree->pitem), sizeofitem);
- maxoff++;
+ if (!dlist_has_next(&leaf->segments, node))
+ break;
+ }
+ Assert(rsize == leaf->rsize);
+ GinDataPageSetDataSize(rpage, rsize);
+ *GinDataPageGetRightBound(rpage) = rbound;
+
+ /* Create WAL record */
+ split_xlog.lsize = lsize;
+ split_xlog.rsize = rsize;
+ split_xlog.lrightbound = lbound;
+ split_xlog.rrightbound = rbound;
+
+ rdata[0].buffer = InvalidBuffer;
+ rdata[0].data = (char *) &split_xlog;
+ rdata[0].len = sizeof(ginxlogSplitDataLeaf);
+ rdata[0].next = &rdata[1];
+
+ rdata[1].buffer = InvalidBuffer;
+ rdata[1].data = (char *) GinDataLeafPageGetPostingList(lpage);
+ rdata[1].len = lsize;
+ rdata[1].next = &rdata[2];
+
+ rdata[2].buffer = InvalidBuffer;
+ rdata[2].data = (char *) GinDataLeafPageGetPostingList(rpage);
+ rdata[2].len = rsize;
+ rdata[2].next = NULL;
+
+ *prdata = rdata;
+}
+
+/*
+ * Place a PostingItem to page, and fill a WAL record.
+ *
+ * If the item doesn't fit, returns false without modifying the page.
+ *
+ * In addition to inserting the given item, the downlink of the existing item
+ * at 'off' is updated to point to 'updateblkno'.
+ */
+static GinPlaceToPageRC
+dataPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack,
+ void *insertdata, BlockNumber updateblkno,
+ XLogRecData **prdata, Page *newlpage, Page *newrpage)
+{
+ Page page = BufferGetPage(buf);
+ OffsetNumber off = stack->off;
+ PostingItem *pitem;
+
+ /* these must be static so they can be returned to caller */
+ static XLogRecData rdata;
+ static ginxlogInsertDataInternal data;
+
+ /* split if we have to */
+ if (GinNonLeafDataPageGetFreeSpace(page) < sizeof(PostingItem))
+ {
+ dataSplitPageInternal(btree, buf, stack, insertdata, updateblkno,
+ prdata, newlpage, newrpage);
+ return SPLIT;
}
+ *prdata = &rdata;
+ Assert(GinPageIsData(page));
+
+ START_CRIT_SECTION();
+
+ /* Update existing downlink to point to next page (on internal page) */
+ pitem = GinDataPageGetPostingItem(page, off);
+ PostingItemSetBlockNumber(pitem, updateblkno);
+
+ /* Add new item */
+ pitem = (PostingItem *) insertdata;
+ GinDataPageAddPostingItem(page, pitem, off);
+
+ data.offset = off;
+ data.newitem = *pitem;
+
+ rdata.buffer = buf;
+ rdata.buffer_std = TRUE;
+ rdata.data = (char *) &data;
+ rdata.len = sizeof(ginxlogInsertDataInternal);
+ rdata.next = NULL;
+
+ return INSERTED;
+}
+
+/*
+ * Places an item (or items) to a posting tree. Calls relevant function of
+ * internal of leaf page because they are handled very differently.
+ */
+static GinPlaceToPageRC
+dataPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack,
+ void *insertdata, BlockNumber updateblkno,
+ XLogRecData **prdata,
+ Page *newlpage, Page *newrpage)
+{
+ Page page = BufferGetPage(buf);
+
+ Assert(GinPageIsData(page));
+
+ if (GinPageIsLeaf(page))
+ return dataPlaceToPageLeaf(btree, buf, stack, insertdata,
+ prdata, newlpage, newrpage);
+ else
+ return dataPlaceToPageInternal(btree, buf, stack,
+ insertdata, updateblkno,
+ prdata, newlpage, newrpage);
+}
+
+/*
+ * Split page and fill WAL record. Returns a new temp buffer filled with data
+ * that should go to the left page. The original buffer is left untouched.
+ */
+static void
+dataSplitPageInternal(GinBtree btree, Buffer origbuf,
+ GinBtreeStack *stack,
+ void *insertdata, BlockNumber updateblkno,
+ XLogRecData **prdata, Page *newlpage, Page *newrpage)
+{
+ Page oldpage = BufferGetPage(origbuf);
+ OffsetNumber off = stack->off;
+ int nitems = GinPageGetOpaque(oldpage)->maxoff;
+ int nleftitems;
+ int nrightitems;
+ Size pageSize = PageGetPageSize(oldpage);
+ ItemPointerData oldbound = *GinDataPageGetRightBound(oldpage);
+ ItemPointer bound;
+ Page lpage;
+ Page rpage;
+ OffsetNumber separator;
+
+ /* these must be static so they can be returned to caller */
+ static ginxlogSplitDataInternal data;
+ static XLogRecData rdata[4];
+ static PostingItem allitems[(BLCKSZ / sizeof(PostingItem)) + 1];
+
+ lpage = PageGetTempPage(oldpage);
+ rpage = PageGetTempPage(oldpage);
+ GinInitPage(lpage, GinPageGetOpaque(oldpage)->flags, pageSize);
+ GinInitPage(rpage, GinPageGetOpaque(oldpage)->flags, pageSize);
+
+ *prdata = rdata;
+
/*
- * we suppose that during index creation table scaned from begin to end,
- * so ItemPointers are monotonically increased..
+ * First construct a new list of PostingItems, which includes all the old
+ * items, and the new item.
*/
- if (btree->isBuild && GinPageRightMost(lpage))
- separator = freeSpace / sizeofitem;
- else
- separator = maxoff / 2;
+ memcpy(allitems, GinDataPageGetPostingItem(oldpage, FirstOffsetNumber),
+ (off - 1) * sizeof(PostingItem));
- GinInitPage(rpage, GinPageGetOpaque(lpage)->flags, pageSize);
- GinInitPage(lpage, GinPageGetOpaque(rpage)->flags, pageSize);
+ allitems[off - 1] = *((PostingItem *) insertdata);
+ memcpy(&allitems[off], GinDataPageGetPostingItem(oldpage, off),
+ (nitems - (off - 1)) * sizeof(PostingItem));
+ nitems++;
- memcpy(GinDataPageGetItem(lpage, FirstOffsetNumber), vector, separator * sizeofitem);
- GinPageGetOpaque(lpage)->maxoff = separator;
- memcpy(GinDataPageGetItem(rpage, FirstOffsetNumber),
- vector + separator * sizeofitem, (maxoff - separator) * sizeofitem);
- GinPageGetOpaque(rpage)->maxoff = maxoff - separator;
+ /* Update existing downlink to point to next page */
+ PostingItemSetBlockNumber(&allitems[off], updateblkno);
- PostingItemSetBlockNumber(&(btree->pitem), BufferGetBlockNumber(lbuf));
- if (GinPageIsLeaf(lpage))
- btree->pitem.key = *(ItemPointerData *) GinDataPageGetItem(lpage,
- GinPageGetOpaque(lpage)->maxoff);
+ /*
+ * When creating a new index, fit as many tuples as possible on the left
+ * page, on the assumption that the table is scanned from beginning to
+ * end. This packs the index as tight as possible.
+ */
+ if (btree->isBuild && GinPageRightMost(oldpage))
+ separator = GinNonLeafDataPageGetFreeSpace(rpage) / sizeof(PostingItem);
else
- btree->pitem.key = ((PostingItem *) GinDataPageGetItem(lpage,
- GinPageGetOpaque(lpage)->maxoff))->key;
- btree->rightblkno = BufferGetBlockNumber(rbuf);
+ separator = nitems / 2;
+ nleftitems = separator;
+ nrightitems = nitems - separator;
+
+ memcpy(GinDataPageGetPostingItem(lpage, FirstOffsetNumber),
+ allitems,
+ nleftitems * sizeof(PostingItem));
+ GinPageGetOpaque(lpage)->maxoff = nleftitems;
+ memcpy(GinDataPageGetPostingItem(rpage, FirstOffsetNumber),
+ &allitems[separator],
+ nrightitems * sizeof(PostingItem));
+ GinPageGetOpaque(rpage)->maxoff = nrightitems;
+
+ /*
+ * Also set pd_lower for both pages, like GinDataPageAddPostingItem does.
+ */
+ GinDataPageSetDataSize(lpage, nleftitems * sizeof(PostingItem));
+ GinDataPageSetDataSize(rpage, nrightitems * sizeof(PostingItem));
/* set up right bound for left page */
bound = GinDataPageGetRightBound(lpage);
- *bound = btree->pitem.key;
+ *bound = GinDataPageGetPostingItem(lpage, nleftitems)->key;
/* set up right bound for right page */
- bound = GinDataPageGetRightBound(rpage);
- *bound = oldbound;
+ *GinDataPageGetRightBound(rpage) = oldbound;
- data.node = btree->index->rd_node;
- data.rootBlkno = InvalidBlockNumber;
- data.lblkno = BufferGetBlockNumber(lbuf);
- data.rblkno = BufferGetBlockNumber(rbuf);
data.separator = separator;
- data.nitem = maxoff;
- data.isData = TRUE;
- data.isLeaf = GinPageIsLeaf(lpage) ? TRUE : FALSE;
- data.isRootSplit = FALSE;
+ data.nitem = nitems;
data.rightbound = oldbound;
rdata[0].buffer = InvalidBuffer;
rdata[0].data = (char *) &data;
- rdata[0].len = sizeof(ginxlogSplit);
+ rdata[0].len = sizeof(ginxlogSplitDataInternal);
rdata[0].next = &rdata[1];
rdata[1].buffer = InvalidBuffer;
- rdata[1].data = vector;
- rdata[1].len = MAXALIGN(maxoff * sizeofitem);
+ rdata[1].data = (char *) allitems;
+ rdata[1].len = nitems * sizeof(PostingItem);
rdata[1].next = NULL;
- return lpage;
+ *newlpage = lpage;
+ *newrpage = rpage;
+}
+
+/*
+ * Construct insertion payload for inserting the downlink for given buffer.
+ */
+static void *
+dataPrepareDownlink(GinBtree btree, Buffer lbuf)
+{
+ PostingItem *pitem = palloc(sizeof(PostingItem));
+ Page lpage = BufferGetPage(lbuf);
+
+ PostingItemSetBlockNumber(pitem, BufferGetBlockNumber(lbuf));
+ pitem->key = *GinDataPageGetRightBound(lpage);
+
+ return pitem;
}
/*
@@ -585,102 +1306,596 @@ dataSplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogRe
* Also called from ginxlog, should not use btree
*/
void
-ginDataFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf)
+ginDataFillRoot(GinBtree btree, Page root, BlockNumber lblkno, Page lpage, BlockNumber rblkno, Page rpage)
{
- Page page = BufferGetPage(root),
- lpage = BufferGetPage(lbuf),
- rpage = BufferGetPage(rbuf);
PostingItem li,
ri;
li.key = *GinDataPageGetRightBound(lpage);
- PostingItemSetBlockNumber(&li, BufferGetBlockNumber(lbuf));
- GinDataPageAddItem(page, &li, InvalidOffsetNumber);
+ PostingItemSetBlockNumber(&li, lblkno);
+ GinDataPageAddPostingItem(root, &li, InvalidOffsetNumber);
ri.key = *GinDataPageGetRightBound(rpage);
- PostingItemSetBlockNumber(&ri, BufferGetBlockNumber(rbuf));
- GinDataPageAddItem(page, &ri, InvalidOffsetNumber);
+ PostingItemSetBlockNumber(&ri, rblkno);
+ GinDataPageAddPostingItem(root, &ri, InvalidOffsetNumber);
+}
+
+
+/*** Functions to work with disassembled leaf pages ***/
+
+/*
+ * Disassemble page into a disassembledLeaf struct.
+ */
+static disassembledLeaf *
+disassembleLeaf(Page page)
+{
+ disassembledLeaf *leaf;
+ GinPostingList *seg;
+ Pointer segbegin;
+ Pointer segend;
+
+ leaf = palloc0(sizeof(disassembledLeaf));
+ dlist_init(&leaf->segments);
+
+ if (GinPageIsCompressed(page))
+ {
+ /*
+ * Create a leafSegment entry for each segment.
+ */
+ seg = GinDataLeafPageGetPostingList(page);
+ segbegin = (Pointer) seg;
+ segend = segbegin + GinDataLeafPageGetPostingListSize(page);
+ while ((Pointer) seg < segend)
+ {
+ leafSegmentInfo *seginfo = palloc(sizeof(leafSegmentInfo));
+
+ seginfo->action = GIN_SEGMENT_UNMODIFIED;
+ seginfo->seg = seg;
+ seginfo->items = NULL;
+ seginfo->nitems = 0;
+ dlist_push_tail(&leaf->segments, &seginfo->node);
+
+ seg = GinNextPostingListSegment(seg);
+ }
+ leaf->oldformat = false;
+ }
+ else
+ {
+ /*
+ * A pre-9.4 format uncompressed page is represented by a single
+ * segment, with an array of items.
+ */
+ ItemPointer uncompressed;
+ int nuncompressed;
+ leafSegmentInfo *seginfo;
+
+ uncompressed = dataLeafPageGetUncompressed(page, &nuncompressed);
+
+ seginfo = palloc(sizeof(leafSegmentInfo));
+
+ seginfo->action = GIN_SEGMENT_REPLACE;
+ seginfo->seg = NULL;
+ seginfo->items = palloc(nuncompressed * sizeof(ItemPointerData));
+ memcpy(seginfo->items, uncompressed, nuncompressed * sizeof(ItemPointerData));
+ seginfo->nitems = nuncompressed;
+
+ dlist_push_tail(&leaf->segments, &seginfo->node);
+
+ leaf->oldformat = true;
+ }
+
+ return leaf;
+}
+
+/*
+ * Distribute newItems to the segments.
+ *
+ * Any segments that acquire new items are decoded, and the new items are
+ * merged with the old items.
+ *
+ * Returns true if any new items were added. False means they were all
+ * duplicates of existing items on the page.
+ */
+static bool
+addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems)
+{
+ dlist_iter iter;
+ ItemPointer nextnew = newItems;
+ int newleft = nNewItems;
+ bool modified = false;
+ leafSegmentInfo *newseg;
+
+ /*
+ * If the page is completely empty, just construct one new segment to hold
+ * all the new items.
+ */
+ if (dlist_is_empty(&leaf->segments))
+ {
+ newseg = palloc(sizeof(leafSegmentInfo));
+ newseg->seg = NULL;
+ newseg->items = newItems;
+ newseg->nitems = nNewItems;
+ newseg->action = GIN_SEGMENT_INSERT;
+ dlist_push_tail(&leaf->segments, &newseg->node);
+ return true;
+ }
+
+ dlist_foreach(iter, &leaf->segments)
+ {
+ leafSegmentInfo *cur = (leafSegmentInfo *) dlist_container(leafSegmentInfo, node, iter.cur);
+ int nthis;
+ ItemPointer tmpitems;
+ int ntmpitems;
+
+ /*
+ * How many of the new items fall into this segment?
+ */
+ if (!dlist_has_next(&leaf->segments, iter.cur))
+ nthis = newleft;
+ else
+ {
+ leafSegmentInfo *next;
+ ItemPointerData next_first;
+
+ next = (leafSegmentInfo *) dlist_container(leafSegmentInfo, node,
+ dlist_next_node(&leaf->segments, iter.cur));
+ if (next->items)
+ next_first = next->items[0];
+ else
+ {
+ Assert(next->seg != NULL);
+ next_first = next->seg->first;
+ }
+
+ nthis = 0;
+ while (nthis < newleft && ginCompareItemPointers(&nextnew[nthis], &next_first) < 0)
+ nthis++;
+ }
+ if (nthis == 0)
+ continue;
+
+ /* Merge the new items with the existing items. */
+ if (!cur->items)
+ cur->items = ginPostingListDecode(cur->seg, &cur->nitems);
+
+ /*
+ * Fast path for the important special case that we're appending to
+ * the end of the page: don't let the last segment on the page grow
+ * larger than the target, create a new segment before that happens.
+ */
+ if (!dlist_has_next(&leaf->segments, iter.cur) &&
+ ginCompareItemPointers(&cur->items[cur->nitems - 1], &nextnew[0]) < 0 &&
+ cur->seg != NULL &&
+ SizeOfGinPostingList(cur->seg) >= GinPostingListSegmentTargetSize)
+ {
+ newseg = palloc(sizeof(leafSegmentInfo));
+ newseg->seg = NULL;
+ newseg->items = nextnew;
+ newseg->nitems = nthis;
+ newseg->action = GIN_SEGMENT_INSERT;
+ dlist_push_tail(&leaf->segments, &newseg->node);
+ modified = true;
+ break;
+ }
+
+ tmpitems = ginMergeItemPointers(cur->items, cur->nitems,
+ nextnew, nthis,
+ &ntmpitems);
+ if (ntmpitems != cur->nitems)
+ {
+ /*
+ * If there are no duplicates, track the added items so that we
+ * can emit a compact ADDITEMS WAL record later on. (it doesn't
+ * seem worth re-checking which items were duplicates, if there
+ * were any)
+ */
+ if (ntmpitems == nthis + cur->nitems &&
+ cur->action == GIN_SEGMENT_UNMODIFIED)
+ {
+ cur->action = GIN_SEGMENT_ADDITEMS;
+ cur->modifieditems = nextnew;
+ cur->nmodifieditems = nthis;
+ }
+ else
+ cur->action = GIN_SEGMENT_REPLACE;
+
+ cur->items = tmpitems;
+ cur->nitems = ntmpitems;
+ cur->seg = NULL;
+ modified = true;
+ }
+
+ nextnew += nthis;
+ newleft -= nthis;
+ if (newleft == 0)
+ break;
+ }
+
+ return modified;
+}
+
+/*
+ * Recompresses all segments that have been modified.
+ *
+ * If not all the items fit on two pages (ie. after split), we store as
+ * many items as fit, and set *remaining to the first item that didn't fit.
+ * If all items fit, *remaining is set to invalid.
+ *
+ * Returns true if the page has to be split.
+ */
+static bool
+leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining)
+{
+ int pgused = 0;
+ bool needsplit = false;
+ dlist_iter iter;
+ int segsize;
+ leafSegmentInfo *nextseg;
+ int npacked;
+ bool modified;
+ dlist_node *cur_node;
+ dlist_node *next_node;
+
+ ItemPointerSetInvalid(remaining);
+
+ /*
+ * cannot use dlist_foreach_modify here because we insert adjacent items
+ * while iterating.
+ */
+ for (cur_node = dlist_head_node(&leaf->segments);
+ cur_node != NULL;
+ cur_node = next_node)
+ {
+ leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
+ cur_node);
+
+ if (dlist_has_next(&leaf->segments, cur_node))
+ next_node = dlist_next_node(&leaf->segments, cur_node);
+ else
+ next_node = NULL;
+
+ /* Compress the posting list, if necessary */
+ if (seginfo->action != GIN_SEGMENT_DELETE)
+ {
+ if (seginfo->seg == NULL)
+ {
+ if (seginfo->nitems > GinPostingListSegmentMaxSize)
+ npacked = 0; /* no chance that it would fit. */
+ else
+ {
+ seginfo->seg = ginCompressPostingList(seginfo->items,
+ seginfo->nitems,
+ GinPostingListSegmentMaxSize,
+ &npacked);
+ }
+ if (npacked != seginfo->nitems)
+ {
+ /*
+ * Too large. Compress again to the target size, and
+ * create a new segment to represent the remaining items.
+ * The new segment is inserted after this one, so it will
+ * be processed in the next iteration of this loop.
+ */
+ if (seginfo->seg)
+ pfree(seginfo->seg);
+ seginfo->seg = ginCompressPostingList(seginfo->items,
+ seginfo->nitems,
+ GinPostingListSegmentTargetSize,
+ &npacked);
+ if (seginfo->action != GIN_SEGMENT_INSERT)
+ seginfo->action = GIN_SEGMENT_REPLACE;
+
+ nextseg = palloc(sizeof(leafSegmentInfo));
+ nextseg->action = GIN_SEGMENT_INSERT;
+ nextseg->seg = NULL;
+ nextseg->items = &seginfo->items[npacked];
+ nextseg->nitems = seginfo->nitems - npacked;
+ next_node = &nextseg->node;
+ dlist_insert_after(cur_node, next_node);
+ }
+ }
+
+ /*
+ * If the segment is very small, merge it with the next segment.
+ */
+ if (SizeOfGinPostingList(seginfo->seg) < GinPostingListSegmentMinSize && next_node)
+ {
+ int nmerged;
+
+ nextseg = dlist_container(leafSegmentInfo, node, next_node);
+
+ if (seginfo->items == NULL)
+ seginfo->items = ginPostingListDecode(seginfo->seg,
+ &seginfo->nitems);
+ if (nextseg->items == NULL)
+ nextseg->items = ginPostingListDecode(nextseg->seg,
+ &nextseg->nitems);
+ nextseg->items =
+ ginMergeItemPointers(seginfo->items, seginfo->nitems,
+ nextseg->items, nextseg->nitems,
+ &nmerged);
+ Assert(nmerged == seginfo->nitems + nextseg->nitems);
+ nextseg->nitems = nmerged;
+ nextseg->seg = NULL;
+
+ nextseg->action = GIN_SEGMENT_REPLACE;
+ nextseg->modifieditems = NULL;
+ nextseg->nmodifieditems = 0;
+
+ if (seginfo->action == GIN_SEGMENT_INSERT)
+ {
+ dlist_delete(cur_node);
+ continue;
+ }
+ else
+ {
+ seginfo->action = GIN_SEGMENT_DELETE;
+ seginfo->seg = NULL;
+ }
+ }
+
+ seginfo->items = NULL;
+ seginfo->nitems = 0;
+ }
+
+ if (seginfo->action == GIN_SEGMENT_DELETE)
+ continue;
+
+ /*
+ * OK, we now have a compressed version of this segment ready for
+ * copying to the page. Did we exceed the size that fits on one page?
+ */
+ segsize = SizeOfGinPostingList(seginfo->seg);
+ if (pgused + segsize > GinDataPageMaxDataSize)
+ {
+ if (!needsplit)
+ {
+ /* switch to right page */
+ Assert(pgused > 0);
+ leaf->lastleft = dlist_prev_node(&leaf->segments, cur_node);
+ needsplit = true;
+ leaf->lsize = pgused;
+ pgused = 0;
+ }
+ else
+ {
+ /*
+ * Filled both pages. The last segment we constructed did not
+ * fit.
+ */
+ *remaining = seginfo->seg->first;
+
+ /*
+ * remove all segments that did not fit from the list.
+ */
+ while (dlist_has_next(&leaf->segments, cur_node))
+ dlist_delete(dlist_next_node(&leaf->segments, cur_node));
+ dlist_delete(cur_node);
+ break;
+ }
+ }
+
+ pgused += segsize;
+ }
+
+ if (!needsplit)
+ {
+ leaf->lsize = pgused;
+ leaf->rsize = 0;
+ }
+ else
+ leaf->rsize = pgused;
+
+ Assert(leaf->lsize <= GinDataPageMaxDataSize);
+ Assert(leaf->rsize <= GinDataPageMaxDataSize);
+
+ /*
+ * Make a palloc'd copy of every segment after the first modified one,
+ * because as we start copying items to the original page, we might
+ * overwrite an existing segment.
+ */
+ modified = false;
+ dlist_foreach(iter, &leaf->segments)
+ {
+ leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
+ iter.cur);
+
+ if (!modified && seginfo->action != GIN_SEGMENT_UNMODIFIED)
+ {
+ modified = true;
+ }
+ else if (modified && seginfo->action == GIN_SEGMENT_UNMODIFIED)
+ {
+ GinPostingList *tmp;
+
+ segsize = SizeOfGinPostingList(seginfo->seg);
+ tmp = palloc(segsize);
+ memcpy(tmp, seginfo->seg, segsize);
+ seginfo->seg = tmp;
+ }
+ }
+
+ return needsplit;
+}
+
+
+/*** Functions that are exported to the rest of the GIN code ***/
+
+/*
+ * Creates new posting tree containing the given TIDs. Returns the page
+ * number of the root of the new posting tree.
+ *
+ * items[] must be in sorted order with no duplicates.
+ */
+BlockNumber
+createPostingTree(Relation index, ItemPointerData *items, uint32 nitems,
+ GinStatsData *buildStats)
+{
+ BlockNumber blkno;
+ Buffer buffer;
+ Page tmppage;
+ Page page;
+ Pointer ptr;
+ int nrootitems;
+ int rootsize;
+
+ /* Construct the new root page in memory first. */
+ tmppage = (Page) palloc(BLCKSZ);
+ GinInitPage(tmppage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ);
+ GinPageGetOpaque(tmppage)->rightlink = InvalidBlockNumber;
+
+ /*
+ * Write as many of the items to the root page as fit. In segments of max
+ * GinPostingListSegmentMaxSize bytes each.
+ */
+ nrootitems = 0;
+ rootsize = 0;
+ ptr = (Pointer) GinDataLeafPageGetPostingList(tmppage);
+ while (nrootitems < nitems)
+ {
+ GinPostingList *segment;
+ int npacked;
+ int segsize;
+
+ segment = ginCompressPostingList(&items[nrootitems],
+ nitems - nrootitems,
+ GinPostingListSegmentMaxSize,
+ &npacked);
+ segsize = SizeOfGinPostingList(segment);
+ if (rootsize + segsize > GinDataPageMaxDataSize)
+ break;
+
+ memcpy(ptr, segment, segsize);
+ ptr += segsize;
+ rootsize += segsize;
+ nrootitems += npacked;
+ pfree(segment);
+ }
+ GinDataPageSetDataSize(tmppage, rootsize);
+
+ /*
+ * All set. Get a new physical page, and copy the in-memory page to it.
+ */
+ buffer = GinNewBuffer(index);
+ page = BufferGetPage(buffer);
+ blkno = BufferGetBlockNumber(buffer);
+
+ START_CRIT_SECTION();
+
+ PageRestoreTempPage(tmppage, page);
+ MarkBufferDirty(buffer);
+
+ if (RelationNeedsWAL(index))
+ {
+ XLogRecPtr recptr;
+ XLogRecData rdata[2];
+ ginxlogCreatePostingTree data;
+
+ data.node = index->rd_node;
+ data.blkno = blkno;
+ data.size = rootsize;
+
+ rdata[0].buffer = InvalidBuffer;
+ rdata[0].data = (char *) &data;
+ rdata[0].len = sizeof(ginxlogCreatePostingTree);
+ rdata[0].next = &rdata[1];
+
+ rdata[1].buffer = InvalidBuffer;
+ rdata[1].data = (char *) GinDataLeafPageGetPostingList(page);
+ rdata[1].len = rootsize;
+ rdata[1].next = NULL;
+
+ recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_PTREE, rdata);
+ PageSetLSN(page, recptr);
+ }
+
+ UnlockReleaseBuffer(buffer);
+
+ END_CRIT_SECTION();
+
+ /* During index build, count the newly-added data page */
+ if (buildStats)
+ buildStats->nDataPages++;
+
+ elog(DEBUG2, "created GIN posting tree with %d items", nrootitems);
+
+ /*
+ * Add any remaining TIDs to the newly-created posting tree.
+ */
+ if (nitems > nrootitems)
+ {
+ ginInsertItemPointers(index, blkno,
+ items + nrootitems,
+ nitems - nrootitems,
+ buildStats);
+ }
+
+ return blkno;
}
void
-ginPrepareDataScan(GinBtree btree, Relation index)
+ginPrepareDataScan(GinBtree btree, Relation index, BlockNumber rootBlkno)
{
memset(btree, 0, sizeof(GinBtreeData));
btree->index = index;
+ btree->rootBlkno = rootBlkno;
btree->findChildPage = dataLocateItem;
+ btree->getLeftMostChild = dataGetLeftMostPage;
btree->isMoveRight = dataIsMoveRight;
- btree->findItem = dataLocateLeafItem;
+ btree->findItem = NULL;
btree->findChildPtr = dataFindChildPtr;
- btree->getLeftMostPage = dataGetLeftMostPage;
- btree->isEnoughSpace = dataIsEnoughSpace;
btree->placeToPage = dataPlaceToPage;
- btree->splitPage = dataSplitPage;
btree->fillRoot = ginDataFillRoot;
+ btree->prepareDownlink = dataPrepareDownlink;
btree->isData = TRUE;
- btree->searchMode = FALSE;
- btree->isDelete = FALSE;
btree->fullScan = FALSE;
btree->isBuild = FALSE;
}
-GinPostingTreeScan *
-ginPrepareScanPostingTree(Relation index, BlockNumber rootBlkno, bool searchMode)
-{
- GinPostingTreeScan *gdi = (GinPostingTreeScan *) palloc0(sizeof(GinPostingTreeScan));
-
- ginPrepareDataScan(&gdi->btree, index);
-
- gdi->btree.searchMode = searchMode;
- gdi->btree.fullScan = searchMode;
-
- gdi->stack = ginPrepareFindLeafPage(&gdi->btree, rootBlkno);
-
- return gdi;
-}
-
/*
* Inserts array of item pointers, may execute several tree scan (very rare)
*/
void
-ginInsertItemPointers(GinPostingTreeScan *gdi,
+ginInsertItemPointers(Relation index, BlockNumber rootBlkno,
ItemPointerData *items, uint32 nitem,
GinStatsData *buildStats)
{
- BlockNumber rootBlkno = gdi->stack->blkno;
+ GinBtreeData btree;
+ GinBtreeDataLeafInsertData insertdata;
+ GinBtreeStack *stack;
- gdi->btree.items = items;
- gdi->btree.nitem = nitem;
- gdi->btree.curitem = 0;
+ ginPrepareDataScan(&btree, index, rootBlkno);
+ btree.isBuild = (buildStats != NULL);
+ insertdata.items = items;
+ insertdata.nitem = nitem;
+ insertdata.curitem = 0;
- while (gdi->btree.curitem < gdi->btree.nitem)
+ while (insertdata.curitem < insertdata.nitem)
{
- if (!gdi->stack)
- gdi->stack = ginPrepareFindLeafPage(&gdi->btree, rootBlkno);
-
- gdi->stack = ginFindLeafPage(&gdi->btree, gdi->stack);
-
- if (gdi->btree.findItem(&(gdi->btree), gdi->stack))
- {
- /*
- * gdi->btree.items[gdi->btree.curitem] already exists in index
- */
- gdi->btree.curitem++;
- LockBuffer(gdi->stack->buffer, GIN_UNLOCK);
- freeGinBtreeStack(gdi->stack);
- }
- else
- ginInsertValue(&(gdi->btree), gdi->stack, buildStats);
+ /* search for the leaf page where the first item should go to */
+ btree.itemptr = insertdata.items[insertdata.curitem];
+ stack = ginFindLeafPage(&btree, false);
- gdi->stack = NULL;
+ ginInsertValue(&btree, stack, &insertdata, buildStats);
}
}
-Buffer
-ginScanBeginPostingTree(GinPostingTreeScan *gdi)
+/*
+ * Starts a new scan on a posting tree.
+ */
+GinBtreeStack *
+ginScanBeginPostingTree(GinBtree btree, Relation index, BlockNumber rootBlkno)
{
- gdi->stack = ginFindLeafPage(&gdi->btree, gdi->stack);
- return gdi->stack->buffer;
+ GinBtreeStack *stack;
+
+ ginPrepareDataScan(btree, index, rootBlkno);
+
+ btree->fullScan = TRUE;
+
+ stack = ginFindLeafPage(btree, TRUE);
+
+ return stack;
}
diff --git a/src/backend/access/gin/ginentrypage.c b/src/backend/access/gin/ginentrypage.c
index 70fcddfe40..412f90da4d 100644
--- a/src/backend/access/gin/ginentrypage.c
+++ b/src/backend/access/gin/ginentrypage.c
@@ -1,10 +1,10 @@
/*-------------------------------------------------------------------------
*
* ginentrypage.c
- * page utilities routines for the postgres inverted index access method.
+ * routines for handling GIN entry tree pages.
*
*
- * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -15,8 +15,15 @@
#include "postgres.h"
#include "access/gin_private.h"
+#include "miscadmin.h"
#include "utils/rel.h"
+static void entrySplitPage(GinBtree btree, Buffer origbuf,
+ GinBtreeStack *stack,
+ void *insertPayload,
+ BlockNumber updateblkno, XLogRecData **prdata,
+ Page *newlpage, Page *newrpage);
+
/*
* Form a tuple for entry tree.
*
@@ -27,15 +34,15 @@
* format that is being built here. We build on the assumption that we
* are making a leaf-level key entry containing a posting list of nipd items.
* If the caller is actually trying to make a posting-tree entry, non-leaf
- * entry, or pending-list entry, it should pass nipd = 0 and then overwrite
- * the t_tid fields as necessary. In any case, ipd can be NULL to skip
- * copying any itempointers into the posting list; the caller is responsible
- * for filling the posting list afterwards, if ipd = NULL and nipd > 0.
+ * entry, or pending-list entry, it should pass dataSize = 0 and then overwrite
+ * the t_tid fields as necessary. In any case, 'data' can be NULL to skip
+ * filling in the posting list; the caller is responsible for filling it
+ * afterwards if data = NULL and nipd > 0.
*/
IndexTuple
GinFormTuple(GinState *ginstate,
OffsetNumber attnum, Datum key, GinNullCategory category,
- ItemPointerData *ipd, uint32 nipd,
+ Pointer data, Size dataSize, int nipd,
bool errorTooBig)
{
Datum datums[2];
@@ -83,24 +90,23 @@ GinFormTuple(GinState *ginstate,
newsize = SHORTALIGN(newsize);
GinSetPostingOffset(itup, newsize);
-
GinSetNPosting(itup, nipd);
/*
* Add space needed for posting list, if any. Then check that the tuple
* won't be too big to store.
*/
- newsize += sizeof(ItemPointerData) * nipd;
+ newsize += dataSize;
+
newsize = MAXALIGN(newsize);
- if (newsize > Min(INDEX_SIZE_MASK, GinMaxItemSize))
+
+ if (newsize > GinMaxItemSize)
{
if (errorTooBig)
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
- errmsg("index row size %lu exceeds maximum %lu for index \"%s\"",
- (unsigned long) newsize,
- (unsigned long) Min(INDEX_SIZE_MASK,
- GinMaxItemSize),
+ errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
+ (Size) newsize, (Size) GinMaxItemSize,
RelationGetRelationName(ginstate->index))));
pfree(itup);
return NULL;
@@ -113,12 +119,28 @@ GinFormTuple(GinState *ginstate,
{
itup = repalloc(itup, newsize);
+ /*
+ * PostgreSQL 9.3 and earlier did not clear this new space, so we
+ * might find uninitialized padding when reading tuples from disk.
+ */
+ memset((char *) itup + IndexTupleSize(itup),
+ 0, newsize - IndexTupleSize(itup));
/* set new size in tuple header */
itup->t_info &= ~INDEX_SIZE_MASK;
itup->t_info |= newsize;
}
/*
+ * Copy in the posting list, if provided
+ */
+ if (data)
+ {
+ char *ptr = GinGetPosting(itup);
+
+ memcpy(ptr, data, dataSize);
+ }
+
+ /*
* Insert category byte, if needed
*/
if (category != GIN_CAT_NORM_KEY)
@@ -126,44 +148,52 @@ GinFormTuple(GinState *ginstate,
Assert(IndexTupleHasNulls(itup));
GinSetNullCategory(itup, ginstate, category);
}
-
- /*
- * Copy in the posting list, if provided
- */
- if (ipd)
- memcpy(GinGetPosting(itup), ipd, sizeof(ItemPointerData) * nipd);
-
return itup;
}
/*
- * Sometimes we reduce the number of posting list items in a tuple after
- * having built it with GinFormTuple. This function adjusts the size
- * fields to match.
+ * Read item pointers from leaf entry tuple.
+ *
+ * Returns a palloc'd array of ItemPointers. The number of items is returned
+ * in *nitems.
*/
-void
-GinShortenTuple(IndexTuple itup, uint32 nipd)
+ItemPointer
+ginReadTuple(GinState *ginstate, OffsetNumber attnum, IndexTuple itup,
+ int *nitems)
{
- uint32 newsize;
-
- Assert(nipd <= GinGetNPosting(itup));
-
- newsize = GinGetPostingOffset(itup) + sizeof(ItemPointerData) * nipd;
- newsize = MAXALIGN(newsize);
+ Pointer ptr = GinGetPosting(itup);
+ int nipd = GinGetNPosting(itup);
+ ItemPointer ipd;
+ int ndecoded;
- Assert(newsize <= (itup->t_info & INDEX_SIZE_MASK));
-
- itup->t_info &= ~INDEX_SIZE_MASK;
- itup->t_info |= newsize;
-
- GinSetNPosting(itup, nipd);
+ if (GinItupIsCompressed(itup))
+ {
+ if (nipd > 0)
+ {
+ ipd = ginPostingListDecode((GinPostingList *) ptr, &ndecoded);
+ if (nipd != ndecoded)
+ elog(ERROR, "number of items mismatch in GIN entry tuple, %d in tuple header, %d decoded",
+ nipd, ndecoded);
+ }
+ else
+ {
+ ipd = palloc(0);
+ }
+ }
+ else
+ {
+ ipd = (ItemPointer) palloc(sizeof(ItemPointerData) * nipd);
+ memcpy(ipd, ptr, sizeof(ItemPointerData) * nipd);
+ }
+ *nitems = nipd;
+ return ipd;
}
/*
* Form a non-leaf entry tuple by copying the key data from the given tuple,
* which can be either a leaf or non-leaf entry tuple.
*
- * Any posting list in the source tuple is not copied. The specified child
+ * Any posting list in the source tuple is not copied. The specified child
* block number is inserted into t_tid.
*/
static IndexTuple
@@ -252,7 +282,7 @@ entryLocateEntry(GinBtree btree, GinBtreeStack *stack)
{
stack->off = FirstOffsetNumber;
stack->predictNumber *= PageGetMaxOffsetNumber(page);
- return btree->getLeftMostPage(btree, page);
+ return btree->getLeftMostChild(btree, page);
}
low = FirstOffsetNumber;
@@ -425,22 +455,26 @@ entryGetLeftMostPage(GinBtree btree, Page page)
}
static bool
-entryIsEnoughSpace(GinBtree btree, Buffer buf, OffsetNumber off)
+entryIsEnoughSpace(GinBtree btree, Buffer buf, OffsetNumber off,
+ GinBtreeEntryInsertData *insertData)
{
- Size itupsz = 0;
+ Size releasedsz = 0;
+ Size addedsz;
Page page = BufferGetPage(buf);
- Assert(btree->entry);
+ Assert(insertData->entry);
Assert(!GinPageIsData(page));
- if (btree->isDelete)
+ if (insertData->isDelete)
{
IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
- itupsz = MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
+ releasedsz = MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
}
- if (PageGetFreeSpace(page) + itupsz >= MAXALIGN(IndexTupleSize(btree->entry)) + sizeof(ItemIdData))
+ addedsz = MAXALIGN(IndexTupleSize(insertData->entry)) + sizeof(ItemIdData);
+
+ if (PageGetFreeSpace(page) + releasedsz >= addedsz)
return true;
return false;
@@ -451,136 +485,140 @@ entryIsEnoughSpace(GinBtree btree, Buffer buf, OffsetNumber off)
* should update it, update old child blkno to new right page
* if child split occurred
*/
-static BlockNumber
-entryPreparePage(GinBtree btree, Page page, OffsetNumber off)
+static void
+entryPreparePage(GinBtree btree, Page page, OffsetNumber off,
+ GinBtreeEntryInsertData *insertData, BlockNumber updateblkno)
{
- BlockNumber ret = InvalidBlockNumber;
-
- Assert(btree->entry);
+ Assert(insertData->entry);
Assert(!GinPageIsData(page));
- if (btree->isDelete)
+ if (insertData->isDelete)
{
Assert(GinPageIsLeaf(page));
PageIndexTupleDelete(page, off);
}
- if (!GinPageIsLeaf(page) && btree->rightblkno != InvalidBlockNumber)
+ if (!GinPageIsLeaf(page) && updateblkno != InvalidBlockNumber)
{
IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
- GinSetDownlink(itup, btree->rightblkno);
- ret = btree->rightblkno;
+ GinSetDownlink(itup, updateblkno);
}
-
- btree->rightblkno = InvalidBlockNumber;
-
- return ret;
}
/*
* Place tuple on page and fills WAL record
+ *
+ * If the tuple doesn't fit, returns false without modifying the page.
+ *
+ * On insertion to an internal node, in addition to inserting the given item,
+ * the downlink of the existing item at 'off' is updated to point to
+ * 'updateblkno'.
*/
-static void
-entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prdata)
+static GinPlaceToPageRC
+entryPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack,
+ void *insertPayload, BlockNumber updateblkno,
+ XLogRecData **prdata, Page *newlpage, Page *newrpage)
{
+ GinBtreeEntryInsertData *insertData = insertPayload;
Page page = BufferGetPage(buf);
+ OffsetNumber off = stack->off;
OffsetNumber placed;
int cnt = 0;
/* these must be static so they can be returned to caller */
static XLogRecData rdata[3];
- static ginxlogInsert data;
+ static ginxlogInsertEntry data;
+
+ /* quick exit if it doesn't fit */
+ if (!entryIsEnoughSpace(btree, buf, off, insertData))
+ {
+ entrySplitPage(btree, buf, stack, insertPayload, updateblkno,
+ prdata, newlpage, newrpage);
+ return SPLIT;
+ }
+
+ START_CRIT_SECTION();
*prdata = rdata;
- data.updateBlkno = entryPreparePage(btree, page, off);
+ entryPreparePage(btree, page, off, insertData, updateblkno);
- placed = PageAddItem(page, (Item) btree->entry, IndexTupleSize(btree->entry), off, false, false);
+ placed = PageAddItem(page,
+ (Item) insertData->entry,
+ IndexTupleSize(insertData->entry),
+ off, false, false);
if (placed != off)
elog(ERROR, "failed to add item to index page in \"%s\"",
RelationGetRelationName(btree->index));
- data.node = btree->index->rd_node;
- data.blkno = BufferGetBlockNumber(buf);
+ data.isDelete = insertData->isDelete;
data.offset = off;
- data.nitem = 1;
- data.isDelete = btree->isDelete;
- data.isData = false;
- data.isLeaf = GinPageIsLeaf(page) ? TRUE : FALSE;
-
- /*
- * Prevent full page write if child's split occurs. That is needed to
- * remove incomplete splits while replaying WAL
- *
- * data.updateBlkno contains new block number (of newly created right
- * page) for recently splited page.
- */
- if (data.updateBlkno == InvalidBlockNumber)
- {
- rdata[0].buffer = buf;
- rdata[0].buffer_std = TRUE;
- rdata[0].data = NULL;
- rdata[0].len = 0;
- rdata[0].next = &rdata[1];
- cnt++;
- }
- rdata[cnt].buffer = InvalidBuffer;
+ rdata[cnt].buffer = buf;
+ rdata[cnt].buffer_std = true;
rdata[cnt].data = (char *) &data;
- rdata[cnt].len = sizeof(ginxlogInsert);
+ rdata[cnt].len = offsetof(ginxlogInsertEntry, tuple);
rdata[cnt].next = &rdata[cnt + 1];
cnt++;
- rdata[cnt].buffer = InvalidBuffer;
- rdata[cnt].data = (char *) btree->entry;
- rdata[cnt].len = IndexTupleSize(btree->entry);
+ rdata[cnt].buffer = buf;
+ rdata[cnt].buffer_std = true;
+ rdata[cnt].data = (char *) insertData->entry;
+ rdata[cnt].len = IndexTupleSize(insertData->entry);
rdata[cnt].next = NULL;
- btree->entry = NULL;
+ return INSERTED;
}
/*
* Place tuple and split page, original buffer(lbuf) leaves untouched,
- * returns shadow page of lbuf filled new data.
+ * returns shadow pages filled with new data.
* Tuples are distributed between pages by equal size on its, not
* an equal number!
*/
-static Page
-entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogRecData **prdata)
+static void
+entrySplitPage(GinBtree btree, Buffer origbuf,
+ GinBtreeStack *stack,
+ void *insertPayload,
+ BlockNumber updateblkno, XLogRecData **prdata,
+ Page *newlpage, Page *newrpage)
{
+ GinBtreeEntryInsertData *insertData = insertPayload;
+ OffsetNumber off = stack->off;
OffsetNumber i,
maxoff,
separator = InvalidOffsetNumber;
Size totalsize = 0;
+ Size tupstoresize;
Size lsize = 0,
size;
char *ptr;
- IndexTuple itup,
- leftrightmost = NULL;
+ IndexTuple itup;
Page page;
- Page lpage = PageGetTempPageCopy(BufferGetPage(lbuf));
- Page rpage = BufferGetPage(rbuf);
+ Page lpage = PageGetTempPageCopy(BufferGetPage(origbuf));
+ Page rpage = PageGetTempPageCopy(BufferGetPage(origbuf));
Size pageSize = PageGetPageSize(lpage);
/* these must be static so they can be returned to caller */
static XLogRecData rdata[2];
- static ginxlogSplit data;
+ static ginxlogSplitEntry data;
static char tupstore[2 * BLCKSZ];
*prdata = rdata;
- data.leftChildBlkno = (GinPageIsLeaf(lpage)) ?
- InvalidOffsetNumber : GinGetDownlink(btree->entry);
- data.updateBlkno = entryPreparePage(btree, lpage, off);
+ entryPreparePage(btree, lpage, off, insertData, updateblkno);
+ /*
+ * First, append all the existing tuples and the new tuple we're inserting
+ * one after another in a temporary workspace.
+ */
maxoff = PageGetMaxOffsetNumber(lpage);
ptr = tupstore;
-
for (i = FirstOffsetNumber; i <= maxoff; i++)
{
if (i == off)
{
- size = MAXALIGN(IndexTupleSize(btree->entry));
- memcpy(ptr, btree->entry, size);
+ size = MAXALIGN(IndexTupleSize(insertData->entry));
+ memcpy(ptr, insertData->entry, size);
ptr += size;
totalsize += size + sizeof(ItemIdData);
}
@@ -594,12 +632,17 @@ entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogR
if (off == maxoff + 1)
{
- size = MAXALIGN(IndexTupleSize(btree->entry));
- memcpy(ptr, btree->entry, size);
+ size = MAXALIGN(IndexTupleSize(insertData->entry));
+ memcpy(ptr, insertData->entry, size);
ptr += size;
totalsize += size + sizeof(ItemIdData);
}
+ tupstoresize = ptr - tupstore;
+ /*
+ * Initialize the left and right pages, and copy all the tuples back to
+ * them.
+ */
GinInitPage(rpage, GinPageGetOpaque(lpage)->flags, pageSize);
GinInitPage(lpage, GinPageGetOpaque(rpage)->flags, pageSize);
@@ -620,7 +663,6 @@ entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogR
}
else
{
- leftrightmost = itup;
lsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
}
@@ -630,48 +672,41 @@ entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogR
ptr += MAXALIGN(IndexTupleSize(itup));
}
- btree->entry = GinFormInteriorTuple(leftrightmost, lpage,
- BufferGetBlockNumber(lbuf));
-
- btree->rightblkno = BufferGetBlockNumber(rbuf);
-
- data.node = btree->index->rd_node;
- data.rootBlkno = InvalidBlockNumber;
- data.lblkno = BufferGetBlockNumber(lbuf);
- data.rblkno = BufferGetBlockNumber(rbuf);
data.separator = separator;
data.nitem = maxoff;
- data.isData = FALSE;
- data.isLeaf = GinPageIsLeaf(lpage) ? TRUE : FALSE;
- data.isRootSplit = FALSE;
rdata[0].buffer = InvalidBuffer;
rdata[0].data = (char *) &data;
- rdata[0].len = sizeof(ginxlogSplit);
+ rdata[0].len = sizeof(ginxlogSplitEntry);
rdata[0].next = &rdata[1];
rdata[1].buffer = InvalidBuffer;
rdata[1].data = tupstore;
- rdata[1].len = MAXALIGN(totalsize);
+ rdata[1].len = tupstoresize;
rdata[1].next = NULL;
- return lpage;
+ *newlpage = lpage;
+ *newrpage = rpage;
}
/*
- * return newly allocated rightmost tuple
+ * Construct insertion payload for inserting the downlink for given buffer.
*/
-IndexTuple
-ginPageGetLinkItup(Buffer buf)
+static void *
+entryPrepareDownlink(GinBtree btree, Buffer lbuf)
{
- IndexTuple itup,
- nitup;
- Page page = BufferGetPage(buf);
+ GinBtreeEntryInsertData *insertData;
+ Page lpage = BufferGetPage(lbuf);
+ BlockNumber lblkno = BufferGetBlockNumber(lbuf);
+ IndexTuple itup;
- itup = getRightMostTuple(page);
- nitup = GinFormInteriorTuple(itup, page, BufferGetBlockNumber(buf));
+ itup = getRightMostTuple(lpage);
- return nitup;
+ insertData = palloc(sizeof(GinBtreeEntryInsertData));
+ insertData->entry = GinFormInteriorTuple(itup, lpage, lblkno);
+ insertData->isDelete = false;
+
+ return insertData;
}
/*
@@ -679,20 +714,19 @@ ginPageGetLinkItup(Buffer buf)
* Also called from ginxlog, should not use btree
*/
void
-ginEntryFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf)
+ginEntryFillRoot(GinBtree btree, Page root,
+ BlockNumber lblkno, Page lpage,
+ BlockNumber rblkno, Page rpage)
{
- Page page;
IndexTuple itup;
- page = BufferGetPage(root);
-
- itup = ginPageGetLinkItup(lbuf);
- if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
+ itup = GinFormInteriorTuple(getRightMostTuple(lpage), lpage, lblkno);
+ if (PageAddItem(root, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
elog(ERROR, "failed to add item to index root page");
pfree(itup);
- itup = ginPageGetLinkItup(rbuf);
- if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
+ itup = GinFormInteriorTuple(getRightMostTuple(rpage), rpage, rblkno);
+ if (PageAddItem(root, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
elog(ERROR, "failed to add item to index root page");
pfree(itup);
}
@@ -711,25 +745,23 @@ ginPrepareEntryScan(GinBtree btree, OffsetNumber attnum,
memset(btree, 0, sizeof(GinBtreeData));
btree->index = ginstate->index;
+ btree->rootBlkno = GIN_ROOT_BLKNO;
btree->ginstate = ginstate;
btree->findChildPage = entryLocateEntry;
+ btree->getLeftMostChild = entryGetLeftMostPage;
btree->isMoveRight = entryIsMoveRight;
btree->findItem = entryLocateLeafEntry;
btree->findChildPtr = entryFindChildPtr;
- btree->getLeftMostPage = entryGetLeftMostPage;
- btree->isEnoughSpace = entryIsEnoughSpace;
btree->placeToPage = entryPlaceToPage;
- btree->splitPage = entrySplitPage;
btree->fillRoot = ginEntryFillRoot;
+ btree->prepareDownlink = entryPrepareDownlink;
btree->isData = FALSE;
- btree->searchMode = FALSE;
btree->fullScan = FALSE;
btree->isBuild = FALSE;
btree->entryAttnum = attnum;
btree->entryKey = key;
btree->entryCategory = category;
- btree->isDelete = FALSE;
}
diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c
index b9bfde2ee4..09c3e39bf3 100644
--- a/src/backend/access/gin/ginfast.c
+++ b/src/backend/access/gin/ginfast.c
@@ -7,7 +7,7 @@
* transfer pending entries into the regular index structure. This
* wins because bulk insertion is much more efficient than retail.
*
- * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -119,15 +119,13 @@ writeListPage(Relation index, Buffer buffer,
rdata[0].len = sizeof(ginxlogInsertListPage);
rdata[0].next = rdata + 1;
- rdata[1].buffer = buffer;
- rdata[1].buffer_std = true;
+ rdata[1].buffer = InvalidBuffer;
rdata[1].data = workspace;
rdata[1].len = size;
rdata[1].next = NULL;
recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT_LISTPAGE, rdata);
PageSetLSN(page, recptr);
- PageSetTLI(page, ThisTimeLineID);
}
/* get free space before releasing buffer */
@@ -290,7 +288,7 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector)
if (metadata->head == InvalidBlockNumber)
{
/*
- * Main list is empty, so just copy sublist into main list
+ * Main list is empty, so just insert sublist as main list
*/
START_CRIT_SECTION();
@@ -313,6 +311,14 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector)
LockBuffer(buffer, GIN_EXCLUSIVE);
page = BufferGetPage(buffer);
+ rdata[0].next = rdata + 1;
+
+ rdata[1].buffer = buffer;
+ rdata[1].buffer_std = true;
+ rdata[1].data = NULL;
+ rdata[1].len = 0;
+ rdata[1].next = NULL;
+
Assert(GinPageGetOpaque(page)->rightlink == InvalidBlockNumber);
START_CRIT_SECTION();
@@ -400,12 +406,10 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector)
recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_UPDATE_META_PAGE, rdata);
PageSetLSN(metapage, recptr);
- PageSetTLI(metapage, ThisTimeLineID);
if (buffer != InvalidBuffer)
{
PageSetLSN(page, recptr);
- PageSetTLI(page, ThisTimeLineID);
}
}
@@ -436,7 +440,7 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector)
* Create temporary index tuples for a single indexable item (one index column
* for the heap tuple specified by ht_ctid), and append them to the array
* in *collector. They will subsequently be written out using
- * ginHeapTupleFastInsert. Note that to guarantee consistent state, all
+ * ginHeapTupleFastInsert. Note that to guarantee consistent state, all
* temp tuples for a given heap tuple must be written in one call to
* ginHeapTupleFastInsert.
*/
@@ -482,7 +486,7 @@ ginHeapTupleFastCollect(GinState *ginstate,
IndexTuple itup;
itup = GinFormTuple(ginstate, attnum, entries[i], categories[i],
- NULL, 0, true);
+ NULL, 0, 0, true);
itup->t_tid = *ht_ctid;
collector->tuples[collector->ntuples++] = itup;
collector->sumsize += IndexTupleSize(itup);
@@ -586,13 +590,11 @@ shiftList(Relation index, Buffer metabuffer, BlockNumber newHead,
recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_DELETE_LISTPAGE, rdata);
PageSetLSN(metapage, recptr);
- PageSetTLI(metapage, ThisTimeLineID);
for (i = 0; i < data.ndeleted; i++)
{
page = BufferGetPage(buffers[i]);
PageSetLSN(page, recptr);
- PageSetTLI(page, ThisTimeLineID);
}
}
@@ -705,7 +707,7 @@ processPendingPage(BuildAccumulator *accum, KeyArray *ka,
*
* This can be called concurrently by multiple backends, so it must cope.
* On first glance it looks completely not concurrent-safe and not crash-safe
- * either. The reason it's okay is that multiple insertion of the same entry
+ * either. The reason it's okay is that multiple insertion of the same entry
* is detected and treated as a no-op by gininsert.c. If we crash after
* posting entries to the main index and before removing them from the
* pending list, it's okay because when we redo the posting later on, nothing
@@ -759,7 +761,7 @@ ginInsertCleanup(GinState *ginstate,
LockBuffer(metabuffer, GIN_UNLOCK);
/*
- * Initialize. All temporary space will be in opCtx
+ * Initialize. All temporary space will be in opCtx
*/
opCtx = AllocSetContextCreate(CurrentMemoryContext,
"GIN insert cleanup temporary context",
@@ -853,7 +855,7 @@ ginInsertCleanup(GinState *ginstate,
/*
* While we left the page unlocked, more stuff might have gotten
- * added to it. If so, process those entries immediately. There
+ * added to it. If so, process those entries immediately. There
* shouldn't be very many, so we don't worry about the fact that
* we're doing this with exclusive lock. Insertion algorithm
* guarantees that inserted row(s) will not continue on next page.
diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c
index 022bd27b23..144ed504dc 100644
--- a/src/backend/access/gin/ginget.c
+++ b/src/backend/access/gin/ginget.c
@@ -4,7 +4,7 @@
* fetch tuples from a GIN scan.
*
*
- * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@@ -20,6 +20,8 @@
#include "utils/datum.h"
#include "utils/memutils.h"
+/* GUC parameter */
+int GinFuzzySearchLimit = 0;
typedef struct pendingPosition
{
@@ -32,67 +34,6 @@ typedef struct pendingPosition
/*
- * Convenience function for invoking a key's consistentFn
- */
-static bool
-callConsistentFn(GinState *ginstate, GinScanKey key)
-{
- /*
- * If we're dealing with a dummy EVERYTHING key, we don't want to call the
- * consistentFn; just claim it matches.
- */
- if (key->searchMode == GIN_SEARCH_MODE_EVERYTHING)
- {
- key->recheckCurItem = false;
- return true;
- }
-
- /*
- * Initialize recheckCurItem in case the consistentFn doesn't know it
- * should set it. The safe assumption in that case is to force recheck.
- */
- key->recheckCurItem = true;
-
- return DatumGetBool(FunctionCall8Coll(&ginstate->consistentFn[key->attnum - 1],
- ginstate->supportCollation[key->attnum - 1],
- PointerGetDatum(key->entryRes),
- UInt16GetDatum(key->strategy),
- key->query,
- UInt32GetDatum(key->nuserentries),
- PointerGetDatum(key->extra_data),
- PointerGetDatum(&key->recheckCurItem),
- PointerGetDatum(key->queryValues),
- PointerGetDatum(key->queryCategories)));
-}
-
-/*
- * Tries to refind previously taken ItemPointer on a posting page.
- */
-static bool
-findItemInPostingPage(Page page, ItemPointer item, OffsetNumber *off)
-{
- OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff;
- int res;
-
- if (GinPageGetOpaque(page)->flags & GIN_DELETED)
- /* page was deleted by concurrent vacuum */
- return false;
-
- /*
- * scan page to find equal or first greater value
- */
- for (*off = FirstOffsetNumber; *off <= maxoff; (*off)++)
- {
- res = ginCompareItemPointers(item, (ItemPointer) GinDataPageGetItem(page, *off));
-
- if (res <= 0)
- return true;
- }
-
- return false;
-}
-
-/*
* Goes to the next page if current offset is outside of bounds
*/
static bool
@@ -105,16 +46,11 @@ moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack)
/*
* We scanned the whole page, so we should take right page
*/
- stack->blkno = GinPageGetOpaque(page)->rightlink;
-
if (GinPageRightMost(page))
return false; /* no more pages */
- LockBuffer(stack->buffer, GIN_UNLOCK);
- stack->buffer = ReleaseAndReadBuffer(stack->buffer,
- btree->index,
- stack->blkno);
- LockBuffer(stack->buffer, GIN_SHARE);
+ stack->buffer = ginStepRight(stack->buffer, btree->index, GIN_SHARE);
+ stack->blkno = BufferGetBlockNumber(stack->buffer);
stack->off = FirstOffsetNumber;
}
@@ -129,19 +65,17 @@ static void
scanPostingTree(Relation index, GinScanEntry scanEntry,
BlockNumber rootPostingTree)
{
- GinPostingTreeScan *gdi;
+ GinBtreeData btree;
+ GinBtreeStack *stack;
Buffer buffer;
Page page;
- BlockNumber blkno;
/* Descend to the leftmost leaf page */
- gdi = ginPrepareScanPostingTree(index, rootPostingTree, TRUE);
-
- buffer = ginScanBeginPostingTree(gdi);
+ stack = ginScanBeginPostingTree(&btree, index, rootPostingTree);
+ buffer = stack->buffer;
IncrBufferRefCount(buffer); /* prevent unpin in freeGinBtreeStack */
- freeGinBtreeStack(gdi->stack);
- pfree(gdi);
+ freeGinBtreeStack(stack);
/*
* Loop iterates through all leaf pages of posting tree
@@ -149,23 +83,17 @@ scanPostingTree(Relation index, GinScanEntry scanEntry,
for (;;)
{
page = BufferGetPage(buffer);
-
- if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0 &&
- GinPageGetOpaque(page)->maxoff >= FirstOffsetNumber)
+ if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0)
{
- tbm_add_tuples(scanEntry->matchBitmap,
- (ItemPointer) GinDataPageGetItem(page, FirstOffsetNumber),
- GinPageGetOpaque(page)->maxoff, false);
- scanEntry->predictNumberResult += GinPageGetOpaque(page)->maxoff;
+ int n = GinDataLeafPageGetItemsToTbm(page, scanEntry->matchBitmap);
+
+ scanEntry->predictNumberResult += n;
}
if (GinPageRightMost(page))
break; /* no more pages */
- blkno = GinPageGetOpaque(page)->rightlink;
- LockBuffer(buffer, GIN_UNLOCK);
- buffer = ReleaseAndReadBuffer(buffer, index, blkno);
- LockBuffer(buffer, GIN_SHARE);
+ buffer = ginStepRight(buffer, index, GIN_SHARE);
}
UnlockReleaseBuffer(buffer);
@@ -173,7 +101,7 @@ scanPostingTree(Relation index, GinScanEntry scanEntry,
/*
* Collects TIDs into scanEntry->matchBitmap for all heap tuples that
- * match the search entry. This supports three different match modes:
+ * match the search entry. This supports three different match modes:
*
* 1. Partial-match support: scan from current point until the
* comparePartialFn says we're done.
@@ -269,7 +197,7 @@ collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
/*
* In ALL mode, we are not interested in null items, so we can
* stop if we get to a null-item placeholder (which will be the
- * last entry for a given attnum). We do want to include NULL_KEY
+ * last entry for a given attnum). We do want to include NULL_KEY
* and EMPTY_ITEM entries, though.
*/
if (icategory == GIN_CAT_NULL_ITEM)
@@ -344,8 +272,11 @@ collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
}
else
{
- tbm_add_tuples(scanEntry->matchBitmap,
- GinGetPosting(itup), GinGetNPosting(itup), false);
+ ItemPointer ipd;
+ int nipd;
+
+ ipd = ginReadTuple(btree->ginstate, scanEntry->attnum, itup, &nipd);
+ tbm_add_tuples(scanEntry->matchBitmap, ipd, nipd, false);
scanEntry->predictNumberResult += GinGetNPosting(itup);
}
@@ -354,8 +285,6 @@ collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
*/
stack->off++;
}
-
- return true;
}
/*
@@ -387,8 +316,7 @@ restartScanEntry:
ginPrepareEntryScan(&btreeEntry, entry->attnum,
entry->queryKey, entry->queryCategory,
ginstate);
- btreeEntry.searchMode = TRUE;
- stackEntry = ginFindLeafPage(&btreeEntry, NULL);
+ stackEntry = ginFindLeafPage(&btreeEntry, true);
page = BufferGetPage(stackEntry->buffer);
needUnlock = TRUE;
@@ -438,8 +366,9 @@ restartScanEntry:
if (GinIsPostingTree(itup))
{
BlockNumber rootPostingTree = GinGetPostingTree(itup);
- GinPostingTreeScan *gdi;
+ GinBtreeStack *stack;
Page page;
+ ItemPointerData minItem;
/*
* We should unlock entry page before touching posting tree to
@@ -450,9 +379,10 @@ restartScanEntry:
*/
LockBuffer(stackEntry->buffer, GIN_UNLOCK);
needUnlock = FALSE;
- gdi = ginPrepareScanPostingTree(ginstate->index, rootPostingTree, TRUE);
- entry->buffer = ginScanBeginPostingTree(gdi);
+ stack = ginScanBeginPostingTree(&entry->btree, ginstate->index,
+ rootPostingTree);
+ entry->buffer = stack->buffer;
/*
* We keep buffer pinned because we need to prevent deletion of
@@ -462,26 +392,25 @@ restartScanEntry:
IncrBufferRefCount(entry->buffer);
page = BufferGetPage(entry->buffer);
- entry->predictNumberResult = gdi->stack->predictNumber * GinPageGetOpaque(page)->maxoff;
/*
- * Keep page content in memory to prevent durable page locking
+ * Load the first page into memory.
*/
- entry->list = (ItemPointerData *) palloc(BLCKSZ);
- entry->nlist = GinPageGetOpaque(page)->maxoff;
- memcpy(entry->list, GinDataPageGetItem(page, FirstOffsetNumber),
- GinPageGetOpaque(page)->maxoff * sizeof(ItemPointerData));
+ ItemPointerSetMin(&minItem);
+ entry->list = GinDataLeafPageGetItems(page, &entry->nlist, minItem);
+
+ entry->predictNumberResult = stack->predictNumber * entry->nlist;
LockBuffer(entry->buffer, GIN_UNLOCK);
- freeGinBtreeStack(gdi->stack);
- pfree(gdi);
+ freeGinBtreeStack(stack);
entry->isFinished = FALSE;
}
else if (GinGetNPosting(itup) > 0)
{
- entry->nlist = GinGetNPosting(itup);
- entry->list = (ItemPointerData *) palloc(sizeof(ItemPointerData) * entry->nlist);
- memcpy(entry->list, GinGetPosting(itup), sizeof(ItemPointerData) * entry->nlist);
+ entry->list = ginReadTuple(ginstate, entry->attnum, itup,
+ &entry->nlist);
+ entry->predictNumberResult = entry->nlist;
+
entry->isFinished = FALSE;
}
}
@@ -491,13 +420,106 @@ restartScanEntry:
freeGinBtreeStack(stackEntry);
}
+/*
+ * Comparison function for scan entry indexes. Sorts by predictNumberResult,
+ * least frequent items first.
+ */
+static int
+entryIndexByFrequencyCmp(const void *a1, const void *a2, void *arg)
+{
+ const GinScanKey key = (const GinScanKey) arg;
+ int i1 = *(const int *) a1;
+ int i2 = *(const int *) a2;
+ uint32 n1 = key->scanEntry[i1]->predictNumberResult;
+ uint32 n2 = key->scanEntry[i2]->predictNumberResult;
+
+ if (n1 < n2)
+ return -1;
+ else if (n1 == n2)
+ return 0;
+ else
+ return 1;
+}
+
static void
-startScanKey(GinState *ginstate, GinScanKey key)
+startScanKey(GinState *ginstate, GinScanOpaque so, GinScanKey key)
{
+ MemoryContext oldCtx = CurrentMemoryContext;
+ int i;
+ int j;
+ int *entryIndexes;
+
ItemPointerSetMin(&key->curItem);
key->curItemMatches = false;
key->recheckCurItem = false;
key->isFinished = false;
+
+ /*
+ * Divide the entries into two distinct sets: required and additional.
+ * Additional entries are not enough for a match alone, without any items
+ * from the required set, but are needed by the consistent function to
+ * decide if an item matches. When scanning, we can skip over items from
+ * additional entries that have no corresponding matches in any of the
+ * required entries. That speeds up queries like "frequent & rare"
+ * considerably, if the frequent term can be put in the additional set.
+ *
+ * There can be many legal ways to divide them entries into these two
+ * sets. A conservative division is to just put everything in the required
+ * set, but the more you can put in the additional set, the more you can
+ * skip during the scan. To maximize skipping, we try to put as many
+ * frequent items as possible into additional, and less frequent ones into
+ * required. To do that, sort the entries by frequency
+ * (predictNumberResult), and put entries into the required set in that
+ * order, until the consistent function says that none of the remaining
+ * entries can form a match, without any items from the required set. The
+ * rest go to the additional set.
+ */
+ if (key->nentries > 1)
+ {
+ MemoryContextSwitchTo(so->tempCtx);
+
+ entryIndexes = (int *) palloc(sizeof(int) * key->nentries);
+ for (i = 0; i < key->nentries; i++)
+ entryIndexes[i] = i;
+ qsort_arg(entryIndexes, key->nentries, sizeof(int),
+ entryIndexByFrequencyCmp, key);
+
+ for (i = 0; i < key->nentries - 1; i++)
+ {
+ /* Pass all entries <= i as FALSE, and the rest as MAYBE */
+ for (j = 0; j <= i; j++)
+ key->entryRes[entryIndexes[j]] = GIN_FALSE;
+ for (j = i + 1; j < key->nentries; j++)
+ key->entryRes[entryIndexes[j]] = GIN_MAYBE;
+
+ if (key->triConsistentFn(key) == GIN_FALSE)
+ break;
+ }
+ /* i is now the last required entry. */
+
+ MemoryContextSwitchTo(oldCtx);
+
+ key->nrequired = i + 1;
+ key->nadditional = key->nentries - key->nrequired;
+ key->requiredEntries = palloc(key->nrequired * sizeof(GinScanEntry));
+ key->additionalEntries = palloc(key->nadditional * sizeof(GinScanEntry));
+
+ j = 0;
+ for (i = 0; i < key->nrequired; i++)
+ key->requiredEntries[i] = key->scanEntry[entryIndexes[j++]];
+ for (i = 0; i < key->nadditional; i++)
+ key->additionalEntries[i] = key->scanEntry[entryIndexes[j++]];
+
+ /* clean up after consistentFn calls (also frees entryIndexes) */
+ MemoryContextReset(so->tempCtx);
+ }
+ else
+ {
+ key->nrequired = 1;
+ key->nadditional = 0;
+ key->requiredEntries = palloc(1 * sizeof(GinScanEntry));
+ key->requiredEntries[0] = key->scanEntry[0];
+ }
}
static void
@@ -532,84 +554,146 @@ startScan(IndexScanDesc scan)
}
for (i = 0; i < so->nkeys; i++)
- startScanKey(ginstate, so->keys + i);
+ startScanKey(ginstate, so, so->keys + i);
}
/*
- * Gets next ItemPointer from PostingTree. Note, that we copy
- * page into GinScanEntry->list array and unlock page, but keep it pinned
- * to prevent interference with vacuum
+ * Load the next batch of item pointers from a posting tree.
+ *
+ * Note that we copy the page into GinScanEntry->list array and unlock it, but
+ * keep it pinned to prevent interference with vacuum.
*/
static void
-entryGetNextItem(GinState *ginstate, GinScanEntry entry)
+entryLoadMoreItems(GinState *ginstate, GinScanEntry entry, ItemPointerData advancePast)
{
Page page;
- BlockNumber blkno;
+ int i;
+ bool stepright;
+
+ if (!BufferIsValid(entry->buffer))
+ {
+ entry->isFinished = true;
+ return;
+ }
+ /*
+ * We have two strategies for finding the correct page: step right from
+ * the current page, or descend the tree again from the root. If
+ * advancePast equals the current item, the next matching item should be
+ * on the next page, so we step right. Otherwise, descend from root.
+ */
+ if (ginCompareItemPointers(&entry->curItem, &advancePast) == 0)
+ {
+ stepright = true;
+ LockBuffer(entry->buffer, GIN_SHARE);
+ }
+ else
+ {
+ GinBtreeStack *stack;
+
+ ReleaseBuffer(entry->buffer);
+
+ /*
+ * Set the search key, and find the correct leaf page.
+ */
+ if (ItemPointerIsLossyPage(&advancePast))
+ {
+ ItemPointerSet(&entry->btree.itemptr,
+ GinItemPointerGetBlockNumber(&advancePast) + 1,
+ FirstOffsetNumber);
+ }
+ else
+ {
+ entry->btree.itemptr = advancePast;
+ entry->btree.itemptr.ip_posid++;
+ }
+ entry->btree.fullScan = false;
+ stack = ginFindLeafPage(&entry->btree, true);
+
+ /* we don't need the stack, just the buffer. */
+ entry->buffer = stack->buffer;
+ IncrBufferRefCount(entry->buffer);
+ freeGinBtreeStack(stack);
+ stepright = false;
+ }
+
+ elog(DEBUG2, "entryLoadMoreItems, %u/%u, skip: %d",
+ GinItemPointerGetBlockNumber(&advancePast),
+ GinItemPointerGetOffsetNumber(&advancePast),
+ !stepright);
+
+ page = BufferGetPage(entry->buffer);
for (;;)
{
- if (entry->offset < entry->nlist)
+ entry->offset = InvalidOffsetNumber;
+ if (entry->list)
{
- entry->curItem = entry->list[entry->offset++];
- return;
+ pfree(entry->list);
+ entry->list = NULL;
+ entry->nlist = 0;
}
- LockBuffer(entry->buffer, GIN_SHARE);
- page = BufferGetPage(entry->buffer);
- for (;;)
+ if (stepright)
{
/*
- * It's needed to go by right link. During that we should refind
- * first ItemPointer greater that stored
+ * We've processed all the entries on this page. If it was the
+ * last page in the tree, we're done.
*/
-
- blkno = GinPageGetOpaque(page)->rightlink;
-
- LockBuffer(entry->buffer, GIN_UNLOCK);
- if (blkno == InvalidBlockNumber)
+ if (GinPageRightMost(page))
{
- ReleaseBuffer(entry->buffer);
- ItemPointerSetInvalid(&entry->curItem);
+ UnlockReleaseBuffer(entry->buffer);
entry->buffer = InvalidBuffer;
entry->isFinished = TRUE;
return;
}
- entry->buffer = ReleaseAndReadBuffer(entry->buffer,
- ginstate->index,
- blkno);
- LockBuffer(entry->buffer, GIN_SHARE);
+ /*
+ * Step to next page, following the right link. then find the
+ * first ItemPointer greater than advancePast.
+ */
+ entry->buffer = ginStepRight(entry->buffer,
+ ginstate->index,
+ GIN_SHARE);
page = BufferGetPage(entry->buffer);
+ }
+ stepright = true;
- entry->offset = InvalidOffsetNumber;
- if (!ItemPointerIsValid(&entry->curItem) ||
- findItemInPostingPage(page, &entry->curItem, &entry->offset))
- {
- /*
- * Found position equal to or greater than stored
- */
- entry->nlist = GinPageGetOpaque(page)->maxoff;
- memcpy(entry->list, GinDataPageGetItem(page, FirstOffsetNumber),
- GinPageGetOpaque(page)->maxoff * sizeof(ItemPointerData));
+ if (GinPageGetOpaque(page)->flags & GIN_DELETED)
+ continue; /* page was deleted by concurrent vacuum */
- LockBuffer(entry->buffer, GIN_UNLOCK);
+ /*
+ * The first item > advancePast might not be on this page, but
+ * somewhere to the right, if the page was split, or a non-match from
+ * another key in the query allowed us to skip some items from this
+ * entry. Keep following the right-links until we re-find the correct
+ * page.
+ */
+ if (!GinPageRightMost(page) &&
+ ginCompareItemPointers(&advancePast, GinDataPageGetRightBound(page)) >= 0)
+ {
+ /*
+ * the item we're looking is > the right bound of the page, so it
+ * can't be on this page.
+ */
+ continue;
+ }
- if (!ItemPointerIsValid(&entry->curItem) ||
- ginCompareItemPointers(&entry->curItem,
- entry->list + entry->offset - 1) == 0)
- {
- /*
- * First pages are deleted or empty, or we found exact
- * position, so break inner loop and continue outer one.
- */
- break;
- }
+ entry->list = GinDataLeafPageGetItems(page, &entry->nlist, advancePast);
- /*
- * Find greater than entry->curItem position, store it.
- */
- entry->curItem = entry->list[entry->offset - 1];
+ for (i = 0; i < entry->nlist; i++)
+ {
+ if (ginCompareItemPointers(&advancePast, &entry->list[i]) < 0)
+ {
+ entry->offset = i;
+ if (GinPageRightMost(page))
+ {
+ /* after processing the copied items, we're done. */
+ UnlockReleaseBuffer(entry->buffer);
+ entry->buffer = InvalidBuffer;
+ }
+ else
+ LockBuffer(entry->buffer, GIN_UNLOCK);
return;
}
}
@@ -620,10 +704,10 @@ entryGetNextItem(GinState *ginstate, GinScanEntry entry)
#define dropItem(e) ( gin_rand() > ((double)GinFuzzySearchLimit)/((double)((e)->predictNumberResult)) )
/*
- * Sets entry->curItem to next heap item pointer for one entry of one scan key,
- * or sets entry->isFinished to TRUE if there are no more.
+ * Sets entry->curItem to next heap item pointer > advancePast, for one entry
+ * of one scan key, or sets entry->isFinished to TRUE if there are no more.
*
- * Item pointers must be returned in ascending order.
+ * Item pointers are returned in ascending order.
*
* Note: this can return a "lossy page" item pointer, indicating that the
* entry potentially matches all items on that heap page. However, it is
@@ -633,16 +717,33 @@ entryGetNextItem(GinState *ginstate, GinScanEntry entry)
* current implementation this is guaranteed by the behavior of tidbitmaps.
*/
static void
-entryGetItem(GinState *ginstate, GinScanEntry entry)
+entryGetItem(GinState *ginstate, GinScanEntry entry,
+ ItemPointerData advancePast)
{
Assert(!entry->isFinished);
+ Assert(!ItemPointerIsValid(&entry->curItem) ||
+ ginCompareItemPointers(&entry->curItem, &advancePast) <= 0);
+
if (entry->matchBitmap)
{
+ /* A bitmap result */
+ BlockNumber advancePastBlk = GinItemPointerGetBlockNumber(&advancePast);
+ OffsetNumber advancePastOff = GinItemPointerGetOffsetNumber(&advancePast);
+ bool gotitem = false;
+
do
{
- if (entry->matchResult == NULL ||
- entry->offset >= entry->matchResult->ntuples)
+ /*
+ * If we've exhausted all items on this block, move to next block
+ * in the bitmap.
+ */
+ while (entry->matchResult == NULL ||
+ (entry->matchResult->ntuples >= 0 &&
+ entry->offset >= entry->matchResult->ntuples) ||
+ entry->matchResult->blockno < advancePastBlk ||
+ (ItemPointerIsLossyPage(&advancePast) &&
+ entry->matchResult->blockno == advancePastBlk))
{
entry->matchResult = tbm_iterate(entry->matchIterator);
@@ -663,54 +764,103 @@ entryGetItem(GinState *ginstate, GinScanEntry entry)
*/
entry->offset = 0;
}
+ if (entry->isFinished)
+ break;
+ /*
+ * We're now on the first page after advancePast which has any
+ * items on it. If it's a lossy result, return that.
+ */
if (entry->matchResult->ntuples < 0)
{
- /*
- * lossy result, so we need to check the whole page
- */
ItemPointerSetLossyPage(&entry->curItem,
entry->matchResult->blockno);
/*
* We might as well fall out of the loop; we could not
* estimate number of results on this page to support correct
- * reducing of result even if it's enabled
+ * reducing of result even if it's enabled.
*/
+ gotitem = true;
break;
}
+ /*
+ * Not a lossy page. Skip over any offsets <= advancePast, and
+ * return that.
+ */
+ if (entry->matchResult->blockno == advancePastBlk)
+ {
+ /*
+ * First, do a quick check against the last offset on the
+ * page. If that's > advancePast, so are all the other
+ * offsets.
+ */
+ if (entry->matchResult->offsets[entry->matchResult->ntuples - 1] <= advancePastOff)
+ {
+ entry->offset = entry->matchResult->ntuples;
+ continue;
+ }
+
+ /* Otherwise scan to find the first item > advancePast */
+ while (entry->matchResult->offsets[entry->offset] <= advancePastOff)
+ entry->offset++;
+ }
+
ItemPointerSet(&entry->curItem,
entry->matchResult->blockno,
entry->matchResult->offsets[entry->offset]);
entry->offset++;
- } while (entry->reduceResult == TRUE && dropItem(entry));
+ gotitem = true;
+ } while (!gotitem || (entry->reduceResult == TRUE && dropItem(entry)));
}
else if (!BufferIsValid(entry->buffer))
{
- entry->offset++;
- if (entry->offset <= entry->nlist)
- entry->curItem = entry->list[entry->offset - 1];
- else
+ /*
+ * A posting list from an entry tuple, or the last page of a posting
+ * tree.
+ */
+ do
{
- ItemPointerSetInvalid(&entry->curItem);
- entry->isFinished = TRUE;
- }
+ if (entry->offset >= entry->nlist)
+ {
+ ItemPointerSetInvalid(&entry->curItem);
+ entry->isFinished = TRUE;
+ break;
+ }
+
+ entry->curItem = entry->list[entry->offset++];
+ } while (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0);
+ /* XXX: shouldn't we apply the fuzzy search limit here? */
}
else
{
+ /* A posting tree */
do
{
- entryGetNextItem(ginstate, entry);
- } while (entry->isFinished == FALSE &&
- entry->reduceResult == TRUE &&
- dropItem(entry));
+ /* If we've processed the current batch, load more items */
+ while (entry->offset >= entry->nlist)
+ {
+ entryLoadMoreItems(ginstate, entry, advancePast);
+
+ if (entry->isFinished)
+ {
+ ItemPointerSetInvalid(&entry->curItem);
+ return;
+ }
+ }
+
+ entry->curItem = entry->list[entry->offset++];
+
+ } while (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0 ||
+ (entry->reduceResult == TRUE && dropItem(entry)));
}
}
/*
- * Identify the "current" item among the input entry streams for this scan key,
- * and test whether it passes the scan key qual condition.
+ * Identify the "current" item among the input entry streams for this scan key
+ * that is greater than advancePast, and test whether it passes the scan key
+ * qual condition.
*
* The current item is the smallest curItem among the inputs. key->curItem
* is set to that value. key->curItemMatches is set to indicate whether that
@@ -729,21 +879,30 @@ entryGetItem(GinState *ginstate, GinScanEntry entry)
* logic in scanGetItem.)
*/
static void
-keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
+keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key,
+ ItemPointerData advancePast)
{
ItemPointerData minItem;
ItemPointerData curPageLossy;
uint32 i;
- uint32 lossyEntry;
bool haveLossyEntry;
GinScanEntry entry;
- bool res;
+ GinTernaryValue res;
MemoryContext oldCtx;
+ bool allFinished;
Assert(!key->isFinished);
/*
- * Find the minimum of the active entry curItems.
+ * We might have already tested this item; if so, no need to repeat work.
+ * (Note: the ">" case can happen, if advancePast is exact but we
+ * previously had to set curItem to a lossy-page pointer.)
+ */
+ if (ginCompareItemPointers(&key->curItem, &advancePast) > 0)
+ return;
+
+ /*
+ * Find the minimum item > advancePast among the active entry streams.
*
* Note: a lossy-page entry is encoded by a ItemPointer with max value for
* offset (0xffff), so that it will sort after any exact entries for the
@@ -751,16 +910,34 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
* pointers, which is good.
*/
ItemPointerSetMax(&minItem);
-
- for (i = 0; i < key->nentries; i++)
+ allFinished = true;
+ for (i = 0; i < key->nrequired; i++)
{
- entry = key->scanEntry[i];
- if (entry->isFinished == FALSE &&
- ginCompareItemPointers(&entry->curItem, &minItem) < 0)
+ entry = key->requiredEntries[i];
+
+ if (entry->isFinished)
+ continue;
+
+ /*
+ * Advance this stream if necessary.
+ *
+ * In particular, since entry->curItem was initialized with
+ * ItemPointerSetMin, this ensures we fetch the first item for each
+ * entry on the first call.
+ */
+ if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
+ {
+ entryGetItem(ginstate, entry, advancePast);
+ if (entry->isFinished)
+ continue;
+ }
+
+ allFinished = false;
+ if (ginCompareItemPointers(&entry->curItem, &minItem) < 0)
minItem = entry->curItem;
}
- if (ItemPointerIsMax(&minItem))
+ if (allFinished)
{
/* all entries are finished */
key->isFinished = TRUE;
@@ -768,46 +945,96 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
}
/*
- * We might have already tested this item; if so, no need to repeat work.
- * (Note: the ">" case can happen, if minItem is exact but we previously
- * had to set curItem to a lossy-page pointer.)
+ * Ok, we now know that there are no matches < minItem.
+ *
+ * If minItem is lossy, it means that there were no exact items on the
+ * page among requiredEntries, because lossy pointers sort after exact
+ * items. However, there might be exact items for the same page among
+ * additionalEntries, so we mustn't advance past them.
*/
- if (ginCompareItemPointers(&key->curItem, &minItem) >= 0)
- return;
+ if (ItemPointerIsLossyPage(&minItem))
+ {
+ if (GinItemPointerGetBlockNumber(&advancePast) <
+ GinItemPointerGetBlockNumber(&minItem))
+ {
+ advancePast.ip_blkid = minItem.ip_blkid;
+ advancePast.ip_posid = 0;
+ }
+ }
+ else
+ {
+ Assert(minItem.ip_posid > 0);
+ advancePast = minItem;
+ advancePast.ip_posid--;
+ }
/*
- * OK, advance key->curItem and perform consistentFn test.
+ * We might not have loaded all the entry streams for this TID yet. We
+ * could call the consistent function, passing MAYBE for those entries, to
+ * see if it can decide if this TID matches based on the information we
+ * have. But if the consistent-function is expensive, and cannot in fact
+ * decide with partial information, that could be a big loss. So, load all
+ * the additional entries, before calling the consistent function.
*/
- key->curItem = minItem;
+ for (i = 0; i < key->nadditional; i++)
+ {
+ entry = key->additionalEntries[i];
+
+ if (entry->isFinished)
+ continue;
+
+ if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
+ {
+ entryGetItem(ginstate, entry, advancePast);
+ if (entry->isFinished)
+ continue;
+ }
+
+ /*
+ * Normally, none of the items in additionalEntries can have a curItem
+ * larger than minItem. But if minItem is a lossy page, then there
+ * might be exact items on the same page among additionalEntries.
+ */
+ if (ginCompareItemPointers(&entry->curItem, &minItem) < 0)
+ {
+ Assert(ItemPointerIsLossyPage(&minItem));
+ minItem = entry->curItem;
+ }
+ }
/*
+ * Ok, we've advanced all the entries up to minItem now. Set key->curItem,
+ * and perform consistentFn test.
+ *
* Lossy-page entries pose a problem, since we don't know the correct
* entryRes state to pass to the consistentFn, and we also don't know what
* its combining logic will be (could be AND, OR, or even NOT). If the
* logic is OR then the consistentFn might succeed for all items in the
* lossy page even when none of the other entries match.
*
- * If we have a single lossy-page entry then we check to see if the
- * consistentFn will succeed with only that entry TRUE. If so, we return
- * a lossy-page pointer to indicate that the whole heap page must be
+ * Our strategy is to call the tri-state consistent function, with the
+ * lossy-page entries set to MAYBE, and all the other entries FALSE. If it
+ * returns FALSE, none of the lossy items alone are enough for a match, so
+ * we don't need to return a lossy-page pointer. Otherwise, return a
+ * lossy-page pointer to indicate that the whole heap page must be
* checked. (On subsequent calls, we'll do nothing until minItem is past
* the page altogether, thus ensuring that we never return both regular
* and lossy pointers for the same page.)
*
- * This idea could be generalized to more than one lossy-page entry, but
- * ideally lossy-page entries should be infrequent so it would seldom be
- * the case that we have more than one at once. So it doesn't seem worth
- * the extra complexity to optimize that case. If we do find more than
- * one, we just punt and return a lossy-page pointer always.
+ * An exception is that it doesn't matter what we pass for lossy pointers
+ * in "hidden" entries, because the consistentFn's result can't depend on
+ * them. We could pass them as MAYBE as well, but if we're using the
+ * "shim" implementation of a tri-state consistent function (see
+ * ginlogic.c), it's better to pass as few MAYBEs as possible. So pass
+ * them as TRUE.
*
* Note that only lossy-page entries pointing to the current item's page
* should trigger this processing; we might have future lossy pages in the
* entry array, but they aren't relevant yet.
*/
+ key->curItem = minItem;
ItemPointerSetLossyPage(&curPageLossy,
GinItemPointerGetBlockNumber(&key->curItem));
-
- lossyEntry = 0;
haveLossyEntry = false;
for (i = 0; i < key->nentries; i++)
{
@@ -815,17 +1042,14 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
if (entry->isFinished == FALSE &&
ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0)
{
- if (haveLossyEntry)
- {
- /* Multiple lossy entries, punt */
- key->curItem = curPageLossy;
- key->curItemMatches = true;
- key->recheckCurItem = true;
- return;
- }
- lossyEntry = i;
+ if (i < key->nuserentries)
+ key->entryRes[i] = GIN_MAYBE;
+ else
+ key->entryRes[i] = GIN_TRUE;
haveLossyEntry = true;
}
+ else
+ key->entryRes[i] = GIN_FALSE;
}
/* prepare for calling consistentFn in temp context */
@@ -833,11 +1057,10 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
if (haveLossyEntry)
{
- /* Single lossy-page entry, so see if whole page matches */
- memset(key->entryRes, FALSE, key->nentries);
- key->entryRes[lossyEntry] = TRUE;
+ /* Have lossy-page entries, so see if whole page matches */
+ res = key->triConsistentFn(key);
- if (callConsistentFn(ginstate, key))
+ if (res == GIN_TRUE || res == GIN_MAYBE)
{
/* Yes, so clean up ... */
MemoryContextSwitchTo(oldCtx);
@@ -854,41 +1077,70 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
/*
* At this point we know that we don't need to return a lossy whole-page
* pointer, but we might have matches for individual exact item pointers,
- * possibly in combination with a lossy pointer. Our strategy if there's
- * a lossy pointer is to try the consistentFn both ways and return a hit
- * if it accepts either one (forcing the hit to be marked lossy so it will
- * be rechecked). An exception is that we don't need to try it both ways
- * if the lossy pointer is in a "hidden" entry, because the consistentFn's
- * result can't depend on that.
+ * possibly in combination with a lossy pointer. Pass lossy pointers as
+ * MAYBE to the ternary consistent function, to let it decide if this
+ * tuple satisfies the overall key, even though we don't know if the lossy
+ * entries match.
*
* Prepare entryRes array to be passed to consistentFn.
*/
for (i = 0; i < key->nentries; i++)
{
entry = key->scanEntry[i];
- if (entry->isFinished == FALSE &&
- ginCompareItemPointers(&entry->curItem, &key->curItem) == 0)
- key->entryRes[i] = TRUE;
+ if (entry->isFinished)
+ key->entryRes[i] = GIN_FALSE;
+#if 0
+
+ /*
+ * This case can't currently happen, because we loaded all the entries
+ * for this item earlier.
+ */
+ else if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
+ key->entryRes[i] = GIN_MAYBE;
+#endif
+ else if (ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0)
+ key->entryRes[i] = GIN_MAYBE;
+ else if (ginCompareItemPointers(&entry->curItem, &minItem) == 0)
+ key->entryRes[i] = GIN_TRUE;
else
- key->entryRes[i] = FALSE;
+ key->entryRes[i] = GIN_FALSE;
}
- if (haveLossyEntry)
- key->entryRes[lossyEntry] = TRUE;
- res = callConsistentFn(ginstate, key);
+ res = key->triConsistentFn(key);
- if (!res && haveLossyEntry && lossyEntry < key->nuserentries)
+ switch (res)
{
- /* try the other way for the lossy item */
- key->entryRes[lossyEntry] = FALSE;
+ case GIN_TRUE:
+ key->curItemMatches = true;
+ /* triConsistentFn set recheckCurItem */
+ break;
+
+ case GIN_FALSE:
+ key->curItemMatches = false;
+ break;
- res = callConsistentFn(ginstate, key);
+ case GIN_MAYBE:
+ key->curItemMatches = true;
+ key->recheckCurItem = true;
+ break;
+
+ default:
+
+ /*
+ * the 'default' case shouldn't happen, but if the consistent
+ * function returns something bogus, this is the safe result
+ */
+ key->curItemMatches = true;
+ key->recheckCurItem = true;
+ break;
}
- key->curItemMatches = res;
- /* If we matched a lossy entry, force recheckCurItem = true */
- if (haveLossyEntry)
- key->recheckCurItem = true;
+ /*
+ * We have a tuple, and we know if it matches or not. If it's a non-match,
+ * we could continue to find the next matching tuple, but let's break out
+ * and give scanGetItem a chance to advance the other keys. They might be
+ * able to skip past to a much higher TID, allowing us to save work.
+ */
/* clean up after consistentFn calls */
MemoryContextSwitchTo(oldCtx);
@@ -905,117 +1157,121 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
* keyGetItem() the combination logic is known only to the consistentFn.
*/
static bool
-scanGetItem(IndexScanDesc scan, ItemPointer advancePast,
+scanGetItem(IndexScanDesc scan, ItemPointerData advancePast,
ItemPointerData *item, bool *recheck)
{
GinScanOpaque so = (GinScanOpaque) scan->opaque;
- GinState *ginstate = &so->ginstate;
- ItemPointerData myAdvancePast = *advancePast;
uint32 i;
- bool allFinished;
bool match;
- for (;;)
+ /*----------
+ * Advance the scan keys in lock-step, until we find an item that matches
+ * all the keys. If any key reports isFinished, meaning its subset of the
+ * entries is exhausted, we can stop. Otherwise, set *item to the next
+ * matching item.
+ *
+ * This logic works only if a keyGetItem stream can never contain both
+ * exact and lossy pointers for the same page. Else we could have a
+ * case like
+ *
+ * stream 1 stream 2
+ * ... ...
+ * 42/6 42/7
+ * 50/1 42/0xffff
+ * ... ...
+ *
+ * We would conclude that 42/6 is not a match and advance stream 1,
+ * thus never detecting the match to the lossy pointer in stream 2.
+ * (keyGetItem has a similar problem versus entryGetItem.)
+ *----------
+ */
+ do
{
- /*
- * Advance any entries that are <= myAdvancePast. In particular,
- * since entry->curItem was initialized with ItemPointerSetMin, this
- * ensures we fetch the first item for each entry on the first call.
- */
- allFinished = TRUE;
-
- for (i = 0; i < so->totalentries; i++)
- {
- GinScanEntry entry = so->entries[i];
-
- while (entry->isFinished == FALSE &&
- ginCompareItemPointers(&entry->curItem,
- &myAdvancePast) <= 0)
- entryGetItem(ginstate, entry);
-
- if (entry->isFinished == FALSE)
- allFinished = FALSE;
- }
-
- if (allFinished)
- {
- /* all entries exhausted, so we're done */
- return false;
- }
-
- /*
- * Perform the consistentFn test for each scan key. If any key
- * reports isFinished, meaning its subset of the entries is exhausted,
- * we can stop. Otherwise, set *item to the minimum of the key
- * curItems.
- */
- ItemPointerSetMax(item);
-
- for (i = 0; i < so->nkeys; i++)
+ ItemPointerSetMin(item);
+ match = true;
+ for (i = 0; i < so->nkeys && match; i++)
{
GinScanKey key = so->keys + i;
- keyGetItem(&so->ginstate, so->tempCtx, key);
+ /* Fetch the next item for this key that is > advancePast. */
+ keyGetItem(&so->ginstate, so->tempCtx, key, advancePast);
if (key->isFinished)
- return false; /* finished one of keys */
-
- if (ginCompareItemPointers(&key->curItem, item) < 0)
- *item = key->curItem;
- }
+ return false;
- Assert(!ItemPointerIsMax(item));
+ /*
+ * If it's not a match, we can immediately conclude that nothing
+ * <= this item matches, without checking the rest of the keys.
+ */
+ if (!key->curItemMatches)
+ {
+ advancePast = key->curItem;
+ match = false;
+ break;
+ }
- /*----------
- * Now *item contains first ItemPointer after previous result.
- *
- * The item is a valid hit only if all the keys succeeded for either
- * that exact TID, or a lossy reference to the same page.
- *
- * This logic works only if a keyGetItem stream can never contain both
- * exact and lossy pointers for the same page. Else we could have a
- * case like
- *
- * stream 1 stream 2
- * ... ...
- * 42/6 42/7
- * 50/1 42/0xffff
- * ... ...
- *
- * We would conclude that 42/6 is not a match and advance stream 1,
- * thus never detecting the match to the lossy pointer in stream 2.
- * (keyGetItem has a similar problem versus entryGetItem.)
- *----------
- */
- match = true;
- for (i = 0; i < so->nkeys; i++)
- {
- GinScanKey key = so->keys + i;
+ /*
+ * It's a match. We can conclude that nothing < matches, so the
+ * other key streams can skip to this item.
+ *
+ * Beware of lossy pointers, though; from a lossy pointer, we can
+ * only conclude that nothing smaller than this *block* matches.
+ */
+ if (ItemPointerIsLossyPage(&key->curItem))
+ {
+ if (GinItemPointerGetBlockNumber(&advancePast) <
+ GinItemPointerGetBlockNumber(&key->curItem))
+ {
+ advancePast.ip_blkid = key->curItem.ip_blkid;
+ advancePast.ip_posid = 0;
+ }
+ }
+ else
+ {
+ Assert(key->curItem.ip_posid > 0);
+ advancePast = key->curItem;
+ advancePast.ip_posid--;
+ }
- if (key->curItemMatches)
+ /*
+ * If this is the first key, remember this location as a potential
+ * match, and proceed to check the rest of the keys.
+ *