summaryrefslogtreecommitdiff
path: root/tools/docs/docload.py
blob: fa4072633faed09947ea821658fbf93d75fd16a7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
#!/usr/bin/env python3

# Script to load documentation from a tarball or source directory

import sys
import os
import tarfile
import csv
import io
import re
import tidylib
from optparse import OptionParser
from configparser import ConfigParser

import psycopg2

# Running total of pages (HTML and SVG) prepared for loading into the database.
pagecount = 0
# When True, the script produces no output at all. Controlled by an option.
quiet = False
# When True, the script produces extra per-file output. Controlled by an option.
verbose = False

# Captures the text of a page's <title> element (case-insensitively) so that
# it can be stored alongside the page content.
re_titlematch = re.compile(r'<title\s*>([^<]+)</title\s*>', re.I)
# Matches the start of a <div ... class="figure" element; group 1 holds
# whatever sits between "<div" and the class attribute.
re_figure_match = re.compile('<div([^<>]+)class="figure"', re.I)
# Replacement text used together with re_figure_match: re-emits the captured
# group and appends the Bootstrap grid classes to the "figure" class so that
# figures rendered in the docs display responsively.
BOOTSTRAP_FIGURE_CLASS = r'<div\1class="figure col-xl-8 col-lg-10 col-md-12"'


# Load a single page
def load_doc_file(filename, f, c):
    """Tidy one HTML documentation page and append it as a row to the CSV writer.

    filename -- name recorded for the page in the CSV row
    f        -- binary file-like object holding the raw page
    c        -- CSV writer that receives the row [filename, ver, title, html]

    Uses the module-level ``ver`` (the version being loaded) and ``verbose``.
    """
    rawcontents = f.read()

    # PostgreSQL 10 started using the XML toolchain and now produces documentation
    # in utf8. Such pages begin with an XML declaration; anything else is treated
    # as latin1.
    head = rawcontents[:50].decode('utf8', errors='ignore')
    is_xml_build = head.startswith('<?xml version="1.0" encoding="UTF-8"')
    encoding = 'utf-8' if is_xml_build else 'latin1'

    tidyopts = {
        'drop_proprietary_attributes': 1,
        'alt_text': '',
        'hide_comments': 1,
        'output_xhtml': 1,
        'show_body_only': 1,
        'clean': 1,
        'char_encoding': 'utf8',
        'indent': 'auto',
    }
    if is_xml_build:
        # XML builds don't need clean=1, and that option adds some interesting
        # CSS properties, so drop it for them.
        del tidyopts['clean']

    # PostgreSQL prior to 11 used an older toolchain to build the docs, which does
    # not support indented HTML. Indentation stays on for newer versions because it
    # makes the output a lot easier to debug. Version 0 is excluded from the check.
    if 0 < float(ver) < 11:
        tidyopts['indent'] = 'no'

    # decode the raw bytes into the text that will be stored in the database
    contents = rawcontents.decode(encoding)

    # the page title is stored separately; fall back to an empty string when the
    # page has no <title> element
    title_match = re_titlematch.search(contents)
    title = title_match.group(1) if title_match else ""

    # add the Bootstrap grid classes to figure containers so that they display
    # responsively
    contents = re_figure_match.sub(BOOTSTRAP_FIGURE_CLASS, contents)

    # in verbose mode, report the (filename, title) pair being processed
    if verbose:
        print("--- file: %s (%s) ---" % (filename, title))

    # run libtidy on the content; the error report it returns is not used
    html, _tidy_errors = tidylib.tidy_document(contents, options=tidyopts)

    # add the page to the CSV that will be used to load the documentation into
    # the database
    c.writerow([filename, ver, title, html])


def load_svg_file(filename, f, c):
    """Append one SVG file as a row to the CSV writer.

    filename -- name recorded for the file in the CSV row
    f        -- binary file-like object holding the SVG
    c        -- CSV writer that receives the row [filename, ver, None, svg text]

    The bytes read from ``f`` are decoded as UTF-8. There is no title tag to
    extract from an SVG, so None is passed in the title position. Uses the
    module-level ``ver`` (the version being loaded).
    """
    svg_text = f.read().decode('utf-8')
    c.writerow([filename, ver, None, svg_text])


def parse_tarfile(tarfilename):
    """Yield ``(name, getter)`` pairs for the documentation pages in a source tarball.

    tarfilename -- path of the tarball to scan

    ``name`` is the member's name inside the archive and ``getter`` is a
    zero-argument callable returning a binary file object for that member.
    Each getter is bound to its own member, so it keeps returning the right
    file even when it is called after the generator has moved on. The tarball
    is deliberately left open, because getters read from it on demand.
    """
    # this regular expression is for "newer" versions of PostgreSQL that keep all of
    # the HTML documentation built out
    re_htmlfile = re.compile('[^/]*/doc/src/sgml/html/.*')
    # this regular expression is for "older" versions of PostgreSQL that keep the
    # HTML documentation in a tarball within the tarball
    re_tarfile = re.compile(r'[^/]*/doc/postgres\.tar\.gz$')

    tf = tarfile.open(tarfilename)

    for member in tf:
        if re_htmlfile.match(member.name):
            # bind the current member as a default argument; a plain closure would
            # see whatever ``member`` refers to at the time the getter is called
            yield member.name, lambda member=member: tf.extractfile(member)
        elif re_tarfile.match(member.name):
            # older versions of PostgreSQL kept a tarball of the documentation within the source
            # tarball, and as such will go down this path
            inner_tar = tarfile.open(fileobj=tf.extractfile(member))
            for inner_member in inner_tar:
                # Some old versions have index.html as a symlink - so let's
                # just ignore all symlinks to be on the safe side.
                if inner_member.issym():
                    continue

                if inner_member.name.endswith(('.html', '.htm')):
                    # same early binding as above, for both the inner archive and its member
                    yield (
                        inner_member.name,
                        lambda inner_tar=inner_tar, inner_member=inner_member: inner_tar.extractfile(inner_member),
                    )


def parse_directory(dirname):
    """Yield ``(name, getter)`` pairs for the .html and .svg files in a directory.

    dirname -- directory to scan (not recursive)

    ``name`` is the bare file name and ``getter`` is a zero-argument callable
    that opens that file in binary mode. The caller is responsible for closing
    the file object it gets back.
    """
    for fn in os.listdir(dirname):
        if fn.endswith(('.html', '.svg')):
            # bind the path as a default argument; a plain closure over ``fn``
            # would open whichever file the loop has reached when it is called
            path = os.path.join(dirname, fn)
            yield fn, lambda path=path: open(path, 'rb')


# Main execution
# Command line: exactly two positional arguments (the version and a tarball
# or directory) plus optional flags.
parser = OptionParser(usage="usage: %prog [options] <version> <tarfile|directory>")
parser.add_option(
    "-q", "--quiet",
    action="store_true",
    dest="quiet",
    help="Run quietly (no output at all)",
)
parser.add_option(
    "-v", "--verbose",
    action="store_true",
    dest="verbose",
    help="Run verbosely",
)
parser.add_option(
    "-g", "--git",
    type=str,
    help="Specify git hash used to load",
)
options, args = parser.parse_args()

# anything other than two positional arguments is a usage error
if len(args) != 2:
    parser.print_usage()
    sys.exit(1)

quiet, verbose = options.quiet, options.verbose

# the two output modes contradict each other, so refuse the combination
if quiet and verbose:
    print("Can't be both verbose and quiet at the same time!")
    sys.exit(1)

ver = args[0]
source = args[1]

# the database connection settings live in docload.ini, next to this script
config = ConfigParser()
script_dir = os.path.abspath(os.path.dirname(__file__))
config.read(os.path.join(script_dir, 'docload.ini'))

# The source is either a tarfile or a "naked" directory; both produce a
# generator of (name, getter) pairs
if os.path.isfile(source):
    generator = parse_tarfile(source)
elif os.path.isdir(source):
    generator = parse_directory(source)
else:
    print("File or directory %s not found" % source)
    sys.exit(1)


connection = psycopg2.connect(config.get('db', 'dsn'))

if not quiet:
    print("Starting load of documentation for version %s." % (ver, ))

curs = connection.cursor()
# The version must be known to the system (exactly one matching row); the
# same row also tells us whether it is flagged as current
curs.execute("SELECT current FROM core_version WHERE tree=%(v)s", {'v': ver})
version_rows = curs.fetchall()
if len(version_rows) != 1:
    print("Version %s not found in the system, cannot load!" % ver)
    sys.exit(1)

iscurrent = version_rows[0][0]

# begin creating a CSV that will be used to import the documentation into the database
s = io.StringIO()
c = csv.writer(s, delimiter=';', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)

# Import each page of documentation
for filename, getter in generator:
    filename = os.path.basename(filename)

    # Open the source file right away and close it as soon as the page has been
    # prepared, instead of leaving every file object to the garbage collector.
    with getter() as f:
        # determine if the file being loaded is an SVG or a regular doc file
        if filename.endswith('.svg'):
            load_svg_file(filename, f, c)
        else:
            load_doc_file(filename, f, c)

    # after successfully preparing the file for load, increase the page count
    pagecount += 1


if not quiet:
    print("Total parsed doc size: {:.1f} MB".format(s.tell() / (1024 * 1024)))

# rewind so that the buffer can be read back from the beginning
s.seek(0)

# Start loading the documentation into the database.
# Step 1: copy the prepared CSV into a temporary table and check that the number
# of rows copied equals the number of pages counted while preparing it.
curs.execute("CREATE TEMP TABLE docsload (file varchar(64) NOT NULL, version numeric(3,1) NOT NULL, title varchar(256) NOT NULL, content text)")
curs.copy_expert("COPY docsload FROM STDIN WITH CSV DELIMITER AS ';'", s)
copied_rows = curs.rowcount
if copied_rows != pagecount:
    print("Loaded invalid number of rows! {} rows for {} pages!".format(copied_rows, pagecount))
    sys.exit(1)

# running total of rows deleted, inserted and updated below
numchanges = 0
version_param = {'version': ver}

# Step 2: remove pages of this version that are absent from the new load
curs.execute("DELETE FROM docs WHERE version=%(version)s AND NOT EXISTS (SELECT 1 FROM docsload WHERE docsload.file=docs.file)", version_param)
deleted_rows = curs.rowcount
numchanges += deleted_rows
if not quiet:
    print("Deleted {} orphaned doc pages".format(deleted_rows))

# Step 3: add pages that do not exist yet for this version
curs.execute("INSERT INTO docs (file, version, title, content) SELECT file, version, title, content FROM docsload WHERE NOT EXISTS (SELECT 1 FROM docs WHERE docs.file=docsload.file AND docs.version=%(version)s)", version_param)
inserted_rows = curs.rowcount
numchanges += inserted_rows
if not quiet:
    print("Inserted {} new doc pages.".format(inserted_rows))

# Step 4: refresh pages whose title or content differs from the new load
curs.execute("UPDATE docs SET title=l.title, content=l.content FROM docsload l WHERE docs.version=%(version)s AND docs.file=l.file AND (docs.title != l.title OR docs.content != l.content)", version_param)
updated_rows = curs.rowcount
numchanges += updated_rows
if not quiet:
    print("Updated {} changed doc pages.".format(updated_rows))

# Only touch the version record and purge caches when something actually changed
if numchanges > 0:
    # Update the docs loaded timestamp. The git hash is only recorded for
    # version "0"; every other version gets an empty string.
    if ver == "0" and options.git:
        githash = options.git
    else:
        githash = ''

    curs.execute("UPDATE core_version SET docsloaded=CURRENT_TIMESTAMP, docsgit=%(git)s WHERE tree=%(v)s", {
        'v': ver,
        'git': githash,
    })

    # Issue varnish purge for all docs of this version
    if ver == "0":
        # Special handling of developer docs: they are purged under the "devel"
        # key. Note that this also changes the version shown in the final message.
        ver = "devel"

    # Pass the purge key as a bound parameter, like every other query in this
    # script, rather than formatting it into the SQL text.
    curs.execute("SELECT varnish_purge_xkey(%(key)s)", {
        'key': 'pgdocs_{}'.format(ver),
    })
    curs.execute("SELECT varnish_purge_xkey('pgdocs_all')")
    if iscurrent:
        curs.execute("SELECT varnish_purge_xkey('pgdocs_current')")

# ensure the changes are committed, and close the connection
connection.commit()
connection.close()

if not quiet:
    print("Done loading docs version %s (%i pages)." % (ver, pagecount))