#!/usr/bin/env python3

# Script to load documentation from a tarball or source directory

import sys
import os
import tarfile
import csv
import io
import re
import tidylib
from optparse import OptionParser
from configparser import ConfigParser
import psycopg2

# the Bootstrap grid classes that are added onto any images that are rendered
# in the docs
BOOTSTRAP_FIGURE_CLASS = r'\1class="figure img-fluid"'

# regular expression used to find the title of a page
re_titlematch = re.compile(r'<title\s*>([^<]+)</title\s*>', re.IGNORECASE)

# regular expression used to find any images that are in the HTML and apply
# additional bootstrap classes
re_figure_match = re.compile(r'(<img[^>]+)class="figure"', re.IGNORECASE)


# Load a single page
def load_doc_file(filename, f, c):
    """Prepares and loads a HTML file for import into the documentation database"""
    tidyopts = dict(
        drop_proprietary_attributes=1,
        alt_text='',
        hide_comments=1,
        output_xhtml=1,
        show_body_only=1,
        clean=1,
        char_encoding='utf8',
        indent='auto',
    )

    # Postgres 10 started using the xml toolchain and now produces documentation
    # in utf8, so we need to figure out which version this is.
    rawcontents = f.read()
    rawfirst = rawcontents[:50].decode('utf8', errors='ignore')
    if rawfirst.startswith('<?xml'):
        # built with the xml toolchain (Postgres 10 and later)
        encoding = 'utf-8'
    else:
        # older versions built with the sgml toolchain use latin1
        encoding = 'latin1'

    # tidy's automatic indenting can reflow preformatted text, so turn it off
    # for any page that contains a <pre> block
    if rawcontents.find(b'<pre') > 0:
        tidyopts['indent'] = 'no'

    # convert the raw contents to the appropriate encoding for the content
    # that will be stored in the database
    contents = str(rawcontents, encoding)

    # extract the title of the page, which is rendered in a few places in the
    # documentation
    tm = re_titlematch.search(contents)
    if tm:
        title = tm.group(1)
    else:
        title = ""

    # find any images that are embedded in the HTML and add in the Bootstrap
    # grid classes in order to ensure they are able to display responsively
    contents = re_figure_match.sub(BOOTSTRAP_FIGURE_CLASS, contents)

    # in verbose mode, output the (filename, title) pair of the doc page that
    # is being processed
    if verbose:
        print("--- file: %s (%s) ---" % (filename, title))

    # run libtidy on the content
    (html, errors) = tidylib.tidy_document(contents, options=tidyopts)

    # add all of the information to the CSV that will be used to load the
    # updated documentation pages into the database
    c.writerow([filename, ver, title, html])
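
# For illustration, the figure substitution in load_doc_file() above rewrites
# markup like the following (the filename is made up for the example):
#   <img src="gin-btree.svg" class="figure" />
# into:
#   <img src="gin-btree.svg" class="figure img-fluid" />
# so that embedded images scale with the Bootstrap grid.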

def load_svg_file(filename, f, c):
    """Prepares and loads a SVG file for import into the documentation database"""
    # this is fairly straightforward: we just need to load the contents, and
    # leave the title empty as there is no title tag
    svg = f.read()
    c.writerow([filename, ver, None, svg.decode('utf-8')])


def parse_tarfile(tarfilename):
    # this regular expression is for "newer" versions of PostgreSQL that keep
    # all of the HTML documentation built out
    re_htmlfile = re.compile('[^/]*/doc/src/sgml/html/.*')
    # this regular expression is for "older" versions of PostgreSQL that keep
    # the HTML documentation in a tarball within the tarball
    re_tarfile = re.compile('[^/]*/doc/postgres.tar.gz$')

    tf = tarfile.open(tarfilename)
    for member in tf:
        if re_htmlfile.match(member.name):
            # bind the member as a default argument so the lambda refers to
            # the current member rather than whatever the loop variable points
            # at when the lambda is eventually called
            yield member.name, lambda m=member: tf.extractfile(m)
        elif re_tarfile.match(member.name):
            # older versions of PostgreSQL kept a tarball of the documentation
            # within the source tarball, and as such will go down this path
            f = tf.extractfile(member)
            inner_tar = tarfile.open(fileobj=f)
            for inner_member in inner_tar:
                # Some old versions have index.html as a symlink - so let's
                # just ignore all symlinks to be on the safe side.
                if inner_member.issym():
                    continue
                if inner_member.name.endswith('.html') or inner_member.name.endswith('.htm'):
                    yield inner_member.name, lambda m=inner_member: inner_tar.extractfile(m)


def parse_directory(dirname):
    for fn in os.listdir(dirname):
        if fn.endswith('.html') or fn.endswith('.svg'):
            yield fn, lambda fn=fn: open(os.path.join(dirname, fn), 'rb')


# Main execution
parser = OptionParser(usage="usage: %prog [options] <version> <tarfile-or-directory>")
parser.add_option("-q", "--quiet", action="store_true", dest="quiet",
                  help="Run quietly (no output at all)")
parser.add_option("-v", "--verbose", action="store_true", dest="verbose",
                  help="Run verbosely")
parser.add_option("-g", "--git", type=str,
                  help="Specify git hash used to load")

(options, args) = parser.parse_args()

if len(args) != 2:
    parser.print_usage()
    sys.exit(1)

quiet = options.quiet
verbose = options.verbose
if verbose and quiet:
    print("Can't be both verbose and quiet at the same time!")
    sys.exit(1)

ver = args[0]

# load the configuration that is used to connect to the database
config = ConfigParser()
config.read(os.path.join(os.path.abspath(os.path.dirname(__file__)), 'docload.ini'))
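
# A minimal docload.ini might look like the following; the dsn value shown is
# only an example, and any libpq-style connection string accepted by psycopg2
# will work:
#
#   [db]
#   dsn=dbname=pgweb host=localhost user=pgweb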

# Load a tarfile or a "naked" directory
if os.path.isfile(args[1]):
    generator = parse_tarfile(args[1])
elif os.path.isdir(args[1]):
    generator = parse_directory(args[1])
else:
    print("File or directory %s not found" % args[1])
    sys.exit(1)

connection = psycopg2.connect(config.get('db', 'dsn'))

if not quiet:
    print("Starting load of documentation for version %s." % (ver, ))

curs = connection.cursor()

# Verify that the version we're loading exists, and whether it is current
curs.execute("SELECT current FROM core_version WHERE tree=%(v)s", {'v': ver})
r = curs.fetchall()
if len(r) != 1:
    print("Version %s not found in the system, cannot load!" % ver)
    sys.exit(1)

iscurrent = r[0][0]

# begin creating a CSV that will be used to import the documentation into
# the database
s = io.StringIO()
c = csv.writer(s, delimiter=';', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)

# count each page as it is parsed so the load can be validated against the
# number of rows that actually reach the database
pagecount = 0

# Import each page of documentation
for filename, getter in generator:
    filename = os.path.basename(filename)
    f = getter()

    # determine if the file being loaded is an SVG or a regular doc file
    if filename.endswith('.svg'):
        load_svg_file(filename, f, c)
    else:
        load_doc_file(filename, f, c)

    # after successfully preparing the file for load, increase the page count
    pagecount += 1

if not quiet:
    print("Total parsed doc size: {:.1f} MB".format(s.tell() / (1024 * 1024)))

s.seek(0)

# Start loading the documentation into the database.
# First, load the newly discovered documentation into a temporary table, where
# we can validate that we loaded exactly the number of docs that we thought we
# would, based on the page counter
curs.execute("CREATE TEMP TABLE docsload (file varchar(64) NOT NULL, version numeric(3,1) NOT NULL, title varchar(256) NOT NULL, content text)")
curs.copy_expert("COPY docsload FROM STDIN WITH CSV DELIMITER AS ';'", s)
if curs.rowcount != pagecount:
    print("Loaded invalid number of rows! {} rows for {} pages!".format(curs.rowcount, pagecount))
    sys.exit(1)

numchanges = 0

# If the previous step succeeded, delete any documentation for the specified
# version that no longer exists, and insert into / update the docs table with
# the content that was loaded into the temporary table
curs.execute("DELETE FROM docs WHERE version=%(version)s AND NOT EXISTS (SELECT 1 FROM docsload WHERE docsload.file=docs.file)", {
    'version': ver,
})
numchanges += curs.rowcount
if not quiet:
    print("Deleted {} orphaned doc pages.".format(curs.rowcount))

curs.execute("INSERT INTO docs (file, version, title, content) SELECT file, version, title, content FROM docsload WHERE NOT EXISTS (SELECT 1 FROM docs WHERE docs.file=docsload.file AND docs.version=%(version)s)", {
    'version': ver,
})
numchanges += curs.rowcount
if not quiet:
    print("Inserted {} new doc pages.".format(curs.rowcount))

curs.execute("UPDATE docs SET title=l.title, content=l.content FROM docsload l WHERE docs.version=%(version)s AND docs.file=l.file AND (docs.title != l.title OR docs.content != l.content)", {
    'version': ver,
})
numchanges += curs.rowcount
if not quiet:
    print("Updated {} changed doc pages.".format(curs.rowcount))

if numchanges > 0:
    # Update the docs loaded timestamp
    if ver == "0" and options.git:
        githash = options.git
    else:
        githash = ''
    curs.execute("UPDATE core_version SET docsloaded=CURRENT_TIMESTAMP, docsgit=%(git)s WHERE tree=%(v)s", {
        'v': ver,
        'git': githash,
    })

    # Issue varnish purge for all docs of this version
    if ver == "0":
        # Special handling of developer docs...
        ver = "devel"

    curs.execute("SELECT varnish_purge_xkey('pgdocs_{}')".format(ver))
    curs.execute("SELECT varnish_purge_xkey('pgdocs_all')")
    if iscurrent:
        curs.execute("SELECT varnish_purge_xkey('pgdocs_current')")

# ensure the changes are committed, and close the connection
connection.commit()
connection.close()

if not quiet:
    print("Done loading docs version %s (%i pages)." % (ver, pagecount))
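
# Example invocations (paths and the git hash are illustrative only):
#   ./docload.py 17 /tmp/postgresql-17.0.tar.gz
#   ./docload.py -g 0123abc 0 /usr/src/postgresql/doc/src/sgml/html
# A version of "0" loads the developer ("devel") docs, which is the only case
# where the --git hash is recorded.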