diff options
Diffstat (limited to 'tools/docs/docload.py')
| -rwxr-xr-x | tools/docs/docload.py | 87 |
1 files changed, 87 insertions, 0 deletions
diff --git a/tools/docs/docload.py b/tools/docs/docload.py new file mode 100755 index 00000000..5abc2f73 --- /dev/null +++ b/tools/docs/docload.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python + +# Script to load documentation from tarballs + +import sys +import os +import tarfile +import re +import tidy +import psycopg2 + +pagecount = 0 + +re_titlematch = re.compile('<title\s*>([^<]+)</title\s*>', re.IGNORECASE) + +## Load a single page +def load_doc_file(filename, f): + tidyopts = dict(drop_proprietary_attributes=1, + alt_text='', + hide_comments=1, + output_xhtml=1, + show_body_only=1, + clean=1, + char_encoding='utf8', + indent='auto', + ) + + contents = unicode(f.read(),'latin1') + tm = re_titlematch.search(contents) + if tm: + title = tm.group(1) + else: + title = "" + print "--- file: %s (%s) ---" % (filename, title) + + s = tidy.parseString(contents.encode('utf-8'), **tidyopts) + curs.execute("INSERT INTO docs (file, version, title, content) VALUES (%(f)s, %(v)s, %(t)s, %(c)s)",{ + 'f': filename, + 'v': ver, + 't': title, + 'c': str(s), + }) + global pagecount + pagecount += 1 + +## Your typical usage message +def Usage(): + print "Usage: docload.py <version> <tarfile>" + sys.exit(1) + +## Main execution + +if len(sys.argv) != 3: + Usage() + +ver = sys.argv[1] +tarfilename = sys.argv[2] + +if not os.path.isfile(tarfilename): + print "File %s not found" % tarfilename + sys.exit(1) + +tf = tarfile.open(tarfilename) + +db = psycopg2.connect('host=/tmp dbname=pgweb') +curs = db.cursor() +# Remove any old docs for this version (still protected by a transaction while +# we perform the load) +curs.execute("DELETE FROM docs WHERE version=%(v)s", {'v': ver}) + + +re_htmlfile = re.compile('[^/]*/doc/src/sgml/html/.*') +re_tarfile = re.compile('[^/]*/doc/postgres.tar.gz$') +for member in tf: + if re_htmlfile.match(member.name): + load_doc_file(os.path.basename(member.name), tf.extractfile(member)) + if re_tarfile.match(member.name): + f = tf.extractfile(member) + inner_tar = tarfile.open(fileobj=f) + for inner_member in inner_tar: + if inner_member.name.endswith('.html') or inner_member.name.endswith('.htm'): + load_doc_file(inner_member.name, inner_tar.extractfile(inner_member)) +tf.close() + +db.commit() +print "Done (%i pages)." % pagecount + |
