summaryrefslogtreecommitdiff
path: root/tools/docs
diff options
context:
space:
mode:
author: Magnus Hagander, 2009-09-14 12:39:25 +0000
committer: Magnus Hagander, 2009-09-14 12:39:25 +0000
commit: 90b758c247ad4f630f1775c6154daaef62284f52 (patch)
tree: 800d94792715e6b57d14cc7da618a771abb036e9 /tools/docs
A first very basic import.
Contains basic functionality, and an import of most of the static content from the old site. There is still plenty more to do...
Diffstat (limited to 'tools/docs')
-rwxr-xr-x tools/docs/docload.py 87
1 files changed, 87 insertions, 0 deletions
diff --git a/tools/docs/docload.py b/tools/docs/docload.py
new file mode 100755
index 00000000..5abc2f73
--- /dev/null
+++ b/tools/docs/docload.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python
+
+# Script to load documentation from tarballs
+
+import sys
+import os
+import tarfile
+import re
+import tidy
+import psycopg2
+
+# Number of pages inserted so far; incremented by load_doc_file()
+pagecount = 0
+
+# Captures the text between <title> and </title>, case-insensitively.
+# Raw string so that \s reaches the regex engine as written instead of
+# relying on Python passing an unknown string escape through untouched.
+re_titlematch = re.compile(r'<title\s*>([^<]+)</title\s*>', re.IGNORECASE)
+
+## Load a single page
+def load_doc_file(filename, f):
+    """Tidy one HTML page and insert it into the docs table.
+
+    filename -- value stored in the "file" column for this page
+    f        -- file-like object; read() must return the raw page bytes,
+                which are decoded as latin1
+
+    Uses the module-level names curs (database cursor) and ver (value for
+    the "version" column), which must be set before the first call, and
+    increments the module-level pagecount once per inserted page.
+    """
+    global pagecount
+
+    tidyopts = dict(drop_proprietary_attributes=1,
+                    alt_text='',
+                    hide_comments=1,
+                    output_xhtml=1,
+                    show_body_only=1,
+                    clean=1,
+                    char_encoding='utf8',
+                    indent='auto',
+                    )
+
+    contents = unicode(f.read(), 'latin1')
+    tm = re_titlematch.search(contents)
+    if tm:
+        title = tm.group(1)
+    else:
+        title = ""
+
+    # title is unicode; printing non-ASCII unicode raises UnicodeEncodeError
+    # when stdout has no encoding (pipe, cron). Encode explicitly and replace
+    # whatever the target encoding cannot represent. Progress output only --
+    # the database still receives the unmodified title.
+    out_encoding = getattr(sys.stdout, 'encoding', None) or 'utf-8'
+    print "--- file: %s (%s) ---" % (filename,
+                                     title.encode(out_encoding, 'replace'))
+
+    # tidy is handed UTF-8 bytes, matching char_encoding above
+    s = tidy.parseString(contents.encode('utf-8'), **tidyopts)
+    curs.execute("INSERT INTO docs (file, version, title, content) VALUES (%(f)s, %(v)s, %(t)s, %(c)s)", {
+        'f': filename,
+        'v': ver,
+        't': title,
+        'c': str(s),
+    })
+    pagecount += 1
+
+## Show the command-line synopsis and terminate with exit status 1
+def Usage():
+    synopsis = "Usage: docload.py <version> <tarfile>"
+    print synopsis
+    raise SystemExit(1)
+
+## Main execution: replace the stored docs for <version> with the HTML pages
+## found in <tarfile>
+
+if len(sys.argv) != 3:
+    Usage()
+
+ver = sys.argv[1]
+tarfilename = sys.argv[2]
+
+if not os.path.isfile(tarfilename):
+    print "File %s not found" % tarfilename
+    sys.exit(1)
+
+# Pages are either loose files below <topdir>/doc/src/sgml/html/ or packed
+# inside a nested <topdir>/doc/postgres.tar.gz (dots escaped: match literally)
+re_htmlfile = re.compile(r'[^/]*/doc/src/sgml/html/.*')
+re_tarfile = re.compile(r'[^/]*/doc/postgres\.tar\.gz$')
+
+tf = tarfile.open(tarfilename)
+db = psycopg2.connect('host=/tmp dbname=pgweb')
+try:
+    curs = db.cursor()
+    # Remove any old docs for this version (still protected by a transaction
+    # while we perform the load)
+    curs.execute("DELETE FROM docs WHERE version=%(v)s", {'v': ver})
+
+    for member in tf:
+        if re_htmlfile.match(member.name):
+            f = tf.extractfile(member)
+            # extractfile() returns None for directories and other members
+            # without file content; skip those instead of crashing in read()
+            if f is not None:
+                load_doc_file(os.path.basename(member.name), f)
+        if re_tarfile.match(member.name):
+            inner_tar = tarfile.open(fileobj=tf.extractfile(member))
+            try:
+                for inner_member in inner_tar:
+                    if inner_member.name.endswith(('.html', '.htm')):
+                        f = inner_tar.extractfile(inner_member)
+                        if f is not None:
+                            load_doc_file(inner_member.name, f)
+            finally:
+                inner_tar.close()
+
+    db.commit()
+finally:
+    # Runs on failure too: closing the connection without a commit discards
+    # the DELETE and any partial load
+    tf.close()
+    db.close()
+
+print "Done (%i pages)." % pagecount
+