#!/usr/bin/env python
#
# localhtmlvalidate.py - validate local HTML for pgweb
#
# This is a small tool that you run to validate the HTML output of your
# localhost pgweb installation against the W3C validator. Give it the
# localhost:8000 URL (or other, depending on what port you're running the
# local server on), and it will give you a list of possible issues with
# the page.
#
# In theory it can be used just fine for non-pgweb pages as well, but
# for obvious reasons the functionality to show the source line number
# based on the pgweb templates won't work.
#

import sys
import urllib
import httplib
import re
import HTMLParser

# Separator string placed between the parts of the multipart/form-data body.
BOUNDARY = "-=--=foobar-=--="


def encode_multipart_formdata(fields, files):
    """Build a multipart/form-data request body.

    fields is an iterable of (name, value) pairs sent as plain form fields.
    files is an iterable of (name, filename, value) triples; each is sent as
    a file upload with a text/html content type.

    Returns the body as one string with CRLF line endings, using the
    module-level BOUNDARY as the part separator.
    """
    delimiter = '--' + BOUNDARY
    parts = []

    # Plain form fields: part header, blank line, then the value.
    for name, text in fields:
        parts.extend([
            delimiter,
            'Content-Disposition: form-data; name="%s"' % name,
            '',
            text,
        ])

    # File uploads additionally carry a filename and a content type.
    for name, upload_name, payload in files:
        parts.extend([
            delimiter,
            'Content-Disposition: form-data; name="%s"; filename="%s"' % (name, upload_name),
            'Content-Type: text/html',
            '',
            payload,
        ])

    # Closing delimiter, followed by an empty entry so the body ends in CRLF.
    parts.extend([delimiter + '--', ''])
    return "\r\n".join(parts)


if __name__ == "__main__":
    # Script entry point: fetch the page at the URL given on the command
    # line, upload it to the W3C validator, and print what the validator
    # reports. Exits 0 when the page validates and 1 on usage errors or
    # when no usable validator status is returned.
    if len(sys.argv) != 2:
        print("Usage: localhtmlvalidate.py <local url>")
        sys.exit(1)

    # Close the connection to the local server explicitly once it is read.
    page = urllib.urlopen(sys.argv[1])
    try:
        contents = page.read()
    finally:
        page.close()

    # Try to figure out where the actual contents start :)
    try:
        firstline = contents.splitlines().index('<div id="pgContentWrap">')
    except ValueError:
        firstline = 0

    # Generate a form body
    body = encode_multipart_formdata(
        [
            ('charset', 'utf-8'),
            ('doctype', 'inline'),
            ('group', '0'),
            ('verbose', '1'),
        ],
        [('uploaded_file', 'test.html', contents)]
    )

    # Now submit it to the w3c validator
    h = httplib.HTTP("validator.w3.org")
    h.putrequest("POST", "/check")
    # putheader() takes the header name and its value as separate arguments;
    # passing "Name: value" as one string produces a malformed header.
    h.putheader("User-Agent", "localcheck-tester/0.0")
    h.putheader("content-type", "multipart/form-data; boundary=%s" % BOUNDARY)
    h.putheader("content-length", str(len(body)))
    h.endheaders()
    h.send(body)
    errcode, errmsg, headers = h.getreply()
    if headers is None:
        # NOTE(review): getreply() appears to return no headers when the
        # status line cannot be parsed - confirm against httplib docs.
        print("No usable reply from the validator: %s %s" % (errcode, errmsg))
        sys.exit(1)

    # Decode once at the I/O boundary so all matching below works on text;
    # undecodable bytes are replaced rather than aborting the report.
    rbody = h.getfile().read().decode('utf-8', 'replace')

    # .get() so a missing header is reported instead of raising KeyError.
    status = headers.get('x-w3c-validator-status')
    if status == 'Valid':
        print("Page validates!")
        sys.exit(0)
    elif status == 'Invalid':
        print("Invalid!")
        print("Errors: %s" % headers.get('x-w3c-validator-errors'))
        print("Warnings: %s" % headers.get('x-w3c-validator-warnings'))
        hp = HTMLParser.HTMLParser()
        for m in re.findall(r'<li class="msg_err">.*?</li>', rbody, re.DOTALL):
            r = re.search(r'<em>Line (\d+).*<span class="msg">(.*?)</span>', m, re.DOTALL)
            if r is None:
                # The validator markup did not look as expected; say so
                # instead of crashing on r.group().
                print("Could not parse a validator message, skipping it")
                print("")
                continue

            # The validator's line number refers to the whole page; subtract
            # the offset of the content wrapper found above.
            message = "Line %s (should be around %s): %s" % (
                r.group(1),
                int(r.group(1)) - firstline,
                hp.unescape(r.group(2)),
            )
            print(message.encode('utf-8'))

            r2 = re.search(r'<code class="input">(.*?)<strong title=".*?">(.*?)</strong>(.*?)</code>', m, re.DOTALL)
            if r2:
                # Join the text before, inside and after the highlighted
                # <strong> element back into one source excerpt.
                s = "%s%s%s" % r2.groups()
                print("Source: %s" % hp.unescape(s).encode('utf-8'))
            print("")
    else:
        print("Unknown status: %s" % status)
        print(headers)
        sys.exit(1)