diff options
author | Magnus Hagander | 2018-11-30 02:36:02 +0000 |
---|---|---|
committer | Magnus Hagander | 2018-11-30 02:36:02 +0000 |
commit | 33907ea5a6f953ca679196f491edfff42173ad3e (patch) | |
tree | 039c3a07797b643d03c845f8096633652fd74021 /loader/lib/parser.py | |
parent | 68e8bb7775a846ce7fda3a70395e66778f36a0ed (diff) |
Switch to using tidylib rather than tidy
tidylib (http://countergram.github.io/pytidylib/) is maintained,
the old tidy one (https://cihar.com/software/utidylib/) is not. And in
particular, python3 support is in the new one.
Generates some minor changes in the existing archives, but they seem to be
just whitespace and corrections of some actual incorrectness in the old output.
Diffstat (limited to 'loader/lib/parser.py')
-rw-r--r-- | loader/lib/parser.py | 23 |
1 files changed, 20 insertions, 3 deletions
diff --git a/loader/lib/parser.py b/loader/lib/parser.py index 871cea7..51fb26d 100644 --- a/loader/lib/parser.py +++ b/loader/lib/parser.py @@ -6,7 +6,7 @@ from email.parser import Parser from email.header import decode_header from email.errors import HeaderParseError from HTMLParser import HTMLParser, HTMLParseError -import tidy +import tidylib import StringIO from lib.exception import IgnorableException @@ -204,7 +204,7 @@ class ArchivesParser(object): if b: b = self.html_clean(b) if b: return b - if b == '': + if b == '' or b is None: hasempty = True if hasempty: @@ -483,7 +483,24 @@ class ArchivesParser(object): def html_clean(self, html): # First we pass it through tidy - html = unicode(str(tidy.parseString(html.encode('utf8'), drop_proprietary_attributes=1, alt_text='',hide_comments=1,output_xhtml=1,show_body_only=1,clean=1,char_encoding='utf8')), 'utf8') + (html, errors) = tidylib.tidy_document(html, + options={ + 'drop-proprietary-attributes': 1, + 'alt-text': '', + 'hide-comments': 1, + 'output-xhtml': 1, + 'show-body-only': 1, + 'clean': 1, + 'char-encoding': 'utf8', + 'show-warnings': 0, + 'show-info': 0, + }) + if errors: + print("HTML tidy failed for %s!" % self.msgid) + print(errors) + return None + if type(html) == str: + html = unicode(html, 'utf8') try: cleaner = HTMLCleaner() |