summaryrefslogtreecommitdiff
path: root/loader/lib
diff options
context:
space:
mode:
authorMagnus Hagander2018-11-30 02:36:02 +0000
committerMagnus Hagander2018-11-30 02:36:02 +0000
commit33907ea5a6f953ca679196f491edfff42173ad3e (patch)
tree039c3a07797b643d03c845f8096633652fd74021 /loader/lib
parent68e8bb7775a846ce7fda3a70395e66778f36a0ed (diff)
Switch to using tidylib rather than tidy
tidylib (http://countergram.github.io/pytidylib/) is maintained; the old tidy one (https://cihar.com/software/utidylib/) is not. In particular, Python 3 support is only in the new one. This generates some minor changes in the existing archives, but they seem to be just whitespace and some actual incorrectness in the old output.
Diffstat (limited to 'loader/lib')
-rw-r--r--loader/lib/parser.py23
1 files changed, 20 insertions, 3 deletions
diff --git a/loader/lib/parser.py b/loader/lib/parser.py
index 871cea7..51fb26d 100644
--- a/loader/lib/parser.py
+++ b/loader/lib/parser.py
@@ -6,7 +6,7 @@ from email.parser import Parser
from email.header import decode_header
from email.errors import HeaderParseError
from HTMLParser import HTMLParser, HTMLParseError
-import tidy
+import tidylib
import StringIO
from lib.exception import IgnorableException
@@ -204,7 +204,7 @@ class ArchivesParser(object):
if b:
b = self.html_clean(b)
if b: return b
- if b == '':
+ if b == '' or b is None:
hasempty = True
if hasempty:
@@ -483,7 +483,24 @@ class ArchivesParser(object):
def html_clean(self, html):
# First we pass it through tidy
- html = unicode(str(tidy.parseString(html.encode('utf8'), drop_proprietary_attributes=1, alt_text='',hide_comments=1,output_xhtml=1,show_body_only=1,clean=1,char_encoding='utf8')), 'utf8')
+ (html, errors) = tidylib.tidy_document(html,
+ options={
+ 'drop-proprietary-attributes': 1,
+ 'alt-text': '',
+ 'hide-comments': 1,
+ 'output-xhtml': 1,
+ 'show-body-only': 1,
+ 'clean': 1,
+ 'char-encoding': 'utf8',
+ 'show-warnings': 0,
+ 'show-info': 0,
+ })
+ if errors:
+ print("HTML tidy failed for %s!" % self.msgid)
+ print(errors)
+ return None
+ if type(html) == str:
+ html = unicode(html, 'utf8')
try:
cleaner = HTMLCleaner()