Switch to using tidylib rather than tidy

tidylib (http://countergram.github.io/pytidylib/) is maintained; the old tidy one (https://cihar.com/software/utidylib/) is not. In particular, python3 support is in the new one. This generates some minor changes in the existing archives, but they seem to be just whitespace differences and fixes for some actual incorrectness in the old output.
author: Magnus Hagander 2018-11-30 02:36:02 +0000
committer: Magnus Hagander 2018-11-30 02:36:02 +0000
commit: 33907ea5a6f953ca679196f491edfff42173ad3e (patch)
tree: 039c3a07797b643d03c845f8096633652fd74021 /loader/lib
parent: 68e8bb7775a846ce7fda3a70395e66778f36a0ed (diff)
1 files changed, 20 insertions, 3 deletions
diff --git a/loader/lib/parser.py b/loader/lib/parser.py
index 871cea7..51fb26d 100644
--- a/loader/lib/parser.py
+++ b/loader/lib/parser.py
@@ -6,7 +6,7 @@ from email.parser import Parser
 from email.header import decode_header
 from email.errors import HeaderParseError
 from HTMLParser import HTMLParser, HTMLParseError
-import tidy
+import tidylib
 import StringIO
 
 from lib.exception import IgnorableException
@@ -204,7 +204,7 @@ class ArchivesParser(object):
 		if b:
 			b = self.html_clean(b)
 			if b: return b
-		if b == '':
+		if b == '' or b is None:
 			hasempty = True
 
 		if hasempty:
@@ -483,7 +483,24 @@ class ArchivesParser(object):
 
 	def html_clean(self, html):
 		# First we pass it through tidy
-		html = unicode(str(tidy.parseString(html.encode('utf8'), drop_proprietary_attributes=1, alt_text='',hide_comments=1,output_xhtml=1,show_body_only=1,clean=1,char_encoding='utf8')), 'utf8')
+		(html, errors) = tidylib.tidy_document(html,
+											   options={
+												   'drop-proprietary-attributes': 1,
+												   'alt-text': '',
+												   'hide-comments': 1,
+												   'output-xhtml': 1,
+												   'show-body-only': 1,
+												   'clean': 1,
+												   'char-encoding': 'utf8',
+												   'show-warnings': 0,
+												   'show-info': 0,
+												   })
+		if errors:
+			print("HTML tidy failed for %s!" % self.msgid)
+			print(errors)
+			return None
+		if type(html) == str:
+			html = unicode(html, 'utf8')
 
 		try:
 			cleaner = HTMLCleaner()
author	Magnus Hagander	2018-11-30 02:36:02 +0000
committer	Magnus Hagander	2018-11-30 02:36:02 +0000
commit	33907ea5a6f953ca679196f491edfff42173ad3e (patch)
tree	039c3a07797b643d03c845f8096633652fd74021 /loader/lib
parent	68e8bb7775a846ce7fda3a70395e66778f36a0ed (diff)