summary | refs | log | tree | commit | diff
path: root/loader/lib/parser.py
diff options
context:
space:
mode:
author: Magnus Hagander, 2019-01-03 10:04:29 +0000
committer: Magnus Hagander, 2019-01-03 10:04:29 +0000
commit: bb5775efe5f938461537e0c95c7c110875e4718b (patch)
tree: d6159fc5773caa5fe50c113c3790bd1bc63a7653 /loader/lib/parser.py
parent: 46372add400ffa5295969aa2e4e8c468d4ada937 (diff)
Update loader scripts to use python3 syntax
Some minor cleanups as well, but mostly just the output of the 2to3 tool and some manual changes.
Diffstat (limited to 'loader/lib/parser.py')
-rw-r--r-- loader/lib/parser.py | 70
1 files changed, 39 insertions, 31 deletions
diff --git a/loader/lib/parser.py b/loader/lib/parser.py
index 15009c4..8ee25c5 100644
--- a/loader/lib/parser.py
+++ b/loader/lib/parser.py
@@ -2,23 +2,24 @@ import re
import datetime
import dateutil.parser
-from email.parser import Parser
-from email.header import decode_header
+from email.parser import BytesParser
+from email.header import decode_header, Header
from email.errors import HeaderParseError
-from HTMLParser import HTMLParser, HTMLParseError
+from email.policy import compat32
+from html.parser import HTMLParser
import tidylib
-import StringIO
+import io
from lib.exception import IgnorableException
from lib.log import log
class ArchivesParser(object):
def __init__(self):
- self.parser = Parser()
+ self.parser = BytesParser(policy=compat32)
def parse(self, stream):
self.rawtxt = stream.read()
- self.msg = self.parser.parse(StringIO.StringIO(self.rawtxt))
+ self.msg = self.parser.parse(io.BytesIO(self.rawtxt))
def is_msgid(self, msgid):
# Look for a specific messageid. This means we might parse it twice,
@@ -26,7 +27,7 @@ class ArchivesParser(object):
try:
if self.clean_messageid(self.decode_mime_header(self.get_mandatory('Message-ID'))) == msgid:
return True
- except Exception, e:
+ except Exception as e:
return False
def analyze(self, date_override=None):
@@ -49,13 +50,13 @@ class ArchivesParser(object):
self.parents = []
# The first one is in-reply-to, if it exists
if self.get_optional('in-reply-to'):
- m = self.clean_messageid(self.get_optional('in-reply-to'), True)
+ m = self.clean_messageid(self.decode_mime_header(self.get_optional('in-reply-to')), True)
if m:
self.parents.append(m)
# Then we add all References values, in backwards order
if self.get_optional('references'):
- cleaned_msgids = [self.clean_messageid(x, True) for x in reversed(self.get_optional('references').split())]
+ cleaned_msgids = [self.clean_messageid(x, True) for x in reversed(self.decode_mime_header(self.get_optional('references')).split())]
# Can't do this with a simple self.parents.extend() due to broken
# mailers that add the same reference more than once. And we can't
# use a set() to make it unique, because order is very important
@@ -130,19 +131,19 @@ class ArchivesParser(object):
params = msg.get_params()
if not params:
# No content-type, so we assume us-ascii
- return unicode(b, 'us-ascii', errors='ignore')
+ return str(b, 'us-ascii', errors='ignore')
for k,v in params:
if k.lower() == 'charset':
charset = v
break
if charset:
try:
- return unicode(b, self.clean_charset(charset), errors='ignore')
- except LookupError, e:
+ return str(b, self.clean_charset(charset), errors='ignore')
+ except LookupError as e:
raise IgnorableException("Failed to get unicode payload: %s" % e)
else:
# XXX: reasonable default?
- return unicode(b, errors='ignore')
+ return str(b, errors='ignore')
# Return None or empty string, depending on what we got back
return b
@@ -154,8 +155,8 @@ class ArchivesParser(object):
if b:
# Python bug 9133, allows unicode surrogate pairs - which PostgreSQL will
# later reject..
- if b.find(u'\udbff\n\udef8'):
- b = b.replace(u'\udbff\n\udef8', '')
+ if b.find('\udbff\n\udef8'):
+ b = b.replace('\udbff\n\udef8', '')
# Remove postgres specific mail footer - if it's there
m = self._re_footer.match(b)
@@ -249,15 +250,15 @@ class ArchivesParser(object):
# If this is a header-encoded filename, start by decoding that
if filename.startswith('=?'):
decoded, encoding = decode_header(filename)[0]
- return unicode(decoded, encoding, errors='ignore')
+ return str(decoded, encoding, errors='ignore')
# If it's already unicode, just return it
- if isinstance(filename, unicode):
+ if isinstance(filename, str):
return filename
# Anything that's not UTF8, we just get rid of. We can live with
# filenames slightly mangled in this case.
- return unicode(filename, 'utf-8', errors='ignore')
+ return str(filename, 'utf-8', errors='ignore')
def _extract_filename(self, container):
# Try to get the filename for an attachment in the container.
@@ -324,7 +325,7 @@ class ArchivesParser(object):
# by majordomo with the footer. So if that one is present,
# we need to explicitly exclude it again.
b = container.get_payload(decode=True)
- if not self._re_footer.match(b):
+ if isinstance(b, str) and not self._re_footer.match(b):
# We know there is no name for this one
self.attachments.append((None, container.get_content_type(), b))
return
@@ -423,9 +424,14 @@ class ArchivesParser(object):
# enough...
dp = datetime.datetime(*dp.utctimetuple()[:6])
return dp
- except Exception, e:
+ except Exception as e:
raise IgnorableException("Failed to parse date '%s': %s" % (d, e))
+ def _maybe_decode(self, s, charset):
+ if isinstance(s, str):
+ return s.strip(' ')
+ return str(s, charset and self.clean_charset(charset) or 'us-ascii', errors='ignore').strip(' ')
+
# Workaround for broken quoting in some MUAs (see below)
_re_mailworkaround = re.compile('"(=\?[^\?]+\?[QB]\?[^\?]+\?=)"', re.IGNORECASE)
def _decode_mime_header(self, hdr, email_workaround):
@@ -449,28 +455,32 @@ class ArchivesParser(object):
hdr = self._re_mailworkaround.sub(r'\1', hdr)
try:
- return " ".join([unicode(s, charset and self.clean_charset(charset) or 'us-ascii', errors='ignore') for s,charset in decode_header(hdr)])
- except HeaderParseError, e:
+ return " ".join([self._maybe_decode(s, charset) for s, charset in decode_header(hdr)])
+ except HeaderParseError as e:
# Parser error is typically someone specifying an encoding,
# but then not actually using that encoding. We'll do the best
# we can, which is cut it down to ascii and ignore errors
- return unicode(hdr, 'us-ascii', errors='ignore')
+ return str(hdr, 'us-ascii', errors='ignore').strip(' ')
def decode_mime_header(self, hdr, email_workaround=False):
try:
+ if isinstance(hdr, Header):
+ hdr = hdr.encode()
+
h = self._decode_mime_header(hdr, email_workaround)
if h:
return h.replace("\0", "")
return ''
- except LookupError, e:
+ except LookupError as e:
raise IgnorableException("Failed to decode header value '%s': %s" % (hdr, e))
- except ValueError, ve:
+ except ValueError as ve:
raise IgnorableException("Failed to decode header value '%s': %s" % (hdr, ve))
def get_mandatory(self, fieldname):
try:
x = self.msg[fieldname]
- if x==None: raise Exception()
+ if x==None:
+ raise Exception()
return x
except:
raise IgnorableException("Mandatory field '%s' is missing" % fieldname)
@@ -496,17 +506,15 @@ class ArchivesParser(object):
'show-info': 0,
})
if errors:
- print("HTML tidy failed for %s!" % self.msgid)
+ print(("HTML tidy failed for %s!" % self.msgid))
print(errors)
return None
- if type(html) == str:
- html = unicode(html, 'utf8')
try:
cleaner = HTMLCleaner()
cleaner.feed(html)
return cleaner.get_text()
- except HTMLParseError, e:
+ except Exception as e:
# Failed to parse the html, thus failed to clean it. so we must
# give up...
return None
@@ -515,7 +523,7 @@ class ArchivesParser(object):
class HTMLCleaner(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
- self.io = StringIO.StringIO()
+ self.io = io.StringIO()
def get_text(self):
return self.io.getvalue()