path: root/loader/lib/parser.py
Diffstat (limited to 'loader/lib/parser.py')
-rw-r--r--  loader/lib/parser.py  1102
1 file changed, 551 insertions, 551 deletions
diff --git a/loader/lib/parser.py b/loader/lib/parser.py
index b97c8b3..81192da 100644
--- a/loader/lib/parser.py
+++ b/loader/lib/parser.py
@@ -14,560 +14,560 @@ from lib.exception import IgnorableException
from lib.log import log
class ArchivesParser(object):
- def __init__(self):
- self.parser = BytesParser(policy=compat32)
-
- def parse(self, stream):
- self.rawtxt = stream.read()
- self.msg = self.parser.parse(io.BytesIO(self.rawtxt))
-
- def is_msgid(self, msgid):
- # Look for a specific messageid. This means we might parse it twice,
- # but so be it. Any exception means we know it's not this one...
- try:
- if self.clean_messageid(self.decode_mime_header(self.get_mandatory('Message-ID'))) == msgid:
- return True
- except Exception as e:
- return False
-
- def analyze(self, date_override=None):
- self.msgid = self.clean_messageid(self.decode_mime_header(self.get_mandatory('Message-ID')))
- self._from = self.decode_mime_header(self.get_mandatory('From'), True)
- self.to = self.decode_mime_header(self.get_optional('To'), True)
- self.cc = self.decode_mime_header(self.get_optional('CC'), True)
- self.subject = self.decode_mime_header(self.get_optional('Subject'))
- if date_override:
- self.date = self.forgiving_date_decode(date_override)
- else:
- self.date = self.forgiving_date_decode(self.decode_mime_header(self.get_mandatory('Date')))
- self.bodytxt = self.get_body()
- self.attachments = []
- self.get_attachments()
- if len(self.attachments) > 0:
- log.status("Found %s attachments" % len(self.attachments))
-
- # Build a list of the message ids we are interested in
- self.parents = []
- # The first one is in-reply-to, if it exists
- if self.get_optional('in-reply-to'):
- m = self.clean_messageid(self.decode_mime_header(self.get_optional('in-reply-to')), True)
- if m:
- self.parents.append(m)
-
- # Then we add all References values, in backwards order
- if self.get_optional('references'):
- cleaned_msgids = [self.clean_messageid(x, True) for x in reversed(self.decode_mime_header(self.get_optional('references')).split())]
- # Can't do this with a simple self.parents.extend() due to broken
- # mailers that add the same reference more than once. And we can't
- # use a set() to make it unique, because order is very important
- for m in cleaned_msgids:
- if m and m not in self.parents:
- self.parents.append(m)
-
-
- def clean_charset(self, charset):
- lcharset = charset.lower()
- if lcharset == 'unknown-8bit' or lcharset == 'x-unknown' or lcharset == 'unknown':
- # Special case where we don't know... We'll assume
- # us-ascii and use replacements
- return 'us-ascii'
- if lcharset == '0' or lcharset == 'x-user-defined' or lcharset == '_autodetect_all' or lcharset == 'default_charset':
- # Seriously broken charset definitions, map to us-ascii
- # and throw away the rest with replacements
- return 'us-ascii'
- if lcharset == 'x-gbk':
- # Some MUAs set it to x-gbk, but there is a valid
- # declaration as gbk...
- return 'gbk'
- if lcharset == 'iso-8859-8-i':
- # -I is a special logical version, but should be the
- # same charset
- return 'iso-8859-8'
- if lcharset == 'windows-874':
- # This is an alias for iso-8859-11
- return 'iso-8859-11'
- if lcharset == 'iso-88-59-1' or lcharset == 'iso-8858-1':
- # Strange way of saying 8859....
- return 'iso-8859-1'
- if lcharset == 'iso885915':
- return 'iso-8859-15'
- if lcharset == 'iso-latin-2':
- return 'iso-8859-2'
- if lcharset == 'iso-850':
- # Strange spelling of cp850 (windows charset)
- return 'cp850'
- if lcharset == 'koi8r':
- return 'koi8-r'
- if lcharset == 'cp 1252':
- return 'cp1252'
- if lcharset == 'iso-8859-1,iso-8859-2' or lcharset == 'iso-8859-1:utf8:us-ascii':
- # Why did this show up more than once?!
- return 'iso-8859-1'
- if lcharset == 'x-windows-949':
- return 'ms949'
- if lcharset == 'pt_pt' or lcharset == 'de_latin' or lcharset == 'de':
- # This is a locale, and not a charset, but most likely it's this one
- return 'iso-8859-1'
- if lcharset == 'iso-8858-15':
- # How is this a *common* mistake?
- return 'iso-8859-15'
- if lcharset == 'macintosh':
- return 'mac_roman'
- if lcharset == 'cn-big5':
- return 'big5'
- if lcharset == 'x-unicode-2-0-utf-7':
- return 'utf-7'
- if lcharset == 'tscii':
- # No support for this charset :S Map it down to ascii
- # and throw away all the rest. Sucks, but we have to.
- return 'us-ascii'
- return charset
-
- def get_payload_as_unicode(self, msg):
- try:
- b = msg.get_payload(decode=True)
- except AssertionError:
- # Badly encoded data can throw an exception here, where the python
- # libraries fail to handle it and enter a cannot-happen path.
- # In which case we just ignore it and hope for a better MIME part later.
- b = None
-
- if b:
- # Find out if there is a charset
- charset = None
- params = msg.get_params()
- if not params:
- # No content-type, so we assume us-ascii
- return str(b, 'us-ascii', errors='ignore')
- for k,v in params:
- if k.lower() == 'charset':
- charset = v
- break
- if charset:
- try:
- return str(b, self.clean_charset(charset), errors='ignore')
- except LookupError as e:
- raise IgnorableException("Failed to get unicode payload: %s" % e)
- else:
- # XXX: reasonable default?
- return str(b, errors='ignore')
- # Return None or empty string, depending on what we got back
- return b
-
- # Regular expression matching the PostgreSQL custom mail footer that
- # is appended to all emails.
- _re_footer = re.compile(r'(.*)--\s+\nSent via [^\s]+ mailing list \([^\)]+\)\nTo make changes to your subscription:\nhttp://www\.postgresql\.org/mailpref/[^\s]+\s*$', re.DOTALL)
- def get_body(self):
- b = self._get_body()
- if b:
- # Python bug 9133 allows unicode surrogate pairs, which PostgreSQL
- # will later reject.
- if b.find('\udbff\n\udef8') != -1:
- b = b.replace('\udbff\n\udef8', '')
-
- # Remove postgres specific mail footer - if it's there
- m = self._re_footer.match(b)
- if m:
- b = m.group(1)
-
- # Sometimes we end up with a trailing \0 when decoding long strings, so
- # replace it if it's there.
- # In fact, replace it everywhere, since it can also turn up in the middle
- # of a text when it's a really broken decoding.
- b = b.replace('\0', '')
-
- return b
-
- def _get_body(self):
- # This is where the magic happens - try to figure out what the body
- # of this message should render as.
- hasempty = False
-
- # First see if this is a single-part message that we can just
- # decode and go.
- b = self.get_payload_as_unicode(self.msg)
- if b: return b
- if b == '':
- # We found something, but it was empty. We'll keep looking as
- # there might be something better available, but make a note
- # that empty exists.
- hasempty = True
-
- # Ok, it's multipart. Find the first part that is text/plain,
- # and use that one. Do this recursively, since we may have something
- # like:
- # multipart/mixed:
- # multipart/alternative:
- # text/plain
- # text/html
- # application/octet-stream (attachment)
- b = self.recursive_first_plaintext(self.msg)
- if b: return b
- if b == '':
- hasempty = True
-
- # Couldn't find a plaintext. Look for the first HTML in that case.
- # Fallback, but what can we do at this point...
- b = self.recursive_first_plaintext(self.msg, True)
- if b:
- b = self.html_clean(b)
- if b: return b
- if b == '' or b is None:
- hasempty = True
-
- if hasempty:
- log.status('Found empty body in %s' % self.msgid)
- return ''
- raise IgnorableException("Don't know how to read the body from %s" % self.msgid)
-
- def recursive_first_plaintext(self, container, html_instead=False):
- pl = container.get_payload()
- if isinstance(pl, str):
- # This was not a multipart, but it leaked... Give up!
- return None
- for p in pl:
- if p.get_params() is None:
- # MIME multipart/mixed, but no MIME type on the part
- log.status("Found multipart/mixed in message '%s', but no MIME type on part. Trying text/plain." % self.msgid)
- return self.get_payload_as_unicode(p)
- if p.get_params()[0][0].lower() == 'text/plain':
- # Don't include it if it looks like an attachment
- if 'Content-Disposition' in p and p['Content-Disposition'].startswith('attachment'):
- continue
- return self.get_payload_as_unicode(p)
- if html_instead and p.get_params()[0][0].lower() == 'text/html':
- # Don't include it if it looks like an attachment
- if 'Content-Disposition' in p and p['Content-Disposition'].startswith('attachment'):
- continue
- return self.get_payload_as_unicode(p)
- if p.is_multipart():
- b = self.recursive_first_plaintext(p, html_instead)
- if b or b == '': return b
-
- # Yikes, nothing here! Hopefully we'll find something when
- # we continue looping at a higher level.
- return None
-
- def get_attachments(self):
- self.attachments_found_first_plaintext = False
- self.recursive_get_attachments(self.msg)
-
- # Clean a filename's encoding and return it as a unicode string
- def _clean_filename_encoding(self, filename):
- # If this is a header-encoded filename, start by decoding that
- if filename.startswith('=?'):
- decoded, encoding = decode_header(filename)[0]
- return str(decoded, encoding, errors='ignore')
-
- # If it's already unicode, just return it
- if isinstance(filename, str):
- return filename
-
- # Anything that's not UTF8, we just get rid of. We can live with
- # filenames slightly mangled in this case.
- return str(filename, 'utf-8', errors='ignore')
-
- def _extract_filename(self, container):
- # Try to get the filename for an attachment in the container.
- # If the standard library can figure one out, use that one.
- f = container.get_filename()
- if f: return self._clean_filename_encoding(f)
-
- # Failing that, some mailers set Content-Description to the
- # filename
- if 'Content-Description' in container:
- return self._clean_filename_encoding(container['Content-Description'])
- return None
-
- def recursive_get_attachments(self, container):
- # We start recursion in the "multipart" container if any
- if container.get_content_type() == 'multipart/mixed' or container.get_content_type() == 'multipart/signed':
- # Multipart - worth scanning into
- if not container.is_multipart():
- # Wow, this is broken. It's multipart/mixed, but doesn't
- # contain multiple parts.
- # Since we're just looking for attachments, let's just
- # ignore it...
- return
- for p in container.get_payload():
- if p.get_params() is None:
- continue
- self.recursive_get_attachments(p)
- elif container.get_content_type() == 'multipart/alternative':
- # Alternative is not an attachment (we decide)
- # It's typically plaintext + html
- self.attachments_found_first_plaintext = True
- return
- elif container.is_multipart():
- # Other kinds of multipart - nothing to collect from those (multipart/signed is handled above)
- return
- else:
- # Not a multipart.
- # Exclude specific contenttypes
- if container.get_content_type() == 'application/pgp-signature':
- return
- if container.get_content_type() in ('application/pkcs7-signature', 'application/x-pkcs7-signature'):
- return
- # For now, accept anything not text/plain
- if container.get_content_type() != 'text/plain':
- try:
- self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True)))
- except AssertionError:
- # Badly encoded data can throw an exception here, where the python
- # libraries fail to handle it and enter a cannot-happen path.
- # In which case we just ignore this attachment.
- return
- return
-
- # It's a text/plain, it might be worthwhile.
- # If it has a name, we consider it an attachment
- if not container.get_params():
- return
- for k,v in container.get_params():
- if k == 'name' and v != '':
- # Yes, it has a name
- try:
- self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True)))
- except AssertionError:
- # Badly encoded data can throw an exception here, where the python
- # libraries fail to handle it and enter a cannot-happen path.
- # In which case we just ignore this attachment.
- return
-
- return
-
- # If it's content-disposition=attachment, we also want to save it
- if 'Content-Disposition' in container and container['Content-Disposition'].startswith('attachment'):
- try:
- self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True)))
- except AssertionError:
- # Badly encoded data can throw an exception here, where the python
- # libraries fail to handle it and enter a cannot-happen path.
- # In which case we just ignore this attachment.
- return
-
- return
-
- # If we have already found one text/plain part, make all
- # further text/plain parts attachments
- if self.attachments_found_first_plaintext:
- # However, this will also *always* catch the MIME part added
- # by majordomo with the footer. So if that one is present,
- # we need to explicitly exclude it again.
- try:
- b = container.get_payload(decode=True)
- except AssertionError:
- # Badly encoded data can throw an exception here, where the python
- # libraries fail to handle it and enter a cannot-happen path.
- # In which case we just ignore this attachment.
- return
-
- if isinstance(b, str) and not self._re_footer.match(b):
- # We know there is no name for this one
- self.attachments.append((None, container.get_content_type(), b))
- return
-
- # Ok, so this was a plaintext that we ignored. Set the flag
- # that we have now ignored one, so we'll make the next one
- # an attachment.
- self.attachments_found_first_plaintext = True
- # No name, and text/plain, so ignore it
-
- re_msgid = re.compile(r'^\s*<(.*)>\s*')
- def clean_messageid(self, messageid, ignorebroken=False):
- m = self.re_msgid.match(messageid)
- if not m:
- if ignorebroken:
- log.status("Could not parse messageid '%s', ignoring it" % messageid)
- return None
- raise IgnorableException("Could not parse message id '%s'" % messageid)
- return m.group(1).replace(' ', '')
-
-# _date_multi_re = re.compile(' \((\w+\s\w+(\s+\w+)*|)\)$')
- # Now using [^\s] instead of \w, to work with japanese chars
- _date_multi_re = re.compile(r' \(([^\s]+\s[^\s]+(\s+[^\s]+)*|)\)$')
- _date_multi_re2 = re.compile(r' ([\+-]\d{4}) \([^)]+\)$')
- _date_multiminus_re = re.compile(r' -(-\d+)$')
- _date_offsetnoplus_re = re.compile(r' (\d{4})$')
- def forgiving_date_decode(self, d):
- if d.strip() == '':
- raise IgnorableException("Failed to parse empty date")
- # Strange timezones requiring manual adjustments
- if d.endswith('-7700 (EST)'):
- d = d.replace('-7700 (EST)', 'EST')
- if d.endswith('+6700 (EST)'):
- d = d.replace('+6700 (EST)', 'EST')
- if d.endswith('+-4-30'):
- d = d.replace('+-4-30', '+0430')
- if d.endswith('+1.00'):
- d = d.replace('+1.00', '+0100')
- if d.endswith('+-100'):
- d = d.replace('+-100', '+0100')
- if d.endswith('+500'):
- d = d.replace('+500', '+0500')
- if d.endswith('-500'):
- d = d.replace('-500', '-0500')
- if d.endswith('-700'):
- d = d.replace('-700', '-0700')
- if d.endswith('-800'):
- d = d.replace('-800', '-0800')
- if d.endswith('+05-30'):
- d = d.replace('+05-30', '+0530')
- if d.endswith('+0-900'):
- d = d.replace('+0-900', '-0900')
- if d.endswith('Mexico/General'):
- d = d.replace('Mexico/General','CDT')
- if d.endswith('Pacific Daylight Time'):
- d = d.replace('Pacific Daylight Time', 'PDT')
- if d.endswith(' ZE2'):
- d = d.replace(' ZE2',' +0200')
- if d.find('-Juin-') > 0:
- d = d.replace('-Juin-','-Jun-')
- if d.find('-Juil-') > 0:
- d = d.replace('-Juil-','-Jul-')
- if d.find(' 0 (GMT)') >= 0:
- d = d.replace(' 0 (GMT)',' +0000')
-
- if self._date_multiminus_re.search(d):
- d = self._date_multiminus_re.sub(' \\1', d)
-
- if self._date_offsetnoplus_re.search(d):
- d = self._date_offsetnoplus_re.sub('+\\1', d)
-
-
- # We have a number of dates in the format
- # "<full datespace> +0200 (MET DST)"
- # or similar. The problem comes from the space within the
- # parentheses, or from the contents of the parentheses being
- # completely empty.
- if self._date_multi_re.search(d):
- d = self._date_multi_re.sub('', d)
-
- # If the spec is instead
- # "<full datespace> +0200 (...)"
- # of any kind, we can just remove what's in the (), because the
- # parser is just going to rely on the fixed offset anyway.
- if self._date_multi_re2.search(d):
- d = self._date_multi_re2.sub(' \\1', d)
-
- try:
- dp = dateutil.parser.parse(d, fuzzy=True)
-
- # Some offsets are >16 hours, which postgresql will not
- # (for good reasons) accept
- if dp.utcoffset() and abs(dp.utcoffset().days * (24*60*60) + dp.utcoffset().seconds) > 60*60*16-1:
- # Convert it to a UTC timestamp using Python. It will give
- # us the right time, but the wrong timezone. Should be
- # enough...
- dp = datetime.datetime(*dp.utctimetuple()[:6])
- return dp
- except Exception as e:
- raise IgnorableException("Failed to parse date '%s': %s" % (d, e))
-
- def _maybe_decode(self, s, charset):
- if isinstance(s, str):
- return s.strip(' ')
- return str(s, charset and self.clean_charset(charset) or 'us-ascii', errors='ignore').strip(' ')
-
- # Workaround for broken quoting in some MUAs (see below)
- _re_mailworkaround = re.compile(r'"(=\?[^\?]+\?[QB]\?[^\?]+\?=)"', re.IGNORECASE)
- def _decode_mime_header(self, hdr, email_workaround):
- if hdr is None:
- return None
-
- # Per http://bugs.python.org/issue504152 (and lots of testing), it seems
- # we must get rid of the sequence \n\t at least in the header. If we
- # do this *before* doing any MIME decoding, we should be safe against
- # anybody *actually* putting that sequence in the header (since we
- # won't match the encoded contents)
- hdr = hdr.replace("\n\t"," ")
-
- # In at least some cases, at least gmail (and possibly other MUAs)
- # incorrectly put double quotes in the name/email field even when
- # it's encoded. That's not allowed - they have to be escaped - but
- # since there's a fair amount of those, we apply a regex to get
- # rid of them.
- m = self._re_mailworkaround.search(hdr)
- if m:
- hdr = self._re_mailworkaround.sub(r'\1', hdr)
-
- try:
- return " ".join([self._maybe_decode(s, charset) for s, charset in decode_header(hdr)])
- except HeaderParseError as e:
- # Parser error is typically someone specifying an encoding,
- # but then not actually using that encoding. We'll do the best
- # we can, which is cut it down to ascii and ignore errors
- return hdr.encode('us-ascii', errors='ignore').decode('us-ascii').strip(' ')
-
- def decode_mime_header(self, hdr, email_workaround=False):
- try:
- if isinstance(hdr, Header):
- hdr = hdr.encode()
-
- h = self._decode_mime_header(hdr, email_workaround)
- if h:
- return h.replace("\0", "")
- return ''
- except LookupError as e:
- raise IgnorableException("Failed to decode header value '%s': %s" % (hdr, e))
- except ValueError as ve:
- raise IgnorableException("Failed to decode header value '%s': %s" % (hdr, ve))
-
- def get_mandatory(self, fieldname):
- try:
- x = self.msg[fieldname]
- if x is None:
- raise Exception()
- return x
- except:
- raise IgnorableException("Mandatory field '%s' is missing" % fieldname)
-
- def get_optional(self, fieldname):
- try:
- return self.msg[fieldname]
- except:
- return ''
-
- def html_clean(self, html):
- # First we pass it through tidy
- (html, errors) = tidylib.tidy_document(html,
- options={
- 'drop-proprietary-attributes': 1,
- 'alt-text': '',
- 'hide-comments': 1,
- 'output-xhtml': 1,
- 'show-body-only': 1,
- 'clean': 1,
- 'char-encoding': 'utf8',
- 'show-warnings': 0,
- 'show-info': 0,
- })
- if errors:
- print(("HTML tidy failed for %s!" % self.msgid))
- print(errors)
- return None
-
- try:
- cleaner = HTMLCleaner()
- cleaner.feed(html)
- return cleaner.get_text()
- except Exception as e:
- # Failed to parse the html, thus failed to clean it, so we must
- # give up...
- return None
+ def __init__(self):
+ self.parser = BytesParser(policy=compat32)
+
+ def parse(self, stream):
+ self.rawtxt = stream.read()
+ self.msg = self.parser.parse(io.BytesIO(self.rawtxt))
+
+ def is_msgid(self, msgid):
+ # Look for a specific messageid. This means we might parse it twice,
+ # but so be it. Any exception means we know it's not this one...
+ try:
+ if self.clean_messageid(self.decode_mime_header(self.get_mandatory('Message-ID'))) == msgid:
+ return True
+ except Exception as e:
+ return False
+
+ def analyze(self, date_override=None):
+ self.msgid = self.clean_messageid(self.decode_mime_header(self.get_mandatory('Message-ID')))
+ self._from = self.decode_mime_header(self.get_mandatory('From'), True)
+ self.to = self.decode_mime_header(self.get_optional('To'), True)
+ self.cc = self.decode_mime_header(self.get_optional('CC'), True)
+ self.subject = self.decode_mime_header(self.get_optional('Subject'))
+ if date_override:
+ self.date = self.forgiving_date_decode(date_override)
+ else:
+ self.date = self.forgiving_date_decode(self.decode_mime_header(self.get_mandatory('Date')))
+ self.bodytxt = self.get_body()
+ self.attachments = []
+ self.get_attachments()
+ if len(self.attachments) > 0:
+ log.status("Found %s attachments" % len(self.attachments))
+
+ # Build a list of the message ids we are interested in
+ self.parents = []
+ # The first one is in-reply-to, if it exists
+ if self.get_optional('in-reply-to'):
+ m = self.clean_messageid(self.decode_mime_header(self.get_optional('in-reply-to')), True)
+ if m:
+ self.parents.append(m)
+
+ # Then we add all References values, in backwards order
+ if self.get_optional('references'):
+ cleaned_msgids = [self.clean_messageid(x, True) for x in reversed(self.decode_mime_header(self.get_optional('references')).split())]
+ # Can't do this with a simple self.parents.extend() due to broken
+ # mailers that add the same reference more than once. And we can't
+ # use a set() to make it unique, because order is very important
+ for m in cleaned_msgids:
+ if m and m not in self.parents:
+ self.parents.append(m)
+
+
+ def clean_charset(self, charset):
+ lcharset = charset.lower()
+ if lcharset == 'unknown-8bit' or lcharset == 'x-unknown' or lcharset == 'unknown':
+ # Special case where we don't know... We'll assume
+ # us-ascii and use replacements
+ return 'us-ascii'
+ if lcharset == '0' or lcharset == 'x-user-defined' or lcharset == '_autodetect_all' or lcharset == 'default_charset':
+ # Seriously broken charset definitions, map to us-ascii
+ # and throw away the rest with replacements
+ return 'us-ascii'
+ if lcharset == 'x-gbk':
+ # Some MUAs set it to x-gbk, but there is a valid
+ # declaration as gbk...
+ return 'gbk'
+ if lcharset == 'iso-8859-8-i':
+ # -I is a special logical version, but should be the
+ # same charset
+ return 'iso-8859-8'
+ if lcharset == 'windows-874':
+ # This is an alias for iso-8859-11
+ return 'iso-8859-11'
+ if lcharset == 'iso-88-59-1' or lcharset == 'iso-8858-1':
+ # Strange way of saying 8859....
+ return 'iso-8859-1'
+ if lcharset == 'iso885915':
+ return 'iso-8859-15'
+ if lcharset == 'iso-latin-2':
+ return 'iso-8859-2'
+ if lcharset == 'iso-850':
+ # Strange spelling of cp850 (windows charset)
+ return 'cp850'
+ if lcharset == 'koi8r':
+ return 'koi8-r'
+ if lcharset == 'cp 1252':
+ return 'cp1252'
+ if lcharset == 'iso-8859-1,iso-8859-2' or lcharset == 'iso-8859-1:utf8:us-ascii':
+ # Why did this show up more than once?!
+ return 'iso-8859-1'
+ if lcharset == 'x-windows-949':
+ return 'ms949'
+ if lcharset == 'pt_pt' or lcharset == 'de_latin' or lcharset == 'de':
+ # This is a locale, and not a charset, but most likely it's this one
+ return 'iso-8859-1'
+ if lcharset == 'iso-8858-15':
+ # How is this a *common* mistake?
+ return 'iso-8859-15'
+ if lcharset == 'macintosh':
+ return 'mac_roman'
+ if lcharset == 'cn-big5':
+ return 'big5'
+ if lcharset == 'x-unicode-2-0-utf-7':
+ return 'utf-7'
+ if lcharset == 'tscii':
+ # No support for this charset :S Map it down to ascii
+ # and throw away all the rest. Sucks, but we have to.
+ return 'us-ascii'
+ return charset
+
+ def get_payload_as_unicode(self, msg):
+ try:
+ b = msg.get_payload(decode=True)
+ except AssertionError:
+ # Badly encoded data can throw an exception here, where the python
+ # libraries fail to handle it and enter a cannot-happen path.
+ # In which case we just ignore it and hope for a better MIME part later.
+ b = None
+
+ if b:
+ # Find out if there is a charset
+ charset = None
+ params = msg.get_params()
+ if not params:
+ # No content-type, so we assume us-ascii
+ return str(b, 'us-ascii', errors='ignore')
+ for k,v in params:
+ if k.lower() == 'charset':
+ charset = v
+ break
+ if charset:
+ try:
+ return str(b, self.clean_charset(charset), errors='ignore')
+ except LookupError as e:
+ raise IgnorableException("Failed to get unicode payload: %s" % e)
+ else:
+ # XXX: reasonable default?
+ return str(b, errors='ignore')
+ # Return None or empty string, depending on what we got back
+ return b
+
+ # Regular expression matching the PostgreSQL custom mail footer that
+ # is appended to all emails.
+ _re_footer = re.compile(r'(.*)--\s+\nSent via [^\s]+ mailing list \([^\)]+\)\nTo make changes to your subscription:\nhttp://www\.postgresql\.org/mailpref/[^\s]+\s*$', re.DOTALL)
+ def get_body(self):
+ b = self._get_body()
+ if b:
+ # Python bug 9133 allows unicode surrogate pairs, which PostgreSQL
+ # will later reject.
+ if b.find('\udbff\n\udef8') != -1:
+ b = b.replace('\udbff\n\udef8', '')
+
+ # Remove postgres specific mail footer - if it's there
+ m = self._re_footer.match(b)
+ if m:
+ b = m.group(1)
+
+ # Sometimes we end up with a trailing \0 when decoding long strings, so
+ # replace it if it's there.
+ # In fact, replace it everywhere, since it can also turn up in the middle
+ # of a text when it's a really broken decoding.
+ b = b.replace('\0', '')
+
+ return b
+
+ def _get_body(self):
+ # This is where the magic happens - try to figure out what the body
+ # of this message should render as.
+ hasempty = False
+
+ # First see if this is a single-part message that we can just
+ # decode and go.
+ b = self.get_payload_as_unicode(self.msg)
+ if b: return b
+ if b == '':
+ # We found something, but it was empty. We'll keep looking as
+ # there might be something better available, but make a note
+ # that empty exists.
+ hasempty = True
+
+ # Ok, it's multipart. Find the first part that is text/plain,
+ # and use that one. Do this recursively, since we may have something
+ # like:
+ # multipart/mixed:
+ # multipart/alternative:
+ # text/plain
+ # text/html
+ # application/octet-stream (attachment)
+ b = self.recursive_first_plaintext(self.msg)
+ if b: return b
+ if b == '':
+ hasempty = True
+
+ # Couldn't find a plaintext. Look for the first HTML in that case.
+ # Fallback, but what can we do at this point...
+ b = self.recursive_first_plaintext(self.msg, True)
+ if b:
+ b = self.html_clean(b)
+ if b: return b
+ if b == '' or b is None:
+ hasempty = True
+
+ if hasempty:
+ log.status('Found empty body in %s' % self.msgid)
+ return ''
+ raise IgnorableException("Don't know how to read the body from %s" % self.msgid)
+
+ def recursive_first_plaintext(self, container, html_instead=False):
+ pl = container.get_payload()
+ if isinstance(pl, str):
+ # This was not a multipart, but it leaked... Give up!
+ return None
+ for p in pl:
+ if p.get_params() is None:
+ # MIME multipart/mixed, but no MIME type on the part
+ log.status("Found multipart/mixed in message '%s', but no MIME type on part. Trying text/plain." % self.msgid)
+ return self.get_payload_as_unicode(p)
+ if p.get_params()[0][0].lower() == 'text/plain':
+ # Don't include it if it looks like an attachment
+ if 'Content-Disposition' in p and p['Content-Disposition'].startswith('attachment'):
+ continue
+ return self.get_payload_as_unicode(p)
+ if html_instead and p.get_params()[0][0].lower() == 'text/html':
+ # Don't include it if it looks like an attachment
+ if 'Content-Disposition' in p and p['Content-Disposition'].startswith('attachment'):
+ continue
+ return self.get_payload_as_unicode(p)
+ if p.is_multipart():
+ b = self.recursive_first_plaintext(p, html_instead)
+ if b or b == '': return b
+
+ # Yikes, nothing here! Hopefully we'll find something when
+ # we continue looping at a higher level.
+ return None
+
+ def get_attachments(self):
+ self.attachments_found_first_plaintext = False
+ self.recursive_get_attachments(self.msg)
+
+ # Clean a filename's encoding and return it as a unicode string
+ def _clean_filename_encoding(self, filename):
+ # If this is a header-encoded filename, start by decoding that
+ if filename.startswith('=?'):
+ decoded, encoding = decode_header(filename)[0]
+ return str(decoded, encoding, errors='ignore')
+
+ # If it's already unicode, just return it
+ if isinstance(filename, str):
+ return filename
+
+ # Anything that's not UTF8, we just get rid of. We can live with
+ # filenames slightly mangled in this case.
+ return str(filename, 'utf-8', errors='ignore')
+
+ def _extract_filename(self, container):
+ # Try to get the filename for an attachment in the container.
+ # If the standard library can figure one out, use that one.
+ f = container.get_filename()
+ if f: return self._clean_filename_encoding(f)
+
+ # Failing that, some mailers set Content-Description to the
+ # filename
+ if 'Content-Description' in container:
+ return self._clean_filename_encoding(container['Content-Description'])
+ return None
+
+ def recursive_get_attachments(self, container):
+ # We start recursion in the "multipart" container if any
+ if container.get_content_type() == 'multipart/mixed' or container.get_content_type() == 'multipart/signed':
+ # Multipart - worth scanning into
+ if not container.is_multipart():
+ # Wow, this is broken. It's multipart/mixed, but doesn't
+ # contain multiple parts.
+ # Since we're just looking for attachments, let's just
+ # ignore it...
+ return
+ for p in container.get_payload():
+ if p.get_params() is None:
+ continue
+ self.recursive_get_attachments(p)
+ elif container.get_content_type() == 'multipart/alternative':
+ # Alternative is not an attachment (we decide)
+ # It's typically plaintext + html
+ self.attachments_found_first_plaintext = True
+ return
+ elif container.is_multipart():
+ # Other kinds of multipart - nothing to collect from those (multipart/signed is handled above)
+ return
+ else:
+ # Not a multipart.
+ # Exclude specific contenttypes
+ if container.get_content_type() == 'application/pgp-signature':
+ return
+ if container.get_content_type() in ('application/pkcs7-signature', 'application/x-pkcs7-signature'):
+ return
+ # For now, accept anything not text/plain
+ if container.get_content_type() != 'text/plain':
+ try:
+ self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True)))
+ except AssertionError:
+ # Badly encoded data can throw an exception here, where the python
+ # libraries fail to handle it and enter a cannot-happen path.
+ # In which case we just ignore this attachment.
+ return
+ return
+
+ # It's a text/plain, it might be worthwhile.
+ # If it has a name, we consider it an attachment
+ if not container.get_params():
+ return
+ for k,v in container.get_params():
+ if k == 'name' and v != '':
+ # Yes, it has a name
+ try:
+ self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True)))
+ except AssertionError:
+ # Badly encoded data can throw an exception here, where the python
+ # libraries fail to handle it and enter a cannot-happen path.
+ # In which case we just ignore this attachment.
+ return
+
+ return
+
+ # If it's content-disposition=attachment, we also want to save it
+ if 'Content-Disposition' in container and container['Content-Disposition'].startswith('attachment'):
+ try:
+ self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True)))
+ except AssertionError:
+ # Badly encoded data can throw an exception here, where the python
+ # libraries fail to handle it and enter a cannot-happen path.
+ # In which case we just ignore this attachment.
+ return
+
+ return
+
+ # If we have already found one text/plain part, make all
+ # further text/plain parts attachments
+ if self.attachments_found_first_plaintext:
+ # However, this will also *always* catch the MIME part added
+ # by majordomo with the footer. So if that one is present,
+ # we need to explicitly exclude it again.
+ try:
+ b = container.get_payload(decode=True)
+ except AssertionError:
+ # Badly encoded data can throw an exception here, where the python
+ # libraries fail to handle it and enter a cannot-happen path.
+ # In which case we just ignore this attachment.
+ return
+
+ if isinstance(b, str) and not self._re_footer.match(b):
+ # We know there is no name for this one
+ self.attachments.append((None, container.get_content_type(), b))
+ return
+
+ # Ok, so this was a plaintext that we ignored. Set the flag
+ # that we have now ignored one, so we'll make the next one
+ # an attachment.
+ self.attachments_found_first_plaintext = True
+ # No name, and text/plain, so ignore it
+
+ re_msgid = re.compile(r'^\s*<(.*)>\s*')
+ def clean_messageid(self, messageid, ignorebroken=False):
+ m = self.re_msgid.match(messageid)
+ if not m:
+ if ignorebroken:
+ log.status("Could not parse messageid '%s', ignoring it" % messageid)
+ return None
+ raise IgnorableException("Could not parse message id '%s'" % messageid)
+ return m.group(1).replace(' ', '')
+
+# _date_multi_re = re.compile(' \((\w+\s\w+(\s+\w+)*|)\)$')
+ # Now using [^\s] instead of \w, to work with japanese chars
+ _date_multi_re = re.compile(r' \(([^\s]+\s[^\s]+(\s+[^\s]+)*|)\)$')
+ _date_multi_re2 = re.compile(r' ([\+-]\d{4}) \([^)]+\)$')
+ _date_multiminus_re = re.compile(r' -(-\d+)$')
+ _date_offsetnoplus_re = re.compile(r' (\d{4})$')
+ def forgiving_date_decode(self, d):
+ if d.strip() == '':
+ raise IgnorableException("Failed to parse empty date")
+ # Strange timezones requiring manual adjustments
+ if d.endswith('-7700 (EST)'):
+ d = d.replace('-7700 (EST)', 'EST')
+ if d.endswith('+6700 (EST)'):
+ d = d.replace('+6700 (EST)', 'EST')
+ if d.endswith('+-4-30'):
+ d = d.replace('+-4-30', '+0430')
+ if d.endswith('+1.00'):
+ d = d.replace('+1.00', '+0100')
+ if d.endswith('+-100'):
+ d = d.replace('+-100', '+0100')
+ if d.endswith('+500'):
+ d = d.replace('+500', '+0500')
+ if d.endswith('-500'):
+ d = d.replace('-500', '-0500')
+ if d.endswith('-700'):
+ d = d.replace('-700', '-0700')
+ if d.endswith('-800'):
+ d = d.replace('-800', '-0800')
+ if d.endswith('+05-30'):
+ d = d.replace('+05-30', '+0530')
+ if d.endswith('+0-900'):
+ d = d.replace('+0-900', '-0900')
+ if d.endswith('Mexico/General'):
+ d = d.replace('Mexico/General','CDT')
+ if d.endswith('Pacific Daylight Time'):
+ d = d.replace('Pacific Daylight Time', 'PDT')
+ if d.endswith(' ZE2'):
+ d = d.replace(' ZE2',' +0200')
+ if d.find('-Juin-') > 0:
+ d = d.replace('-Juin-','-Jun-')
+ if d.find('-Juil-') > 0:
+ d = d.replace('-Juil-','-Jul-')
+ if d.find(' 0 (GMT)') >= 0:
+ d = d.replace(' 0 (GMT)',' +0000')
+
+ if self._date_multiminus_re.search(d):
+ d = self._date_multiminus_re.sub(' \\1', d)
+
+ if self._date_offsetnoplus_re.search(d):
+ d = self._date_offsetnoplus_re.sub('+\\1', d)
+
+
+ # We have a number of dates in the format
+ # "<full datespace> +0200 (MET DST)"
+ # or similar. The problem comes from the space within the
+ # parentheses, or from the contents of the parentheses being
+ # completely empty.
+ if self._date_multi_re.search(d):
+ d = self._date_multi_re.sub('', d)
+
+ # If the spec is instead
+ # "<full datespace> +0200 (...)"
+ # of any kind, we can just remove what's in the (), because the
+ # parser is just going to rely on the fixed offset anyway.
+ if self._date_multi_re2.search(d):
+ d = self._date_multi_re2.sub(' \\1', d)
+
+ try:
+ dp = dateutil.parser.parse(d, fuzzy=True)
+
+ # Some offsets are >16 hours, which postgresql will not
+ # (for good reasons) accept
+ if dp.utcoffset() and abs(dp.utcoffset().days * (24*60*60) + dp.utcoffset().seconds) > 60*60*16-1:
+ # Convert it to a UTC timestamp using Python. It will give
+ # us the right time, but the wrong timezone. Should be
+ # enough...
+ dp = datetime.datetime(*dp.utctimetuple()[:6])
+ return dp
+ except Exception as e:
+ raise IgnorableException("Failed to parse date '%s': %s" % (d, e))
+
+ def _maybe_decode(self, s, charset):
+ if isinstance(s, str):
+ return s.strip(' ')
+ return str(s, charset and self.clean_charset(charset) or 'us-ascii', errors='ignore').strip(' ')
+
+ # Workaround for broken quoting in some MUAs (see below)
+ _re_mailworkaround = re.compile(r'"(=\?[^\?]+\?[QB]\?[^\?]+\?=)"', re.IGNORECASE)
+ def _decode_mime_header(self, hdr, email_workaround):
+ if hdr is None:
+ return None
+
+ # Per http://bugs.python.org/issue504152 (and lots of testing), it seems
+ # we must get rid of the sequence \n\t at least in the header. If we
+ # do this *before* doing any MIME decoding, we should be safe against
+ # anybody *actually* putting that sequence in the header (since we
+ # won't match the encoded contents)
+ hdr = hdr.replace("\n\t"," ")
+
+ # In at least some cases, at least gmail (and possibly other MUAs)
+ # incorrectly put double quotes in the name/email field even when
+ # it's encoded. That's not allowed - they have to be escaped - but
+ # since there's a fair amount of those, we apply a regex to get
+ # rid of them.
+ m = self._re_mailworkaround.search(hdr)
+ if m:
+ hdr = self._re_mailworkaround.sub(r'\1', hdr)
+
+ try:
+ return " ".join([self._maybe_decode(s, charset) for s, charset in decode_header(hdr)])
+ except HeaderParseError as e:
+ # Parser error is typically someone specifying an encoding,
+ # but then not actually using that encoding. We'll do the best
+ # we can, which is cut it down to ascii and ignore errors
+ return hdr.encode('us-ascii', errors='ignore').decode('us-ascii').strip(' ')
+
+ def decode_mime_header(self, hdr, email_workaround=False):
+ try:
+ if isinstance(hdr, Header):
+ hdr = hdr.encode()
+
+ h = self._decode_mime_header(hdr, email_workaround)
+ if h:
+ return h.replace("\0", "")
+ return ''
+ except LookupError as e:
+ raise IgnorableException("Failed to decode header value '%s': %s" % (hdr, e))
+ except ValueError as ve:
+ raise IgnorableException("Failed to decode header value '%s': %s" % (hdr, ve))
+
+ def get_mandatory(self, fieldname):
+ try:
+ x = self.msg[fieldname]
+ if x is None:
+ raise Exception()
+ return x
+ except:
+ raise IgnorableException("Mandatory field '%s' is missing" % fieldname)
+
+ def get_optional(self, fieldname):
+ try:
+ return self.msg[fieldname]
+ except:
+ return ''
+
+ def html_clean(self, html):
+ # First we pass it through tidy
+ (html, errors) = tidylib.tidy_document(html,
+ options={
+ 'drop-proprietary-attributes': 1,
+ 'alt-text': '',
+ 'hide-comments': 1,
+ 'output-xhtml': 1,
+ 'show-body-only': 1,
+ 'clean': 1,
+ 'char-encoding': 'utf8',
+ 'show-warnings': 0,
+ 'show-info': 0,
+ })
+ if errors:
+ print(("HTML tidy failed for %s!" % self.msgid))
+ print(errors)
+ return None
+
+ try:
+ cleaner = HTMLCleaner()
+ cleaner.feed(html)
+ return cleaner.get_text()
+ except Exception as e:
+ # Failed to parse the html, thus failed to clean it, so we must
+ # give up...
+ return None
class HTMLCleaner(HTMLParser):
- def __init__(self):
- HTMLParser.__init__(self)
- self.io = io.StringIO()
+ def __init__(self):
+ HTMLParser.__init__(self)
+ self.io = io.StringIO()
- def get_text(self):
- return self.io.getvalue()
+ def get_text(self):
+ return self.io.getvalue()
- def handle_data(self, data):
- self.io.write(data)
+ def handle_data(self, data):
+ self.io.write(data)
- def handle_starttag(self, tag, attrs):
- if tag == "p" or tag == "br":
- self.io.write("\n")
+ def handle_starttag(self, tag, attrs):
+ if tag == "p" or tag == "br":
+ self.io.write("\n")