import re
import datetime
import io

import dateutil.parser
from email.parser import BytesParser
from email.header import decode_header, Header
from email.errors import HeaderParseError
from email.policy import compat32
from html.parser import HTMLParser
import tidylib

from lib.exception import IgnorableException
from lib.log import log


class ArchivesParser(object):
    """Parse one raw rfc822 message from the mailing list archives.

    parse() reads the raw bytes, analyze() then extracts msgid, sender,
    recipients, subject, date, body text, attachments and the list of
    parent message-ids (from In-Reply-To / References).
    """

    def __init__(self):
        self.parser = BytesParser(policy=compat32)

    def parse(self, stream):
        # Keep the raw bytes around as well as the parsed message
        self.rawtxt = stream.read()
        self.msg = self.parser.parse(io.BytesIO(self.rawtxt))

    def is_msgid(self, msgid):
        # Look for a specific messageid. This means we might parse it twice,
        # but so be it. Any exception means we know it's not this one...
        try:
            if self.clean_messageid(self.decode_mime_header(self.get_mandatory('Message-ID'))) == msgid:
                return True
        except Exception:
            pass
        return False

    def analyze(self, date_override=None):
        """Extract all interesting fields from the parsed message.

        If date_override is given, it is used instead of the message's own
        Date: header (for archives known to carry broken dates).
        Raises IgnorableException for messages broken beyond repair.
        """
        self.msgid = self.clean_messageid(self.decode_mime_header(self.get_mandatory('Message-ID')))
        self._from = self.decode_mime_header(self.get_mandatory('From'), True)
        self.to = self.decode_mime_header(self.get_optional('To'), True)
        self.cc = self.decode_mime_header(self.get_optional('CC'), True)
        self.subject = self.decode_mime_header(self.get_optional('Subject'))
        if date_override:
            self.date = self.forgiving_date_decode(date_override)
        else:
            self.date = self.forgiving_date_decode(self.decode_mime_header(self.get_mandatory('Date')))
        self.bodytxt = self.get_body()
        self.attachments = []
        self.get_attachments()
        if self.attachments:
            log.status("Found %s attachments" % len(self.attachments))

        # Build a list of the message id's we are interested in
        self.parents = []
        # The first one is in-reply-to, if it exists
        if self.get_optional('in-reply-to'):
            m = self.clean_messageid(self.decode_mime_header(self.get_optional('in-reply-to')), True)
            if m:
                self.parents.append(m)

        # Then we add all References values, in backwards order
        if self.get_optional('references'):
            cleaned_msgids = [self.clean_messageid(x, True)
                              for x in reversed(self.decode_mime_header(self.get_optional('references')).split())]
            # Can't do this with a simple self.parents.extend() due to broken
            # mailers that add the same reference more than once. And we can't
            # use a set() to make it unique, because order is very important
            for m in cleaned_msgids:
                if m and m not in self.parents:
                    self.parents.append(m)

    # Mapping of broken, aliased or otherwise unusual charset declarations
    # seen in the wild to names Python's codec machinery can look up.
    # Unknown/hopeless declarations map to us-ascii; combined with
    # errors='ignore' at decode time that just drops the broken bytes.
    _charset_aliases = {
        # Special cases where we don't know... Assume us-ascii with replacements
        'unknown-8bit': 'us-ascii',
        'x-unknown': 'us-ascii',
        'unknown': 'us-ascii',
        # Seriously broken charset definitions, map to us-ascii
        '0': 'us-ascii',
        'x-user-defined': 'us-ascii',
        '_autodetect_all': 'us-ascii',
        'default_charset': 'us-ascii',
        # Some MUAs set it to x-gbk, but there is a valid declaration as gbk
        'x-gbk': 'gbk',
        # -I is a special logical version, but should be the same charset
        'iso-8859-8-i': 'iso-8859-8',
        # Alias for iso-8859-11
        'windows-874': 'iso-8859-11',
        # Strange ways of saying 8859....
        'iso-88-59-1': 'iso-8859-1',
        'iso-8858-1': 'iso-8859-1',
        'iso885915': 'iso-8859-15',
        'iso-latin-2': 'iso-8859-2',
        # Strange spelling of cp850 (windows charset)
        'iso-850': 'cp850',
        'koi8r': 'koi8-r',
        'cp 1252': 'cp1252',
        # Why did these show up more than once?!
        'iso-8859-1,iso-8859-2': 'iso-8859-1',
        'iso-8859-1:utf8:us-ascii': 'iso-8859-1',
        'x-windows-949': 'ms949',
        # These are locales, not charsets, but most likely mean latin1
        'pt_pt': 'iso-8859-1',
        'de_latin': 'iso-8859-1',
        'de': 'iso-8859-1',
        # How is this a *common* mistake?
        'iso-8858-15': 'iso-8859-15',
        'macintosh': 'mac_roman',
        'cn-big5': 'big5',
        'x-unicode-2-0-utf-7': 'utf-7',
        # No codec for this charset :S Map it down to ascii and throw away
        # all the rest. Sucks, but we have to.
        'tscii': 'us-ascii',
    }

    def clean_charset(self, charset):
        # Normalize a declared MIME charset; unknown names pass through
        # unchanged and are resolved (or rejected) by the codec lookup.
        return self._charset_aliases.get(charset.lower(), charset)

    def get_payload_as_unicode(self, msg):
        """Decode the payload of one (non-multipart) MIME part to str.

        Returns None/b'' unchanged when there is no decodable payload.
        Raises IgnorableException when the declared charset cannot be
        resolved even after clean_charset().
        """
        try:
            b = msg.get_payload(decode=True)
        except AssertionError:
            # Badly encoded data can throw an exception here, where the python
            # libraries fail to handle it and enters a cannot-happen path.
            # In which case we just ignore it and hope for a better MIME part later.
            b = None

        if b:
            # Find out if there is a charset
            charset = None
            params = msg.get_params()
            if not params:
                # No content-type, so we assume us-ascii
                return str(b, 'us-ascii', errors='ignore')
            for k, v in params:
                if k.lower() == 'charset':
                    charset = v
                    break
            if charset:
                try:
                    return str(b, self.clean_charset(charset), errors='ignore')
                except LookupError as e:
                    raise IgnorableException("Failed to get unicode payload: %s" % e)
            else:
                # XXX: reasonable default?
                return str(b, errors='ignore')
        # Return None or empty string, depending on what we got back
        return b

    # Regular expression matching the PostgreSQL custom mail footer that
    # is appended to all emails.
    _re_footer = re.compile(r'(.*)--\s+\nSent via [^\s]+ mailing list \([^\)]+\)\nTo make changes to your subscription:\nhttp://www\.postgresql\.org/mailpref/[^\s]+\s*$', re.DOTALL)

    def get_body(self):
        """Return the cleaned-up body text of the message (may be '')."""
        b = self._get_body()
        if b:
            # Python bug 9133 lets unicode surrogate pairs through - which
            # PostgreSQL will later reject. NOTE: the old code tested
            # "if b.find(...)" which is truthy for "not found" (-1) too;
            # a containment test is the correct check.
            if '\udbff\n\udef8' in b:
                b = b.replace('\udbff\n\udef8', '')

            # Remove postgres specific mail footer - if it's there
            m = self._re_footer.match(b)
            if m:
                b = m.group(1)

            # Sometimes we end up with a trailing \0 when decoding long strings, so
            # replace it if it's there.
            # In fact, replace it everywhere, since it can also turn up in the middle
            # of a text when it's a really broken decoding.
            b = b.replace('\0', '')

        return b

    def _get_body(self):
        # This is where the magic happens - try to figure out what the body
        # of this message should render as.
        hasempty = False

        # First see if this is a single-part message that we can just
        # decode and go.
        b = self.get_payload_as_unicode(self.msg)
        if b:
            return b
        if b == '':
            # We found something, but it was empty. We'll keep looking as
            # there might be something better available, but make a note
            # that empty exists.
            hasempty = True

        # Ok, it's multipart. Find the first part that is text/plain,
        # and use that one. Do this recursively, since we may have something
        # like:
        # multipart/mixed:
        #   multipart/alternative:
        #      text/plain
        #      text/html
        #   application/octet-stream (attachment)
        b = self.recursive_first_plaintext(self.msg)
        if b:
            return b
        if b == '':
            hasempty = True

        # Couldn't find a plaintext. Look for the first HTML in that case.
        # Fallback, but what can we do at this point...
        b = self.recursive_first_plaintext(self.msg, True)
        if b:
            b = self.html_clean(b)
            if b:
                return b
        if b == '' or b is None:
            hasempty = True

        if hasempty:
            log.status('Found empty body in %s' % self.msgid)
            return ''
        raise IgnorableException("Don't know how to read the body from %s" % self.msgid)

    def recursive_first_plaintext(self, container, html_instead=False):
        """Depth-first search for the first text/plain part (or, with
        html_instead=True, also text/html) that is not an attachment.
        Returns its decoded text, or None when nothing suitable is found."""
        pl = container.get_payload()
        if isinstance(pl, str):
            # This was not a multipart, but it leaked... Give up!
            return None
        for p in pl:
            if p.get_params() is None:
                # MIME multipart/mixed, but no MIME type on the part
                log.status("Found multipart/mixed in message '%s', but no MIME type on part. Trying text/plain." % self.msgid)
                return self.get_payload_as_unicode(p)
            if p.get_params()[0][0].lower() == 'text/plain':
                # Don't include it if it looks like an attachment
                if 'Content-Disposition' in p and p['Content-Disposition'].startswith('attachment'):
                    continue
                return self.get_payload_as_unicode(p)
            if html_instead and p.get_params()[0][0].lower() == 'text/html':
                # Don't include it if it looks like an attachment
                if 'Content-Disposition' in p and p['Content-Disposition'].startswith('attachment'):
                    continue
                return self.get_payload_as_unicode(p)
            if p.is_multipart():
                b = self.recursive_first_plaintext(p, html_instead)
                if b or b == '':
                    return b

        # Yikes, nothing here! Hopefully we'll find something when
        # we continue looping at a higher level.
        return None

    def get_attachments(self):
        """Populate self.attachments with (filename, contenttype, payload)
        tuples found anywhere in the message."""
        self.attachments_found_first_plaintext = False
        self.recursive_get_attachments(self.msg)

    # Clean a filenames encoding and return it as a unicode string
    def _clean_filename_encoding(self, filename):
        # If this is a header-encoded filename, start by decoding that
        if filename.startswith('=?'):
            decoded, encoding = decode_header(filename)[0]
            return str(decoded, encoding, errors='ignore')

        # If it's already unicode, just return it
        if isinstance(filename, str):
            return filename

        # Anything that's not UTF8, we just get rid of. We can live with
        # filenames slightly mangled in this case.
        return str(filename, 'utf-8', errors='ignore')

    def _extract_filename(self, container):
        # Try to get the filename for an attachment in the container.
        # If the standard library can figure one out, use that one.
        f = container.get_filename()
        if f:
            return self._clean_filename_encoding(f)

        # Failing that, some mailers set Content-Description to the
        # filename
        if 'Content-Description' in container:
            return self._clean_filename_encoding(container['Content-Description'])
        return None

    def _append_attachment(self, container):
        # Record this container as an attachment. Badly encoded data can
        # make the email library throw an AssertionError in a cannot-happen
        # path; in that case we just ignore the attachment.
        try:
            self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True)))
        except AssertionError:
            pass

    def recursive_get_attachments(self, container):
        # We start recursion in the "multipart" container if any
        if container.get_content_type() in ('multipart/mixed', 'multipart/signed'):
            # Multipart - worth scanning into
            if not container.is_multipart():
                # Wow, this is broken. It's multipart/mixed, but doesn't
                # contain multiple parts.
                # Since we're just looking for attachments, let's just
                # ignore it...
                return
            for p in container.get_payload():
                if p.get_params() is None:
                    continue
                self.recursive_get_attachments(p)
        elif container.get_content_type() == 'multipart/alternative':
            # Alternative is not an attachment (we decide)
            # It's typically plaintext + html
            self.attachments_found_first_plaintext = True
            return
        elif container.is_multipart():
            # Other kinds of multipart, such as multipart/signed...
            return
        else:
            # Not a multipart.
            # Exclude specific contenttypes
            if container.get_content_type() == 'application/pgp-signature':
                return
            if container.get_content_type() in ('application/pkcs7-signature', 'application/x-pkcs7-signature'):
                return
            # For now, accept anything not text/plain
            if container.get_content_type() != 'text/plain':
                self._append_attachment(container)
                return

            # It's a text/plain, it might be worthwhile.
            # If it has a name, we consider it an attachment
            if not container.get_params():
                return
            for k, v in container.get_params():
                if k == 'name' and v != '':
                    # Yes, it has a name
                    self._append_attachment(container)
                    return

            # If it's content-disposition=attachment, we also want to save it
            if 'Content-Disposition' in container and container['Content-Disposition'].startswith('attachment'):
                self._append_attachment(container)
                return

            # If we have already found one text/plain part, make all
            # further text/plain parts attachments
            if self.attachments_found_first_plaintext:
                # However, this will also *always* catch the MIME part added
                # by majordomo with the footer. So if that one is present,
                # we need to explicitly exclude it again.
                try:
                    b = container.get_payload(decode=True)
                except AssertionError:
                    # Badly encoded data can throw an exception here, where the python
                    # libraries fail to handle it and enters a cannot-happen path.
                    # In which case we just ignore this attachment.
                    return
                if isinstance(b, str) and not self._re_footer.match(b):
                    # We know there is no name for this one
                    self.attachments.append((None, container.get_content_type(), b))
                return

            # Ok, so this was a plaintext that we ignored. Set the flag
            # that we have now ignored one, so we'll make the next one
            # an attachment.
            self.attachments_found_first_plaintext = True
            # No name, and text/plain, so ignore it

    re_msgid = re.compile(r'^\s*<(.*)>\s*')

    def clean_messageid(self, messageid, ignorebroken=False):
        """Strip angle brackets and embedded spaces from a Message-ID.

        With ignorebroken=True, unparseable ids are logged and None is
        returned; otherwise IgnorableException is raised."""
        m = self.re_msgid.match(messageid)
        if not m:
            if ignorebroken:
                log.status("Could not parse messageid '%s', ignoring it" % messageid)
                return None
            raise IgnorableException("Could not parse message id '%s'" % messageid)
        # NOTE: the old code used m.groups(1)[0], which only worked because
        # the group always matches; m.group(1) is the intended call.
        return m.group(1).replace(' ', '')

    # _date_multi_re = re.compile(' \((\w+\s\w+(\s+\w+)*|)\)$')
    # Now using [^\s] instead of \w, to work with japanese chars
    _date_multi_re = re.compile(r' \(([^\s]+\s[^\s]+(\s+[^\s]+)*|)\)$')
    _date_multi_re2 = re.compile(r' ([\+-]\d{4}) \([^)]+\)$')
    _date_multiminus_re = re.compile(r' -(-\d+)$')
    _date_offsetnoplus_re = re.compile(r' (\d{4})$')

    # Strange timezone suffixes requiring manual adjustments, applied in
    # order as (broken suffix, replacement).
    _date_suffix_fixes = (
        ('-7700 (EST)', 'EST'),
        ('+6700 (EST)', 'EST'),
        ('+-4-30', '+0430'),
        ('+1.00', '+0100'),
        ('+-100', '+0100'),
        ('+500', '+0500'),
        ('-500', '-0500'),
        ('-700', '-0700'),
        ('-800', '-0800'),
        ('+05-30', '+0530'),
        ('+0-900', '-0900'),
        ('Mexico/General', 'CDT'),
        ('Pacific Daylight Time', 'PDT'),
        (' ZE2', ' +0200'),
    )

    def forgiving_date_decode(self, d):
        """Parse a Date: header as leniently as possible.

        Applies a pile of workarounds for broken timezones and localized
        month names before handing off to dateutil. Raises
        IgnorableException when the date cannot be parsed at all."""
        if d.strip() == '':
            raise IgnorableException("Failed to parse empty date")

        for broken, replacement in self._date_suffix_fixes:
            if d.endswith(broken):
                d = d.replace(broken, replacement)

        # French month abbreviations
        if d.find('-Juin-') > 0:
            d = d.replace('-Juin-', '-Jun-')
        if d.find('-Juil-') > 0:
            d = d.replace('-Juil-', '-Jul-')
        # NOTE: the old code tested "if d.find(' 0 (GMT)')", which is truthy
        # for "not found" (-1, harmless) but false - and thus skipped - when
        # the token was at position 0. Containment is the correct test.
        if ' 0 (GMT)' in d:
            d = d.replace(' 0 (GMT)', ' +0000')

        if self._date_multiminus_re.search(d):
            d = self._date_multiminus_re.sub(' \\1', d)

        if self._date_offsetnoplus_re.search(d):
            d = self._date_offsetnoplus_re.sub('+\\1', d)

        # We have a number of dates in the format
        # "<full datespace> +0200 (MET DST)"
        # or similar. The problem coming from the space within the
        # parenthesis, or if the contents of the parenthesis is
        # completely empty
        if self._date_multi_re.search(d):
            d = self._date_multi_re.sub('', d)

        # If the spec is instead
        # "<full datespace> +0200 (...)"
        # of any kind, we can just remove what's in the (), because the
        # parser is just going to rely on the fixed offset anyway.
        if self._date_multi_re2.search(d):
            d = self._date_multi_re2.sub(' \\1', d)

        try:
            dp = dateutil.parser.parse(d, fuzzy=True)

            # Some offsets are >16 hours, which postgresql will not
            # (for good reasons) accept
            if dp.utcoffset() and abs(dp.utcoffset().days * (24 * 60 * 60) + dp.utcoffset().seconds) > 60 * 60 * 16 - 1:
                # Convert it to a UTC timestamp using Python. It will give
                # us the right time, but the wrong timezone. Should be
                # enough...
                dp = datetime.datetime(*dp.utctimetuple()[:6])
            return dp
        except Exception as e:
            raise IgnorableException("Failed to parse date '%s': %s" % (d, e))

    def _maybe_decode(self, s, charset):
        # decode_header() hands back either str or bytes per chunk; turn
        # everything into a stripped str.
        if isinstance(s, str):
            return s.strip(' ')
        return str(s, charset and self.clean_charset(charset) or 'us-ascii', errors='ignore').strip(' ')

    # Workaround for broken quoting in some MUAs (see below)
    _re_mailworkaround = re.compile(r'"(=\?[^\?]+\?[QB]\?[^\?]+\?=)"', re.IGNORECASE)

    def _decode_mime_header(self, hdr, email_workaround):
        # NOTE: email_workaround is currently unused; the quoting workaround
        # below is applied unconditionally. Kept for interface compatibility.
        if hdr is None:
            return None

        # Per http://bugs.python.org/issue504152 (and lots of testing), it seems
        # we must get rid of the sequence \n\t at least in the header. If we
        # do this *before* doing any MIME decoding, we should be safe against
        # anybody *actually* putting that sequence in the header (since we
        # won't match the encoded contents)
        hdr = hdr.replace("\n\t", " ")

        # In at least some cases, at least gmail (and possibly other MUAs)
        # incorrectly put double quotes in the name/email field even when
        # it's encoded. That's not allowed - they have to be escaped - but
        # since there's a fair amount of those, we apply a regex to get
        # rid of them.
        m = self._re_mailworkaround.search(hdr)
        if m:
            hdr = self._re_mailworkaround.sub(r'\1', hdr)

        try:
            return " ".join([self._maybe_decode(s, charset) for s, charset in decode_header(hdr)])
        except HeaderParseError:
            # Parser error is typically someone specifying an encoding,
            # but then not actually using that encoding. We'll do the best
            # we can, which is cut it down to ascii and ignore errors.
            # (The old code called str(hdr, 'us-ascii', ...) on a str,
            # which raises TypeError; round-trip through bytes instead.)
            return hdr.encode('us-ascii', errors='ignore').decode('us-ascii').strip(' ')

    def decode_mime_header(self, hdr, email_workaround=False):
        """Decode a (possibly MIME-encoded) header value into a clean str.

        Returns '' for missing/empty headers. Raises IgnorableException
        when the declared encoding is broken beyond use."""
        try:
            if isinstance(hdr, Header):
                hdr = hdr.encode()

            h = self._decode_mime_header(hdr, email_workaround)
            if h:
                return h.replace("\0", "")
            return ''
        except (LookupError, ValueError) as e:
            raise IgnorableException("Failed to decode header value '%s': %s" % (hdr, e))

    def get_mandatory(self, fieldname):
        # Fetch a header that must be present; a missing one makes the
        # whole message ignorable.
        try:
            x = self.msg[fieldname]
        except Exception:
            x = None
        if x is None:
            raise IgnorableException("Mandatory field '%s' is missing" % fieldname)
        return x

    def get_optional(self, fieldname):
        # Fetch a header if present. NOTE: the email library returns None
        # for a missing header; '' is only returned when the lookup itself
        # raises. Callers treat both as "not there".
        try:
            return self.msg[fieldname]
        except Exception:
            return ''

    def html_clean(self, html):
        """Reduce an HTML body to plain text, or None if it can't be done."""
        # First we pass it through tidy
        (html, errors) = tidylib.tidy_document(html,
                                               options={
                                                   'drop-proprietary-attributes': 1,
                                                   'alt-text': '',
                                                   'hide-comments': 1,
                                                   'output-xhtml': 1,
                                                   'show-body-only': 1,
                                                   'clean': 1,
                                                   'char-encoding': 'utf8',
                                                   'show-warnings': 0,
                                                   'show-info': 0,
                                               })
        if errors:
            print("HTML tidy failed for %s!" % self.msgid)
            print(errors)
            return None

        try:
            cleaner = HTMLCleaner()
            cleaner.feed(html)
            return cleaner.get_text()
        except Exception:
            # Failed to parse the html, thus failed to clean it. so we must
            # give up...
            return None


class HTMLCleaner(HTMLParser):
    """Strip an HTML document down to its text content, turning <p> and
    <br> tags into newlines."""

    def __init__(self):
        super().__init__()
        self.io = io.StringIO()

    def get_text(self):
        return self.io.getvalue()

    def handle_data(self, data):
        self.io.write(data)

    def handle_starttag(self, tag, attrs):
        if tag in ("p", "br"):
            self.io.write("\n")