import re
import datetime
import dateutil.parser

from email.parser import Parser
from email.header import decode_header
from email.errors import HeaderParseError
from HTMLParser import HTMLParser, HTMLParseError
import tidy
import StringIO

from lib.exception import IgnorableException
from lib.log import log


class ArchivesParser(object):
    def __init__(self):
        self.parser = Parser()

    def parse(self, stream):
        self.msg = self.parser.parse(stream)

    def analyze(self):
        self.msgid = self.clean_messageid(self.decode_mime_header(self.get_mandatory('Message-ID')))
        self._from = self.decode_mime_header(self.get_mandatory('From'))
        self.to = self.decode_mime_header(self.get_optional('To'))
        self.cc = self.decode_mime_header(self.get_optional('CC'))
        self.subject = self.decode_mime_header(self.get_optional('Subject'))
        self.date = self.forgiving_date_decode(self.decode_mime_header(self.get_mandatory('Date')))
        self.bodytxt = self.get_body()
        self.attachments = []
        self.get_attachments()
        if len(self.attachments) > 0:
            log.status("Found %s attachments" % len(self.attachments))

        # Build a list of the message IDs we are interested in
        self.parents = []
        # The first one is In-Reply-To, if it exists
        if self.get_optional('in-reply-to'):
            m = self.clean_messageid(self.get_optional('in-reply-to'), True)
            if m:
                self.parents.append(m)
        # Then we add all References values, in backwards order
        if self.get_optional('references'):
            cleaned_msgids = [self.clean_messageid(x, True) for x in reversed(self.get_optional('references').split())]
            # Can't do this with a simple self.parents.extend() due to broken
            # mailers that add the same reference more than once. And we can't
            # use a set() to make it unique, because order is very important.
            for m in cleaned_msgids:
                if m and m not in self.parents:
                    self.parents.append(m)

    def clean_charset(self, charset):
        lcharset = charset.lower()
        if lcharset == 'unknown-8bit' or lcharset == 'x-unknown' or lcharset == 'unknown':
            # Special case where we don't know... We'll assume
            # us-ascii and use replacements
            return 'us-ascii'
        if lcharset == '0' or lcharset == 'x-user-defined' or lcharset == '_autodetect_all' or lcharset == 'default_charset':
            # Seriously broken charset definitions, map to us-ascii
            # and throw away the rest with replacements
            return 'us-ascii'
        if lcharset == 'x-gbk':
            # Some MUAs set it to x-gbk, but there is a valid
            # declaration as gbk...
            return 'gbk'
        if lcharset == 'iso-8859-8-i':
            # -I is a special logical version, but should be the
            # same charset
            return 'iso-8859-8'
        if lcharset == 'windows-874':
            # This is an alias for iso-8859-11
            return 'iso-8859-11'
        if lcharset == 'iso-88-59-1' or lcharset == 'iso-8858-1':
            # Strange ways of saying 8859...
            return 'iso-8859-1'
        if lcharset == 'iso885915':
            return 'iso-8859-15'
        if lcharset == 'iso-latin-2':
            return 'iso-8859-2'
        if lcharset == 'iso-850':
            # Strange spelling of cp850 (windows charset)
            return 'cp850'
        if lcharset == 'koi8r':
            return 'koi8-r'
        if lcharset == 'cp 1252':
            return 'cp1252'
        if lcharset == 'iso-8859-1,iso-8859-2':
            # Why did this show up more than once?!
            return 'iso-8859-1'
        if lcharset == 'x-windows-949':
            return 'ms949'
        if lcharset == 'pt_pt' or lcharset == 'de_latin' or lcharset == 'de':
            # This is a locale, and not a charset, but most likely it's this one
            return 'iso-8859-1'
        if lcharset == 'iso-8858-15':
            # How is this a *common* mistake?
            return 'iso-8859-15'
        if lcharset == 'macintosh':
            return 'mac_roman'
        if lcharset == 'cn-big5':
            return 'big5'
        if lcharset == 'x-unicode-2-0-utf-7':
            # Note: must compare against the lowercased form, since lcharset
            # was lowercased above (the original mixed-case spelling
            # 'x-unicode-2-0-UTF-7' could never match)
            return 'utf-7'
        if lcharset == 'tscii':
            # No support for this charset :S Map it down to ascii
            # and throw away all the rest. Sucks, but we have to.
            return 'us-ascii'
        return charset
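    # Illustration only (not from the original module): the method above is a
    # lookup ladder, so e.g. clean_charset('X-GBK') returns 'gbk' and
    # clean_charset('pt_PT') returns 'iso-8859-1', while anything it does not
    # recognize falls through unchanged, e.g. clean_charset('utf-8') == 'utf-8'.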
    def get_payload_as_unicode(self, msg):
        b = msg.get_payload(decode=True)
        if b:
            # Find out if there is a charset
            charset = None
            params = msg.get_params()
            if not params:
                # No content-type, so we assume us-ascii
                return unicode(b, 'us-ascii', errors='ignore')
            for k, v in params:
                if k.lower() == 'charset':
                    charset = v
                    break
            if charset:
                try:
                    return unicode(b, self.clean_charset(charset), errors='ignore')
                except LookupError, e:
                    raise IgnorableException("Failed to get unicode payload: %s" % e)
            else:
                # XXX: reasonable default?
                return unicode(b, errors='ignore')
        # Return None or empty string, depending on what we got back
        return b

    def get_body(self):
        b = self._get_body()
        if b:
            # Python bug 9133 allows unicode surrogate pairs - which
            # PostgreSQL will later reject...
            # (note: find() returns -1 when not found, which is truthy, and 0
            # for a match at the start, which is falsy - so compare explicitly)
            if b.find(u'\udbff\n\udef8') != -1:
                b = b.replace(u'\udbff\n\udef8', '')
        return b

    def _get_body(self):
        # This is where the magic happens - try to figure out what the body
        # of this message should render as.
        hasempty = False

        # First see if this is a single-part message that we can just
        # decode and go.
        b = self.get_payload_as_unicode(self.msg)
        if b:
            return b
        if b == '':
            # We found something, but it was empty. We'll keep looking as
            # there might be something better available, but make a note
            # that empty exists.
            hasempty = True

        # Ok, it's multipart. Find the first part that is text/plain,
        # and use that one. Do this recursively, since we may have something
        # like:
        # multipart/mixed:
        #   multipart/alternative:
        #     text/plain
        #     text/html
        #   application/octet-stream (attachment)
        b = self.recursive_first_plaintext(self.msg)
        if b:
            return b
        if b == '':
            hasempty = True

        # Couldn't find a plaintext. Look for the first HTML in that case.
        # Fallback, but what can we do at this point...
        b = self.recursive_first_plaintext(self.msg, True)
        if b:
            b = self.html_clean(b)
            if b:
                return b
        if b == '':
            hasempty = True

        if hasempty:
            log.status('Found empty body in %s' % self.msgid)
            return ''
        raise IgnorableException("Don't know how to read the body from %s" % self.msgid)

    def recursive_first_plaintext(self, container, html_instead=False):
        pl = container.get_payload()
        if isinstance(pl, str):
            # This was not a multipart, but it leaked... Give up!
            return None
        for p in pl:
            if p.get_params() is None:
                # MIME multipart/mixed, but no MIME type on the part
                log.status("Found multipart/mixed in message '%s', but no MIME type on part. Trying text/plain." % self.msgid)
                return self.get_payload_as_unicode(p)
            if p.get_params()[0][0].lower() == 'text/plain':
                # Don't include it if it looks like an attachment
                if p.has_key('Content-Disposition') and p['Content-Disposition'].startswith('attachment'):
                    continue
                return self.get_payload_as_unicode(p)
            if html_instead and p.get_params()[0][0].lower() == 'text/html':
                # Don't include it if it looks like an attachment
                if p.has_key('Content-Disposition') and p['Content-Disposition'].startswith('attachment'):
                    continue
                return self.get_payload_as_unicode(p)
            if p.is_multipart():
                b = self.recursive_first_plaintext(p, html_instead)
                if b or b == '':
                    return b
        # Yikes, nothing here! Hopefully we'll find something when
        # we continue looping at a higher level.
        return None
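    # Worked example (illustration, not from the original module): for a
    # message laid out as
    #   multipart/mixed
    #     multipart/alternative
    #       text/plain   <- _get_body() ends up returning this part
    #       text/html
    #     application/pdf (attachment)
    # recursive_first_plaintext() recurses into multipart/alternative and
    # returns the text/plain part. The text/html sibling is only used (via
    # html_clean) when no plaintext part exists anywhere in the tree.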
    def get_attachments(self):
        self.recursive_get_attachments(self.msg)

    def _clean_filename_encoding(self, filename):
        # Clean a filename's encoding and return it as a unicode string.
        # If it's already unicode, just return it
        if isinstance(filename, unicode):
            return filename
        # Anything that's not UTF8, we just get rid of. We can live with
        # filenames slightly mangled in this case.
        return unicode(filename, 'utf-8', errors='ignore')

    def _extract_filename(self, container):
        # Try to get the filename for an attachment in the container.
        # If the standard library can figure one out, use that one.
        f = container.get_filename()
        if f:
            return self._clean_filename_encoding(f)
        # Failing that, some mailers set Content-Description to the
        # filename
        if container.has_key('Content-Description'):
            return self._clean_filename_encoding(container['Content-Description'])
        return None

    def recursive_get_attachments(self, container):
        if container.get_content_type() == 'multipart/mixed':
            # Multipart - worth scanning into
            if not container.is_multipart():
                # Wow, this is broken. It's multipart/mixed, but doesn't
                # contain multiple parts.
                # Since we're just looking for attachments, let's just
                # ignore it...
                return
            for p in container.get_payload():
                if p.get_params() is None:
                    continue
                self.recursive_get_attachments(p)
        elif container.get_content_type() == 'multipart/alternative':
            # Alternative is not an attachment (we decide).
            # It's typically plaintext + html
            return
        elif container.is_multipart():
            # Other kinds of multipart, such as multipart/signed...
            return
        else:
            # Not a multipart.
            # Exclude specific content types
            if container.get_content_type() == 'application/pgp-signature':
                return
            # For now, accept anything not text/plain
            if container.get_content_type() != 'text/plain':
                self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True)))
                return
            # It's a text/plain, it might be worthwhile.
            # If it has a name, we consider it an attachment
            if not container.get_params():
                return
            for k, v in container.get_params():
                if k == 'name' and v != '':
                    # Yes, it has a name
                    self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True)))
                    return
            # No name, and text/plain, so ignore it
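    # Illustration (not in the original module): each entry appended above is
    # a 3-tuple, so after analyze() has run,
    #   self.attachments == [(filename_or_None, content_type, decoded_bytes), ...]
    # e.g. [(u'patch.diff', 'text/x-diff', '--- a/foo.c\n+++ b/foo.c\n...')]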
    re_msgid = re.compile(r'^\s*<(.*)>\s*')

    def clean_messageid(self, messageid, ignorebroken=False):
        m = self.re_msgid.match(messageid)
        if not m:
            if ignorebroken:
                log.status("Could not parse messageid '%s', ignoring it" % messageid)
                return None
            raise IgnorableException("Could not parse message id '%s'" % messageid)
        return m.group(1)

    # _date_multi_re = re.compile(r' \((\w+\s\w+(\s+\w+)*|)\)$')
    # Now using [^\s] instead of \w, to work with japanese chars
    _date_multi_re = re.compile(r' \(([^\s]+\s[^\s]+(\s+[^\s]+)*|)\)$')
    _date_multi_re2 = re.compile(r' ([\+-]\d{4}) \([^)]+\)$')
    _date_multiminus_re = re.compile(r' -(-\d+)$')

    def forgiving_date_decode(self, d):
        # Strange timezones requiring manual adjustments
        if d.endswith('-7700 (EST)'):
            d = d.replace('-7700 (EST)', 'EST')
        if d.endswith('+6700 (EST)'):
            d = d.replace('+6700 (EST)', 'EST')
        if d.endswith('+-4-30'):
            d = d.replace('+-4-30', '+0430')
        if d.endswith('+1.00'):
            d = d.replace('+1.00', '+0100')
        if d.endswith('+-100'):
            d = d.replace('+-100', '+0100')
        if d.endswith('+500'):
            d = d.replace('+500', '+0500')
        if d.endswith('-500'):
            d = d.replace('-500', '-0500')
        if d.endswith('-700'):
            d = d.replace('-700', '-0700')
        if d.endswith('-800'):
            d = d.replace('-800', '-0800')
        if d.endswith('+05-30'):
            d = d.replace('+05-30', '+0530')
        if d.endswith('Mexico/General'):
            d = d.replace('Mexico/General', 'CDT')
        if d.endswith('Pacific Daylight Time'):
            d = d.replace('Pacific Daylight Time', 'PDT')
        if d.endswith(' ZE2'):
            d = d.replace(' ZE2', ' +0200')
        if d.find('-Juin-') > 0:
            d = d.replace('-Juin-', '-Jun-')
        if d.find('-Juil-') > 0:
            d = d.replace('-Juil-', '-Jul-')
        if self._date_multiminus_re.search(d):
            d = self._date_multiminus_re.sub(' \\1', d)

        # We have a number of dates in the format
        # " +0200 (MET DST)"
        # or similar. The problem comes from the space within the
        # parenthesis, or from the contents of the parenthesis being
        # completely empty
        if self._date_multi_re.search(d):
            d = self._date_multi_re.sub('', d)

        # If the spec is instead
        # " +0200 (...)"
        # of any kind, we can just remove what's in the (), because the
        # parser is just going to rely on the fixed offset anyway.
        if self._date_multi_re2.search(d):
            d = self._date_multi_re2.sub(' \\1', d)

        try:
            dp = dateutil.parser.parse(d, fuzzy=True)
            # Some offsets are >16 hours, which postgresql will not
            # (for good reasons) accept
            if dp.utcoffset() and abs(dp.utcoffset().days * (24 * 60 * 60) + dp.utcoffset().seconds) > 60 * 60 * 16 - 1:
                # Convert it to a UTC timestamp using Python. It will give
                # us the right time, but the wrong timezone. Should be
                # enough...
                dp = datetime.datetime(*dp.utctimetuple()[:6])
            return dp
        except Exception, e:
            raise IgnorableException("Failed to parse date '%s': %s" % (d, e))
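    # Worked examples (illustration, not from the original module):
    #   '... 12:00:00 +0200 (MET DST)' - _date_multi_re strips the
    #       parenthesized comment, because it contains a space
    #   '... 12:00:00 +0200 (CEST)'    - _date_multi_re2 reduces it to the
    #       fixed offset '+0200', which is all the parser relies on anyway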
    def _decode_mime_header(self, hdr):
        if hdr is None:
            return None

        # Per http://bugs.python.org/issue504152 (and lots of testing), it
        # seems we must get rid of the sequence \n\t at least in the header.
        # If we do this *before* doing any MIME decoding, we should be safe
        # against anybody *actually* putting that sequence in the header
        # (since we won't match the encoded contents)
        hdr = hdr.replace("\n\t", "")

        try:
            return " ".join([unicode(s, charset and self.clean_charset(charset) or 'us-ascii', errors='ignore') for s, charset in decode_header(hdr)])
        except HeaderParseError, e:
            # Parser error is typically someone specifying an encoding,
            # but then not actually using that encoding. We'll do the best
            # we can, which is cut it down to ascii and ignore errors
            return unicode(hdr, 'us-ascii', errors='ignore')

    def decode_mime_header(self, hdr):
        try:
            return self._decode_mime_header(hdr)
        except LookupError, e:
            raise IgnorableException("Failed to decode header value '%s': %s" % (hdr, e))
        except ValueError, ve:
            raise IgnorableException("Failed to decode header value '%s': %s" % (hdr, ve))

    def get_mandatory(self, fieldname):
        try:
            x = self.msg[fieldname]
            if x is None:
                raise Exception()
            return x
        except:
            raise IgnorableException("Mandatory field '%s' is missing" % fieldname)

    def get_optional(self, fieldname):
        try:
            return self.msg[fieldname]
        except:
            return None

    def html_clean(self, html):
        # First we pass it through tidy
        html = unicode(str(tidy.parseString(html.encode('utf8'),
                                            drop_proprietary_attributes=1,
                                            alt_text='',
                                            hide_comments=1,
                                            output_xhtml=1,
                                            show_body_only=1,
                                            clean=1,
                                            char_encoding='utf8')),
                       'utf8')

        try:
            cleaner = HTMLCleaner()
            cleaner.feed(html)
            return cleaner.get_text()
        except HTMLParseError, e:
            # Failed to parse the html, thus failed to clean it, so we must
            # give up...
            return None


class HTMLCleaner(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.io = StringIO.StringIO()

    def get_text(self):
        return self.io.getvalue()

    def handle_data(self, data):
        self.io.write(data)

    def handle_starttag(self, tag, attrs):
        if tag == "p" or tag == "br":
            self.io.write("\n")
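
# A minimal smoke-test sketch (not part of the original module): parse one
# raw RFC 2822 message from stdin and dump the analyzed fields. Feeding the
# message on stdin is an assumption for illustration; any file object works
# with parse().
if __name__ == '__main__':
    import sys
    ap = ArchivesParser()
    ap.parse(sys.stdin)
    try:
        ap.analyze()
    except IgnorableException, e:
        print "Ignoring message: %s" % e
        sys.exit(1)
    print "Message-ID: %s" % ap.msgid
    print "Subject:    %s" % ap.subject
    print "Date:       %s" % ap.date
    print "Parents:    %s" % ", ".join(ap.parents)
    for filename, contenttype, payload in ap.attachments:
        print "Attachment: %s (%s, %s bytes)" % (filename, contenttype, len(payload or ''))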