| author | Magnus Hagander | 2019-01-03 20:15:38 +0000 |
|---|---|---|
| committer | Magnus Hagander | 2019-01-04 11:24:06 +0000 |
| commit | 3fb227230c145c828888aa2e7c5d8b9a8c0760a0 (patch) | |
| tree | 2eeba42da158e6e00c06d537dc6da3a895e39b6c /loader/lib | |
| parent | 1e173c362aa105ab4397fb77f8c693a1e01efa11 (diff) | |
Tabs to 4 spaces
PEP 8 standard for indentation
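The change itself is mechanical, so it is the kind of edit that can be scripted. As a rough illustration only — the commit does not say what tool was used, and the `retab` helper below is hypothetical — leading tabs can be expanded to 4-space indents like this:

```python
import pathlib

def retab(path, width=4):
    # Hypothetical helper, not part of this commit: replace each leading
    # tab with `width` spaces, leaving the rest of the line untouched.
    out = []
    for line in path.read_text().splitlines(keepends=True):
        stripped = line.lstrip("\t")
        ntabs = len(line) - len(stripped)
        out.append(" " * (width * ntabs) + stripped)
    path.write_text("".join(out))

for py in pathlib.Path("loader/lib").glob("*.py"):
    retab(py)
```

Because only the indentation changes, every modified line shows up once as a deletion and once as an insertion, which is why the diffstat below is symmetric.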
Diffstat (limited to 'loader/lib')
| -rw-r--r-- | loader/lib/exception.py | 2 |
| -rw-r--r-- | loader/lib/log.py | 44 |
| -rw-r--r-- | loader/lib/mbox.py | 68 |
| -rw-r--r-- | loader/lib/parser.py | 1102 |
| -rw-r--r-- | loader/lib/storage.py | 556 |
| -rw-r--r-- | loader/lib/varnish.py | 48 |
6 files changed, 910 insertions, 910 deletions
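The matching counts (910 insertions, 910 deletions) are what a pure whitespace change looks like. If desired, this can be checked with `git diff -w 3fb2272^ 3fb2272`, which ignores whitespace differences and should print nothing for this commit.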
diff --git a/loader/lib/exception.py b/loader/lib/exception.py index 02172a2..54a9efc 100644 --- a/loader/lib/exception.py +++ b/loader/lib/exception.py @@ -1,2 +1,2 @@ class IgnorableException(Exception): - pass + pass diff --git a/loader/lib/log.py b/loader/lib/log.py index 5b6379a..48722c9 100644 --- a/loader/lib/log.py +++ b/loader/lib/log.py @@ -1,33 +1,33 @@ class Log(object): - def __init__(self): - self.verbose = False + def __init__(self): + self.verbose = False - def set(self, verbose): - self.verbose = verbose + def set(self, verbose): + self.verbose = verbose - def status(self, msg): - if self.verbose: - print(msg) + def status(self, msg): + if self.verbose: + print(msg) - def log(self, msg): - print(msg) + def log(self, msg): + print(msg) - def error(self, msg): - print(msg) + def error(self, msg): + print(msg) - def print_status(self): - opstatus.print_status() + def print_status(self): + opstatus.print_status() class OpStatus(object): - def __init__(self): - self.stored = 0 - self.dupes = 0 - self.tagged = 0 - self.failed = 0 - self.overwritten = 0 - - def print_status(self): - print("%s stored, %s new-list tagged, %s dupes, %s failed, %s overwritten" % (self.stored, self.tagged, self.dupes, self.failed, self.overwritten)) + def __init__(self): + self.stored = 0 + self.dupes = 0 + self.tagged = 0 + self.failed = 0 + self.overwritten = 0 + + def print_status(self): + print("%s stored, %s new-list tagged, %s dupes, %s failed, %s overwritten" % (self.stored, self.tagged, self.dupes, self.failed, self.overwritten)) log = Log() diff --git a/loader/lib/mbox.py b/loader/lib/mbox.py index 77c83b0..c097e72 100644 --- a/loader/lib/mbox.py +++ b/loader/lib/mbox.py @@ -11,41 +11,41 @@ SEPARATOR = "ABCARCHBREAK123" * 50 bSEPARATOR = bytes(SEPARATOR, 'ascii') class MailboxBreakupParser(object): - def __init__(self, fn): - self.EOF = False + def __init__(self, fn): + self.EOF = False - if fn.endswith(".gz"): - cat = "zcat" - else: - cat = "cat" - cmd = "%s %s | formail -s /bin/sh -c 'cat && echo %s'" % (cat, fn, SEPARATOR) - self.pipe = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE) + if fn.endswith(".gz"): + cat = "zcat" + else: + cat = "cat" + cmd = "%s %s | formail -s /bin/sh -c 'cat && echo %s'" % (cat, fn, SEPARATOR) + self.pipe = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE) - def returncode(self): - self.pipe.wait() - return self.pipe.returncode + def returncode(self): + self.pipe.wait() + return self.pipe.returncode - def stderr_output(self): - return self.pipe.stderr.read() + def stderr_output(self): + return self.pipe.stderr.read() - def __next__(self): - sio = BytesIO() - while True: - try: - l = next(self.pipe.stdout) - except StopIteration: - # End of file! - self.EOF = True - if sio.tell() == 0: - # Nothing read yet, so return None instead of an empty - # bytesio - return None - sio.seek(0) - return sio - if l.rstrip() == bSEPARATOR: - # Reached a separator. Meaning we're not at end of file, - # but we're at end of message. - sio.seek(0) - return sio - # Otherwise, append it to where we are now - sio.write(l) + def __next__(self): + sio = BytesIO() + while True: + try: + l = next(self.pipe.stdout) + except StopIteration: + # End of file! + self.EOF = True + if sio.tell() == 0: + # Nothing read yet, so return None instead of an empty + # bytesio + return None + sio.seek(0) + return sio + if l.rstrip() == bSEPARATOR: + # Reached a separator. Meaning we're not at end of file, + # but we're at end of message. 
+ sio.seek(0) + return sio + # Otherwise, append it to where we are now + sio.write(l) diff --git a/loader/lib/parser.py b/loader/lib/parser.py index b97c8b3..81192da 100644 --- a/loader/lib/parser.py +++ b/loader/lib/parser.py @@ -14,560 +14,560 @@ from lib.exception import IgnorableException from lib.log import log class ArchivesParser(object): - def __init__(self): - self.parser = BytesParser(policy=compat32) - - def parse(self, stream): - self.rawtxt = stream.read() - self.msg = self.parser.parse(io.BytesIO(self.rawtxt)) - - def is_msgid(self, msgid): - # Look for a specific messageid. This means we might parse it twice, - # but so be it. Any exception means we know it's not this one... - try: - if self.clean_messageid(self.decode_mime_header(self.get_mandatory('Message-ID'))) == msgid: - return True - except Exception as e: - return False - - def analyze(self, date_override=None): - self.msgid = self.clean_messageid(self.decode_mime_header(self.get_mandatory('Message-ID'))) - self._from = self.decode_mime_header(self.get_mandatory('From'), True) - self.to = self.decode_mime_header(self.get_optional('To'), True) - self.cc = self.decode_mime_header(self.get_optional('CC'), True) - self.subject = self.decode_mime_header(self.get_optional('Subject')) - if date_override: - self.date = self.forgiving_date_decode(date_override) - else: - self.date = self.forgiving_date_decode(self.decode_mime_header(self.get_mandatory('Date'))) - self.bodytxt = self.get_body() - self.attachments = [] - self.get_attachments() - if len(self.attachments) > 0: - log.status("Found %s attachments" % len(self.attachments)) - - # Build an list of the message id's we are interested in - self.parents = [] - # The first one is in-reply-to, if it exists - if self.get_optional('in-reply-to'): - m = self.clean_messageid(self.decode_mime_header(self.get_optional('in-reply-to')), True) - if m: - self.parents.append(m) - - # Then we add all References values, in backwards order - if self.get_optional('references'): - cleaned_msgids = [self.clean_messageid(x, True) for x in reversed(self.decode_mime_header(self.get_optional('references')).split())] - # Can't do this with a simple self.parents.extend() due to broken - # mailers that add the same reference more than once. And we can't - # use a set() to make it unique, because order is very important - for m in cleaned_msgids: - if m and not m in self.parents: - self.parents.append(m) - - - def clean_charset(self, charset): - lcharset = charset.lower() - if lcharset == 'unknown-8bit' or lcharset == 'x-unknown' or lcharset == 'unknown': - # Special case where we don't know... We'll assume - # us-ascii and use replacements - return 'us-ascii' - if lcharset == '0' or lcharset == 'x-user-defined' or lcharset == '_autodetect_all' or lcharset == 'default_charset': - # Seriously broken charset definitions, map to us-ascii - # and throw away the rest with replacements - return 'us-ascii' - if lcharset == 'x-gbk': - # Some MUAs set it to x-gbk, but there is a valid - # declaratoin as gbk... - return 'gbk' - if lcharset == 'iso-8859-8-i': - # -I is a special logical version, but should be the - # same charset - return 'iso-8859-8' - if lcharset == 'windows-874': - # This is an alias for iso-8859-11 - return 'iso-8859-11' - if lcharset == 'iso-88-59-1' or lcharset == 'iso-8858-1': - # Strange way of saying 8859.... 
- return 'iso-8859-1' - if lcharset == 'iso885915': - return 'iso-8859-15' - if lcharset == 'iso-latin-2': - return 'iso-8859-2' - if lcharset == 'iso-850': - # Strange spelling of cp850 (windows charset) - return 'cp850' - if lcharset == 'koi8r': - return 'koi8-r' - if lcharset == 'cp 1252': - return 'cp1252' - if lcharset == 'iso-8859-1,iso-8859-2' or lcharset == 'iso-8859-1:utf8:us-ascii': - # Why did this show up more than once?! - return 'iso-8859-1' - if lcharset == 'x-windows-949': - return 'ms949' - if lcharset == 'pt_pt' or lcharset == 'de_latin' or lcharset == 'de': - # This is a locale, and not a charset, but most likely it's this one - return 'iso-8859-1' - if lcharset == 'iso-8858-15': - # How is this a *common* mistake? - return 'iso-8859-15' - if lcharset == 'macintosh': - return 'mac_roman' - if lcharset == 'cn-big5': - return 'big5' - if lcharset == 'x-unicode-2-0-utf-7': - return 'utf-7' - if lcharset == 'tscii': - # No support for this charset :S Map it down to ascii - # and throw away all the rest. sucks, but we have to - return 'us-ascii' - return charset - - def get_payload_as_unicode(self, msg): - try: - b = msg.get_payload(decode=True) - except AssertionError: - # Badly encoded data can throw an exception here, where the python - # libraries fail to handle it and enters a cannot-happen path. - # In which case we just ignore it and hope for a better MIME part later. - b = None - - if b: - # Find out if there is a charset - charset = None - params = msg.get_params() - if not params: - # No content-type, so we assume us-ascii - return str(b, 'us-ascii', errors='ignore') - for k,v in params: - if k.lower() == 'charset': - charset = v - break - if charset: - try: - return str(b, self.clean_charset(charset), errors='ignore') - except LookupError as e: - raise IgnorableException("Failed to get unicode payload: %s" % e) - else: - # XXX: reasonable default? - return str(b, errors='ignore') - # Return None or empty string, depending on what we got back - return b - - # Regular expression matching the PostgreSQL custom mail footer that - # is appended to all emails. - _re_footer = re.compile('(.*)--\s+\nSent via [^\s]+ mailing list \([^\)]+\)\nTo make changes to your subscription:\nhttp://www\.postgresql\.org/mailpref/[^\s]+\s*$', re.DOTALL) - def get_body(self): - b = self._get_body() - if b: - # Python bug 9133, allows unicode surrogate pairs - which PostgreSQL will - # later reject.. - if b.find('\udbff\n\udef8'): - b = b.replace('\udbff\n\udef8', '') - - # Remove postgres specific mail footer - if it's there - m = self._re_footer.match(b) - if m: - b = m.group(1) - - # Sometimes we end up with a trailing \0 when decoding long strings, so - # replace it if it's there. - # In fact, replace it everywhere, since it can also turn up in the middle - # of a text when it's a really broken decoding. - b = b.replace('\0', '') - - return b - - def _get_body(self): - # This is where the magic happens - try to figure out what the body - # of this message should render as. - hasempty = False - - # First see if this is a single-part message that we can just - # decode and go. - b = self.get_payload_as_unicode(self.msg) - if b: return b - if b == '': - # We found something, but it was empty. We'll keep looking as - # there might be something better available, but make a note - # that empty exists. - hasempty = True - - # Ok, it's multipart. Find the first part that is text/plain, - # and use that one. 
Do this recursively, since we may have something - # like: - # multipart/mixed: - # multipart/alternative: - # text/plain - # text/html - # application/octet-stream (attachment) - b = self.recursive_first_plaintext(self.msg) - if b: return b - if b == '': - hasempty = True - - # Couldn't find a plaintext. Look for the first HTML in that case. - # Fallback, but what can we do at this point... - b = self.recursive_first_plaintext(self.msg, True) - if b: - b = self.html_clean(b) - if b: return b - if b == '' or b is None: - hasempty = True - - if hasempty: - log.status('Found empty body in %s' % self.msgid) - return '' - raise IgnorableException("Don't know how to read the body from %s" % self.msgid) - - def recursive_first_plaintext(self, container, html_instead=False): - pl = container.get_payload() - if isinstance(pl, str): - # This was not a multipart, but it leaked... Give up! - return None - for p in pl: - if p.get_params() == None: - # MIME multipart/mixed, but no MIME type on the part - log.status("Found multipart/mixed in message '%s', but no MIME type on part. Trying text/plain." % self.msgid) - return self.get_payload_as_unicode(p) - if p.get_params()[0][0].lower() == 'text/plain': - # Don't include it if it looks like an attachment - if 'Content-Disposition' in p and p['Content-Disposition'].startswith('attachment'): - continue - return self.get_payload_as_unicode(p) - if html_instead and p.get_params()[0][0].lower() == 'text/html': - # Don't include it if it looks like an attachment - if 'Content-Disposition' in p and p['Content-Disposition'].startswith('attachment'): - continue - return self.get_payload_as_unicode(p) - if p.is_multipart(): - b = self.recursive_first_plaintext(p, html_instead) - if b or b == '': return b - - # Yikes, nothing here! Hopefully we'll find something when - # we continue looping at a higher level. - return None - - def get_attachments(self): - self.attachments_found_first_plaintext = False - self.recursive_get_attachments(self.msg) - - # Clean a filenames encoding and return it as a unicode string - def _clean_filename_encoding(self, filename): - # If this is a header-encoded filename, start by decoding that - if filename.startswith('=?'): - decoded, encoding = decode_header(filename)[0] - return str(decoded, encoding, errors='ignore') - - # If it's already unicode, just return it - if isinstance(filename, str): - return filename - - # Anything that's not UTF8, we just get rid of. We can live with - # filenames slightly mangled in this case. - return str(filename, 'utf-8', errors='ignore') - - def _extract_filename(self, container): - # Try to get the filename for an attachment in the container. - # If the standard library can figure one out, use that one. - f = container.get_filename() - if f: return self._clean_filename_encoding(f) - - # Failing that, some mailers set Content-Description to the - # filename - if 'Content-Description' in container: - return self._clean_filename_encoding(container['Content-Description']) - return None - - def recursive_get_attachments(self, container): - # We start recursion in the "multipart" container if any - if container.get_content_type() == 'multipart/mixed' or container.get_content_type() == 'multipart/signed': - # Multipart - worth scanning into - if not container.is_multipart(): - # Wow, this is broken. It's multipart/mixed, but doesn't - # contain multiple parts. - # Since we're just looking for attachments, let's just - # ignore it... 
- return - for p in container.get_payload(): - if p.get_params() == None: - continue - self.recursive_get_attachments(p) - elif container.get_content_type() == 'multipart/alternative': - # Alternative is not an attachment (we decide) - # It's typilcally plantext + html - self.attachments_found_first_plaintext = True - return - elif container.is_multipart(): - # Other kinds of multipart, such as multipart/signed... - return - else: - # Not a multipart. - # Exclude specific contenttypes - if container.get_content_type() == 'application/pgp-signature': - return - if container.get_content_type() in ('application/pkcs7-signature', 'application/x-pkcs7-signature'): - return - # For now, accept anything not text/plain - if container.get_content_type() != 'text/plain': - try: - self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True))) - except AssertionError: - # Badly encoded data can throw an exception here, where the python - # libraries fail to handle it and enters a cannot-happen path. - # In which case we just ignore this attachment. - return - return - - # It's a text/plain, it might be worthwhile. - # If it has a name, we consider it an attachments - if not container.get_params(): - return - for k,v in container.get_params(): - if k=='name' and v != '': - # Yes, it has a name - try: - self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True))) - except AssertionError: - # Badly encoded data can throw an exception here, where the python - # libraries fail to handle it and enters a cannot-happen path. - # In which case we just ignore this attachment. - return - - return - - # If it's content-disposition=attachment, we also want to save it - if 'Content-Disposition' in container and container['Content-Disposition'].startswith('attachment'): - try: - self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True))) - except AssertionError: - # Badly encoded data can throw an exception here, where the python - # libraries fail to handle it and enters a cannot-happen path. - # In which case we just ignore this attachment. - return - - return - - # If we have already found one text/plain part, make all - # further text/plain parts attachments - if self.attachments_found_first_plaintext: - # However, this will also *always* catch the MIME part added - # by majordomo with the footer. So if that one is present, - # we need to explicitly exclude it again. - try: - b = container.get_payload(decode=True) - except AssertionError: - # Badly encoded data can throw an exception here, where the python - # libraries fail to handle it and enters a cannot-happen path. - # In which case we just ignore this attachment. - return - - if isinstance(b, str) and not self._re_footer.match(b): - # We know there is no name for this one - self.attachments.append((None, container.get_content_type(), b)) - return - - # Ok, so this was a plaintext that we ignored. Set the flag - # that we have now ignored one, so we'll make the next one - # an attachment. 
- self.attachments_found_first_plaintext = True - # No name, and text/plain, so ignore it - - re_msgid = re.compile('^\s*<(.*)>\s*') - def clean_messageid(self, messageid, ignorebroken=False): - m = self.re_msgid.match(messageid) - if not m: - if ignorebroken: - log.status("Could not parse messageid '%s', ignoring it" % messageid) - return None - raise IgnorableException("Could not parse message id '%s'" % messageid) - return m.groups(1)[0].replace(' ','') - -# _date_multi_re = re.compile(' \((\w+\s\w+(\s+\w+)*|)\)$') - # Now using [^\s] instead of \w, to work with japanese chars - _date_multi_re = re.compile(' \(([^\s]+\s[^\s]+(\s+[^\s]+)*|)\)$') - _date_multi_re2 = re.compile(' ([\+-]\d{4}) \([^)]+\)$') - _date_multiminus_re = re.compile(' -(-\d+)$') - _date_offsetnoplus_re = re.compile(' (\d{4})$') - def forgiving_date_decode(self, d): - if d.strip() == '': - raise IgnorableException("Failed to parse empty date") - # Strange timezones requiring manual adjustments - if d.endswith('-7700 (EST)'): - d = d.replace('-7700 (EST)', 'EST') - if d.endswith('+6700 (EST)'): - d = d.replace('+6700 (EST)', 'EST') - if d.endswith('+-4-30'): - d = d.replace('+-4-30', '+0430') - if d.endswith('+1.00'): - d = d.replace('+1.00', '+0100') - if d.endswith('+-100'): - d = d.replace('+-100', '+0100') - if d.endswith('+500'): - d = d.replace('+500', '+0500') - if d.endswith('-500'): - d = d.replace('-500', '-0500') - if d.endswith('-700'): - d = d.replace('-700', '-0700') - if d.endswith('-800'): - d = d.replace('-800', '-0800') - if d.endswith('+05-30'): - d = d.replace('+05-30', '+0530') - if d.endswith('+0-900'): - d = d.replace('+0-900', '-0900') - if d.endswith('Mexico/General'): - d = d.replace('Mexico/General','CDT') - if d.endswith('Pacific Daylight Time'): - d = d.replace('Pacific Daylight Time', 'PDT') - if d.endswith(' ZE2'): - d = d.replace(' ZE2',' +0200') - if d.find('-Juin-') > 0: - d = d.replace('-Juin-','-Jun-') - if d.find('-Juil-') > 0: - d = d.replace('-Juil-','-Jul-') - if d.find(' 0 (GMT)'): - d = d.replace(' 0 (GMT)',' +0000') - - if self._date_multiminus_re.search(d): - d = self._date_multiminus_re.sub(' \\1', d) - - if self._date_offsetnoplus_re.search(d): - d = self._date_offsetnoplus_re.sub('+\\1', d) - - - # We have a number of dates in the format - # "<full datespace> +0200 (MET DST)" - # or similar. The problem coming from the space within the - # parenthesis, or if the contents of the parenthesis is - # completely empty - if self._date_multi_re.search(d): - d = self._date_multi_re.sub('', d) - - # If the spec is instead - # "<full datespace> +0200 (...)" - # of any kind, we can just remove what's in the (), because the - # parser is just going to rely on the fixed offset anyway. - if self._date_multi_re2.search(d): - d = self._date_multi_re2.sub(' \\1', d) - - try: - dp = dateutil.parser.parse(d, fuzzy=True) - - # Some offsets are >16 hours, which postgresql will not - # (for good reasons) accept - if dp.utcoffset() and abs(dp.utcoffset().days * (24*60*60) + dp.utcoffset().seconds) > 60*60*16-1: - # Convert it to a UTC timestamp using Python. It will give - # us the right time, but the wrong timezone. Should be - # enough... 
- dp = datetime.datetime(*dp.utctimetuple()[:6]) - return dp - except Exception as e: - raise IgnorableException("Failed to parse date '%s': %s" % (d, e)) - - def _maybe_decode(self, s, charset): - if isinstance(s, str): - return s.strip(' ') - return str(s, charset and self.clean_charset(charset) or 'us-ascii', errors='ignore').strip(' ') - - # Workaround for broken quoting in some MUAs (see below) - _re_mailworkaround = re.compile('"(=\?[^\?]+\?[QB]\?[^\?]+\?=)"', re.IGNORECASE) - def _decode_mime_header(self, hdr, email_workaround): - if hdr == None: - return None - - # Per http://bugs.python.org/issue504152 (and lots of testing), it seems - # we must get rid of the sequence \n\t at least in the header. If we - # do this *before* doing any MIME decoding, we should be safe against - # anybody *actually* putting that sequence in the header (since we - # won't match the encoded contents) - hdr = hdr.replace("\n\t"," ") - - # In at least some cases, at least gmail (and possibly other MUAs) - # incorrectly put double quotes in the name/email field even when - # it's encoded. That's not allowed - they have to be escaped - but - # since there's a fair amount of those, we apply a regex to get - # rid of them. - m = self._re_mailworkaround.search(hdr) - if m: - hdr = self._re_mailworkaround.sub(r'\1', hdr) - - try: - return " ".join([self._maybe_decode(s, charset) for s, charset in decode_header(hdr)]) - except HeaderParseError as e: - # Parser error is typically someone specifying an encoding, - # but then not actually using that encoding. We'll do the best - # we can, which is cut it down to ascii and ignore errors - return str(hdr, 'us-ascii', errors='ignore').strip(' ') - - def decode_mime_header(self, hdr, email_workaround=False): - try: - if isinstance(hdr, Header): - hdr = hdr.encode() - - h = self._decode_mime_header(hdr, email_workaround) - if h: - return h.replace("\0", "") - return '' - except LookupError as e: - raise IgnorableException("Failed to decode header value '%s': %s" % (hdr, e)) - except ValueError as ve: - raise IgnorableException("Failed to decode header value '%s': %s" % (hdr, ve)) - - def get_mandatory(self, fieldname): - try: - x = self.msg[fieldname] - if x==None: - raise Exception() - return x - except: - raise IgnorableException("Mandatory field '%s' is missing" % fieldname) - - def get_optional(self, fieldname): - try: - return self.msg[fieldname] - except: - return '' - - def html_clean(self, html): - # First we pass it through tidy - (html, errors) = tidylib.tidy_document(html, - options={ - 'drop-proprietary-attributes': 1, - 'alt-text': '', - 'hide-comments': 1, - 'output-xhtml': 1, - 'show-body-only': 1, - 'clean': 1, - 'char-encoding': 'utf8', - 'show-warnings': 0, - 'show-info': 0, - }) - if errors: - print(("HTML tidy failed for %s!" % self.msgid)) - print(errors) - return None - - try: - cleaner = HTMLCleaner() - cleaner.feed(html) - return cleaner.get_text() - except Exception as e: - # Failed to parse the html, thus failed to clean it. so we must - # give up... - return None + def __init__(self): + self.parser = BytesParser(policy=compat32) + + def parse(self, stream): + self.rawtxt = stream.read() + self.msg = self.parser.parse(io.BytesIO(self.rawtxt)) + + def is_msgid(self, msgid): + # Look for a specific messageid. This means we might parse it twice, + # but so be it. Any exception means we know it's not this one... 
+ try: + if self.clean_messageid(self.decode_mime_header(self.get_mandatory('Message-ID'))) == msgid: + return True + except Exception as e: + return False + + def analyze(self, date_override=None): + self.msgid = self.clean_messageid(self.decode_mime_header(self.get_mandatory('Message-ID'))) + self._from = self.decode_mime_header(self.get_mandatory('From'), True) + self.to = self.decode_mime_header(self.get_optional('To'), True) + self.cc = self.decode_mime_header(self.get_optional('CC'), True) + self.subject = self.decode_mime_header(self.get_optional('Subject')) + if date_override: + self.date = self.forgiving_date_decode(date_override) + else: + self.date = self.forgiving_date_decode(self.decode_mime_header(self.get_mandatory('Date'))) + self.bodytxt = self.get_body() + self.attachments = [] + self.get_attachments() + if len(self.attachments) > 0: + log.status("Found %s attachments" % len(self.attachments)) + + # Build an list of the message id's we are interested in + self.parents = [] + # The first one is in-reply-to, if it exists + if self.get_optional('in-reply-to'): + m = self.clean_messageid(self.decode_mime_header(self.get_optional('in-reply-to')), True) + if m: + self.parents.append(m) + + # Then we add all References values, in backwards order + if self.get_optional('references'): + cleaned_msgids = [self.clean_messageid(x, True) for x in reversed(self.decode_mime_header(self.get_optional('references')).split())] + # Can't do this with a simple self.parents.extend() due to broken + # mailers that add the same reference more than once. And we can't + # use a set() to make it unique, because order is very important + for m in cleaned_msgids: + if m and not m in self.parents: + self.parents.append(m) + + + def clean_charset(self, charset): + lcharset = charset.lower() + if lcharset == 'unknown-8bit' or lcharset == 'x-unknown' or lcharset == 'unknown': + # Special case where we don't know... We'll assume + # us-ascii and use replacements + return 'us-ascii' + if lcharset == '0' or lcharset == 'x-user-defined' or lcharset == '_autodetect_all' or lcharset == 'default_charset': + # Seriously broken charset definitions, map to us-ascii + # and throw away the rest with replacements + return 'us-ascii' + if lcharset == 'x-gbk': + # Some MUAs set it to x-gbk, but there is a valid + # declaratoin as gbk... + return 'gbk' + if lcharset == 'iso-8859-8-i': + # -I is a special logical version, but should be the + # same charset + return 'iso-8859-8' + if lcharset == 'windows-874': + # This is an alias for iso-8859-11 + return 'iso-8859-11' + if lcharset == 'iso-88-59-1' or lcharset == 'iso-8858-1': + # Strange way of saying 8859.... + return 'iso-8859-1' + if lcharset == 'iso885915': + return 'iso-8859-15' + if lcharset == 'iso-latin-2': + return 'iso-8859-2' + if lcharset == 'iso-850': + # Strange spelling of cp850 (windows charset) + return 'cp850' + if lcharset == 'koi8r': + return 'koi8-r' + if lcharset == 'cp 1252': + return 'cp1252' + if lcharset == 'iso-8859-1,iso-8859-2' or lcharset == 'iso-8859-1:utf8:us-ascii': + # Why did this show up more than once?! + return 'iso-8859-1' + if lcharset == 'x-windows-949': + return 'ms949' + if lcharset == 'pt_pt' or lcharset == 'de_latin' or lcharset == 'de': + # This is a locale, and not a charset, but most likely it's this one + return 'iso-8859-1' + if lcharset == 'iso-8858-15': + # How is this a *common* mistake? 
+ return 'iso-8859-15' + if lcharset == 'macintosh': + return 'mac_roman' + if lcharset == 'cn-big5': + return 'big5' + if lcharset == 'x-unicode-2-0-utf-7': + return 'utf-7' + if lcharset == 'tscii': + # No support for this charset :S Map it down to ascii + # and throw away all the rest. sucks, but we have to + return 'us-ascii' + return charset + + def get_payload_as_unicode(self, msg): + try: + b = msg.get_payload(decode=True) + except AssertionError: + # Badly encoded data can throw an exception here, where the python + # libraries fail to handle it and enters a cannot-happen path. + # In which case we just ignore it and hope for a better MIME part later. + b = None + + if b: + # Find out if there is a charset + charset = None + params = msg.get_params() + if not params: + # No content-type, so we assume us-ascii + return str(b, 'us-ascii', errors='ignore') + for k,v in params: + if k.lower() == 'charset': + charset = v + break + if charset: + try: + return str(b, self.clean_charset(charset), errors='ignore') + except LookupError as e: + raise IgnorableException("Failed to get unicode payload: %s" % e) + else: + # XXX: reasonable default? + return str(b, errors='ignore') + # Return None or empty string, depending on what we got back + return b + + # Regular expression matching the PostgreSQL custom mail footer that + # is appended to all emails. + _re_footer = re.compile('(.*)--\s+\nSent via [^\s]+ mailing list \([^\)]+\)\nTo make changes to your subscription:\nhttp://www\.postgresql\.org/mailpref/[^\s]+\s*$', re.DOTALL) + def get_body(self): + b = self._get_body() + if b: + # Python bug 9133, allows unicode surrogate pairs - which PostgreSQL will + # later reject.. + if b.find('\udbff\n\udef8'): + b = b.replace('\udbff\n\udef8', '') + + # Remove postgres specific mail footer - if it's there + m = self._re_footer.match(b) + if m: + b = m.group(1) + + # Sometimes we end up with a trailing \0 when decoding long strings, so + # replace it if it's there. + # In fact, replace it everywhere, since it can also turn up in the middle + # of a text when it's a really broken decoding. + b = b.replace('\0', '') + + return b + + def _get_body(self): + # This is where the magic happens - try to figure out what the body + # of this message should render as. + hasempty = False + + # First see if this is a single-part message that we can just + # decode and go. + b = self.get_payload_as_unicode(self.msg) + if b: return b + if b == '': + # We found something, but it was empty. We'll keep looking as + # there might be something better available, but make a note + # that empty exists. + hasempty = True + + # Ok, it's multipart. Find the first part that is text/plain, + # and use that one. Do this recursively, since we may have something + # like: + # multipart/mixed: + # multipart/alternative: + # text/plain + # text/html + # application/octet-stream (attachment) + b = self.recursive_first_plaintext(self.msg) + if b: return b + if b == '': + hasempty = True + + # Couldn't find a plaintext. Look for the first HTML in that case. + # Fallback, but what can we do at this point... 
+ b = self.recursive_first_plaintext(self.msg, True) + if b: + b = self.html_clean(b) + if b: return b + if b == '' or b is None: + hasempty = True + + if hasempty: + log.status('Found empty body in %s' % self.msgid) + return '' + raise IgnorableException("Don't know how to read the body from %s" % self.msgid) + + def recursive_first_plaintext(self, container, html_instead=False): + pl = container.get_payload() + if isinstance(pl, str): + # This was not a multipart, but it leaked... Give up! + return None + for p in pl: + if p.get_params() == None: + # MIME multipart/mixed, but no MIME type on the part + log.status("Found multipart/mixed in message '%s', but no MIME type on part. Trying text/plain." % self.msgid) + return self.get_payload_as_unicode(p) + if p.get_params()[0][0].lower() == 'text/plain': + # Don't include it if it looks like an attachment + if 'Content-Disposition' in p and p['Content-Disposition'].startswith('attachment'): + continue + return self.get_payload_as_unicode(p) + if html_instead and p.get_params()[0][0].lower() == 'text/html': + # Don't include it if it looks like an attachment + if 'Content-Disposition' in p and p['Content-Disposition'].startswith('attachment'): + continue + return self.get_payload_as_unicode(p) + if p.is_multipart(): + b = self.recursive_first_plaintext(p, html_instead) + if b or b == '': return b + + # Yikes, nothing here! Hopefully we'll find something when + # we continue looping at a higher level. + return None + + def get_attachments(self): + self.attachments_found_first_plaintext = False + self.recursive_get_attachments(self.msg) + + # Clean a filenames encoding and return it as a unicode string + def _clean_filename_encoding(self, filename): + # If this is a header-encoded filename, start by decoding that + if filename.startswith('=?'): + decoded, encoding = decode_header(filename)[0] + return str(decoded, encoding, errors='ignore') + + # If it's already unicode, just return it + if isinstance(filename, str): + return filename + + # Anything that's not UTF8, we just get rid of. We can live with + # filenames slightly mangled in this case. + return str(filename, 'utf-8', errors='ignore') + + def _extract_filename(self, container): + # Try to get the filename for an attachment in the container. + # If the standard library can figure one out, use that one. + f = container.get_filename() + if f: return self._clean_filename_encoding(f) + + # Failing that, some mailers set Content-Description to the + # filename + if 'Content-Description' in container: + return self._clean_filename_encoding(container['Content-Description']) + return None + + def recursive_get_attachments(self, container): + # We start recursion in the "multipart" container if any + if container.get_content_type() == 'multipart/mixed' or container.get_content_type() == 'multipart/signed': + # Multipart - worth scanning into + if not container.is_multipart(): + # Wow, this is broken. It's multipart/mixed, but doesn't + # contain multiple parts. + # Since we're just looking for attachments, let's just + # ignore it... + return + for p in container.get_payload(): + if p.get_params() == None: + continue + self.recursive_get_attachments(p) + elif container.get_content_type() == 'multipart/alternative': + # Alternative is not an attachment (we decide) + # It's typilcally plantext + html + self.attachments_found_first_plaintext = True + return + elif container.is_multipart(): + # Other kinds of multipart, such as multipart/signed... + return + else: + # Not a multipart. 
+ # Exclude specific contenttypes + if container.get_content_type() == 'application/pgp-signature': + return + if container.get_content_type() in ('application/pkcs7-signature', 'application/x-pkcs7-signature'): + return + # For now, accept anything not text/plain + if container.get_content_type() != 'text/plain': + try: + self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True))) + except AssertionError: + # Badly encoded data can throw an exception here, where the python + # libraries fail to handle it and enters a cannot-happen path. + # In which case we just ignore this attachment. + return + return + + # It's a text/plain, it might be worthwhile. + # If it has a name, we consider it an attachments + if not container.get_params(): + return + for k,v in container.get_params(): + if k=='name' and v != '': + # Yes, it has a name + try: + self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True))) + except AssertionError: + # Badly encoded data can throw an exception here, where the python + # libraries fail to handle it and enters a cannot-happen path. + # In which case we just ignore this attachment. + return + + return + + # If it's content-disposition=attachment, we also want to save it + if 'Content-Disposition' in container and container['Content-Disposition'].startswith('attachment'): + try: + self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True))) + except AssertionError: + # Badly encoded data can throw an exception here, where the python + # libraries fail to handle it and enters a cannot-happen path. + # In which case we just ignore this attachment. + return + + return + + # If we have already found one text/plain part, make all + # further text/plain parts attachments + if self.attachments_found_first_plaintext: + # However, this will also *always* catch the MIME part added + # by majordomo with the footer. So if that one is present, + # we need to explicitly exclude it again. + try: + b = container.get_payload(decode=True) + except AssertionError: + # Badly encoded data can throw an exception here, where the python + # libraries fail to handle it and enters a cannot-happen path. + # In which case we just ignore this attachment. + return + + if isinstance(b, str) and not self._re_footer.match(b): + # We know there is no name for this one + self.attachments.append((None, container.get_content_type(), b)) + return + + # Ok, so this was a plaintext that we ignored. Set the flag + # that we have now ignored one, so we'll make the next one + # an attachment. 
+ self.attachments_found_first_plaintext = True + # No name, and text/plain, so ignore it + + re_msgid = re.compile('^\s*<(.*)>\s*') + def clean_messageid(self, messageid, ignorebroken=False): + m = self.re_msgid.match(messageid) + if not m: + if ignorebroken: + log.status("Could not parse messageid '%s', ignoring it" % messageid) + return None + raise IgnorableException("Could not parse message id '%s'" % messageid) + return m.groups(1)[0].replace(' ','') + +# _date_multi_re = re.compile(' \((\w+\s\w+(\s+\w+)*|)\)$') + # Now using [^\s] instead of \w, to work with japanese chars + _date_multi_re = re.compile(' \(([^\s]+\s[^\s]+(\s+[^\s]+)*|)\)$') + _date_multi_re2 = re.compile(' ([\+-]\d{4}) \([^)]+\)$') + _date_multiminus_re = re.compile(' -(-\d+)$') + _date_offsetnoplus_re = re.compile(' (\d{4})$') + def forgiving_date_decode(self, d): + if d.strip() == '': + raise IgnorableException("Failed to parse empty date") + # Strange timezones requiring manual adjustments + if d.endswith('-7700 (EST)'): + d = d.replace('-7700 (EST)', 'EST') + if d.endswith('+6700 (EST)'): + d = d.replace('+6700 (EST)', 'EST') + if d.endswith('+-4-30'): + d = d.replace('+-4-30', '+0430') + if d.endswith('+1.00'): + d = d.replace('+1.00', '+0100') + if d.endswith('+-100'): + d = d.replace('+-100', '+0100') + if d.endswith('+500'): + d = d.replace('+500', '+0500') + if d.endswith('-500'): + d = d.replace('-500', '-0500') + if d.endswith('-700'): + d = d.replace('-700', '-0700') + if d.endswith('-800'): + d = d.replace('-800', '-0800') + if d.endswith('+05-30'): + d = d.replace('+05-30', '+0530') + if d.endswith('+0-900'): + d = d.replace('+0-900', '-0900') + if d.endswith('Mexico/General'): + d = d.replace('Mexico/General','CDT') + if d.endswith('Pacific Daylight Time'): + d = d.replace('Pacific Daylight Time', 'PDT') + if d.endswith(' ZE2'): + d = d.replace(' ZE2',' +0200') + if d.find('-Juin-') > 0: + d = d.replace('-Juin-','-Jun-') + if d.find('-Juil-') > 0: + d = d.replace('-Juil-','-Jul-') + if d.find(' 0 (GMT)'): + d = d.replace(' 0 (GMT)',' +0000') + + if self._date_multiminus_re.search(d): + d = self._date_multiminus_re.sub(' \\1', d) + + if self._date_offsetnoplus_re.search(d): + d = self._date_offsetnoplus_re.sub('+\\1', d) + + + # We have a number of dates in the format + # "<full datespace> +0200 (MET DST)" + # or similar. The problem coming from the space within the + # parenthesis, or if the contents of the parenthesis is + # completely empty + if self._date_multi_re.search(d): + d = self._date_multi_re.sub('', d) + + # If the spec is instead + # "<full datespace> +0200 (...)" + # of any kind, we can just remove what's in the (), because the + # parser is just going to rely on the fixed offset anyway. + if self._date_multi_re2.search(d): + d = self._date_multi_re2.sub(' \\1', d) + + try: + dp = dateutil.parser.parse(d, fuzzy=True) + + # Some offsets are >16 hours, which postgresql will not + # (for good reasons) accept + if dp.utcoffset() and abs(dp.utcoffset().days * (24*60*60) + dp.utcoffset().seconds) > 60*60*16-1: + # Convert it to a UTC timestamp using Python. It will give + # us the right time, but the wrong timezone. Should be + # enough... 
+ dp = datetime.datetime(*dp.utctimetuple()[:6]) + return dp + except Exception as e: + raise IgnorableException("Failed to parse date '%s': %s" % (d, e)) + + def _maybe_decode(self, s, charset): + if isinstance(s, str): + return s.strip(' ') + return str(s, charset and self.clean_charset(charset) or 'us-ascii', errors='ignore').strip(' ') + + # Workaround for broken quoting in some MUAs (see below) + _re_mailworkaround = re.compile('"(=\?[^\?]+\?[QB]\?[^\?]+\?=)"', re.IGNORECASE) + def _decode_mime_header(self, hdr, email_workaround): + if hdr == None: + return None + + # Per http://bugs.python.org/issue504152 (and lots of testing), it seems + # we must get rid of the sequence \n\t at least in the header. If we + # do this *before* doing any MIME decoding, we should be safe against + # anybody *actually* putting that sequence in the header (since we + # won't match the encoded contents) + hdr = hdr.replace("\n\t"," ") + + # In at least some cases, at least gmail (and possibly other MUAs) + # incorrectly put double quotes in the name/email field even when + # it's encoded. That's not allowed - they have to be escaped - but + # since there's a fair amount of those, we apply a regex to get + # rid of them. + m = self._re_mailworkaround.search(hdr) + if m: + hdr = self._re_mailworkaround.sub(r'\1', hdr) + + try: + return " ".join([self._maybe_decode(s, charset) for s, charset in decode_header(hdr)]) + except HeaderParseError as e: + # Parser error is typically someone specifying an encoding, + # but then not actually using that encoding. We'll do the best + # we can, which is cut it down to ascii and ignore errors + return str(hdr, 'us-ascii', errors='ignore').strip(' ') + + def decode_mime_header(self, hdr, email_workaround=False): + try: + if isinstance(hdr, Header): + hdr = hdr.encode() + + h = self._decode_mime_header(hdr, email_workaround) + if h: + return h.replace("\0", "") + return '' + except LookupError as e: + raise IgnorableException("Failed to decode header value '%s': %s" % (hdr, e)) + except ValueError as ve: + raise IgnorableException("Failed to decode header value '%s': %s" % (hdr, ve)) + + def get_mandatory(self, fieldname): + try: + x = self.msg[fieldname] + if x==None: + raise Exception() + return x + except: + raise IgnorableException("Mandatory field '%s' is missing" % fieldname) + + def get_optional(self, fieldname): + try: + return self.msg[fieldname] + except: + return '' + + def html_clean(self, html): + # First we pass it through tidy + (html, errors) = tidylib.tidy_document(html, + options={ + 'drop-proprietary-attributes': 1, + 'alt-text': '', + 'hide-comments': 1, + 'output-xhtml': 1, + 'show-body-only': 1, + 'clean': 1, + 'char-encoding': 'utf8', + 'show-warnings': 0, + 'show-info': 0, + }) + if errors: + print(("HTML tidy failed for %s!" % self.msgid)) + print(errors) + return None + + try: + cleaner = HTMLCleaner() + cleaner.feed(html) + return cleaner.get_text() + except Exception as e: + # Failed to parse the html, thus failed to clean it. so we must + # give up... 
+ return None class HTMLCleaner(HTMLParser): - def __init__(self): - HTMLParser.__init__(self) - self.io = io.StringIO() + def __init__(self): + HTMLParser.__init__(self) + self.io = io.StringIO() - def get_text(self): - return self.io.getvalue() + def get_text(self): + return self.io.getvalue() - def handle_data(self, data): - self.io.write(data) + def handle_data(self, data): + self.io.write(data) - def handle_starttag(self, tag, attrs): - if tag == "p" or tag == "br": - self.io.write("\n") + def handle_starttag(self, tag, attrs): + if tag == "p" or tag == "br": + self.io.write("\n") diff --git a/loader/lib/storage.py b/loader/lib/storage.py index 8962b87..2303fee 100644 --- a/loader/lib/storage.py +++ b/loader/lib/storage.py @@ -5,307 +5,307 @@ from .parser import ArchivesParser from lib.log import log, opstatus class ArchivesParserStorage(ArchivesParser): - def __init__(self): - super(ArchivesParserStorage, self).__init__() - self.purges = set() + def __init__(self): + super(ArchivesParserStorage, self).__init__() + self.purges = set() - def purge_list(self, listid, year, month): - self.purges.add((int(listid), int(year), int(month))) + def purge_list(self, listid, year, month): + self.purges.add((int(listid), int(year), int(month))) - def purge_thread(self, threadid): - self.purges.add(int(threadid)) + def purge_thread(self, threadid): + self.purges.add(int(threadid)) - def store(self, conn, listid, overwrite=False): - curs = conn.cursor() + def store(self, conn, listid, overwrite=False): + curs = conn.cursor() - # Potentially add the information that there exists a mail for - # this month. We do that this early since we're always going to - # make the check anyway, and this keeps the code in one place.. - if not overwrite: - curs.execute("INSERT INTO list_months (listid, year, month) SELECT %(listid)s, %(year)s, %(month)s WHERE NOT EXISTS (SELECT listid FROM list_months WHERE listid=%(listid)s AND year=%(year)s AND month=%(month)s)", { - 'listid': listid, - 'year': self.date.year, - 'month': self.date.month, - }) + # Potentially add the information that there exists a mail for + # this month. We do that this early since we're always going to + # make the check anyway, and this keeps the code in one place.. 
+ if not overwrite: + curs.execute("INSERT INTO list_months (listid, year, month) SELECT %(listid)s, %(year)s, %(month)s WHERE NOT EXISTS (SELECT listid FROM list_months WHERE listid=%(listid)s AND year=%(year)s AND month=%(month)s)", { + 'listid': listid, + 'year': self.date.year, + 'month': self.date.month, + }) - curs.execute("SELECT threadid, EXISTS(SELECT threadid FROM list_threads lt WHERE lt.listid=%(listid)s AND lt.threadid=m.threadid), id FROM messages m WHERE m.messageid=%(messageid)s", { - 'messageid': self.msgid, - 'listid': listid, - }) - r = curs.fetchall() - if len(r) > 0: - # Has to be 1 row, since we have a unique index on id - if not r[0][1] and not overwrite: - log.status("Tagging message %s with list %s" % (self.msgid, listid)) - curs.execute("INSERT INTO list_threads (threadid, listid) VALUES (%(threadid)s, %(listid)s)", { - 'threadid': r[0][0], - 'listid': listid, - }) - opstatus.tagged += 1 - self.purge_list(listid, self.date.year, self.date.month) - self.purge_thread(r[0][0]) - else: - opstatus.dupes += 1 + curs.execute("SELECT threadid, EXISTS(SELECT threadid FROM list_threads lt WHERE lt.listid=%(listid)s AND lt.threadid=m.threadid), id FROM messages m WHERE m.messageid=%(messageid)s", { + 'messageid': self.msgid, + 'listid': listid, + }) + r = curs.fetchall() + if len(r) > 0: + # Has to be 1 row, since we have a unique index on id + if not r[0][1] and not overwrite: + log.status("Tagging message %s with list %s" % (self.msgid, listid)) + curs.execute("INSERT INTO list_threads (threadid, listid) VALUES (%(threadid)s, %(listid)s)", { + 'threadid': r[0][0], + 'listid': listid, + }) + opstatus.tagged += 1 + self.purge_list(listid, self.date.year, self.date.month) + self.purge_thread(r[0][0]) + else: + opstatus.dupes += 1 - if overwrite: - pk = r[0][2] - self.purge_thread(r[0][0]) - # Overwrite an existing message. We do not attempt to - # "re-thread" a message, we just update the contents. We - # do remove all attachments and rewrite them. Of course, we - # don't change the messageid (since it's our primary - # identifyer), and we don't update the raw text of the message. - # (since we are expected to have used that raw text to do - # the re-parsing initially) - # We update bodytext as a separate step so as not to rewrite - # the TOAST table unnecessarily... - curs.execute("UPDATE messages SET bodytxt=%(bodytxt)s WHERE id=%(id)s AND NOT (bodytxt=%(bodytxt)s) RETURNING id", { - 'id': pk, - 'bodytxt': self.bodytxt, - }) - rc = curs.rowcount - curs.execute("UPDATE messages SET _from=%(from)s, _to=%(to)s, cc=%(cc)s, subject=%(subject)s, date=%(date)s, has_attachment=%(has_attachment)s WHERE id=%(id)s AND NOT (_from=%(from)s AND _to=%(to)s AND cc=%(cc)s AND subject=%(subject)s AND date=%(date)s AND has_attachment=%(has_attachment)s) RETURNING id", { - 'id': pk, - 'from': self._from, - 'to': self.to or '', - 'cc': self.cc or '', - 'subject': self.subject or '', - 'date': self.date, - 'has_attachment': len(self.attachments) > 0, - }) - rc += curs.rowcount - if rc == 0: - log.status("Message %s unchanged" % self.msgid) - return False + if overwrite: + pk = r[0][2] + self.purge_thread(r[0][0]) + # Overwrite an existing message. We do not attempt to + # "re-thread" a message, we just update the contents. We + # do remove all attachments and rewrite them. Of course, we + # don't change the messageid (since it's our primary + # identifyer), and we don't update the raw text of the message. 
+ # (since we are expected to have used that raw text to do + # the re-parsing initially) + # We update bodytext as a separate step so as not to rewrite + # the TOAST table unnecessarily... + curs.execute("UPDATE messages SET bodytxt=%(bodytxt)s WHERE id=%(id)s AND NOT (bodytxt=%(bodytxt)s) RETURNING id", { + 'id': pk, + 'bodytxt': self.bodytxt, + }) + rc = curs.rowcount + curs.execute("UPDATE messages SET _from=%(from)s, _to=%(to)s, cc=%(cc)s, subject=%(subject)s, date=%(date)s, has_attachment=%(has_attachment)s WHERE id=%(id)s AND NOT (_from=%(from)s AND _to=%(to)s AND cc=%(cc)s AND subject=%(subject)s AND date=%(date)s AND has_attachment=%(has_attachment)s) RETURNING id", { + 'id': pk, + 'from': self._from, + 'to': self.to or '', + 'cc': self.cc or '', + 'subject': self.subject or '', + 'date': self.date, + 'has_attachment': len(self.attachments) > 0, + }) + rc += curs.rowcount + if rc == 0: + log.status("Message %s unchanged" % self.msgid) + return False - curs.execute("DELETE FROM attachments WHERE message=%(message)s", { - 'message': pk, - }) - if len(self.attachments): - curs.executemany("INSERT INTO attachments (message, filename, contenttype, attachment) VALUES (%(message)s, %(filename)s, %(contenttype)s, %(attachment)s)",[ { - 'message': pk, - 'filename': a[0] or 'unknown_filename', - 'contenttype': a[1], - 'attachment': bytearray(a[2]), - } for a in self.attachments]) - opstatus.overwritten += 1 - log.status("Message %s overwritten" % self.msgid) - else: - log.status("Message %s already stored" % self.msgid) - return True + curs.execute("DELETE FROM attachments WHERE message=%(message)s", { + 'message': pk, + }) + if len(self.attachments): + curs.executemany("INSERT INTO attachments (message, filename, contenttype, attachment) VALUES (%(message)s, %(filename)s, %(contenttype)s, %(attachment)s)",[ { + 'message': pk, + 'filename': a[0] or 'unknown_filename', + 'contenttype': a[1], + 'attachment': bytearray(a[2]), + } for a in self.attachments]) + opstatus.overwritten += 1 + log.status("Message %s overwritten" % self.msgid) + else: + log.status("Message %s already stored" % self.msgid) + return True - if overwrite: - raise Exception("Attempt to overwrite message (%s) that doesn't exist on list %s!" % (self.msgid, listid)) - # Always purge the primary list for this thread - self.purge_list(listid, self.date.year, self.date.month) + if overwrite: + raise Exception("Attempt to overwrite message (%s) that doesn't exist on list %s!" % (self.msgid, listid)) + # Always purge the primary list for this thread + self.purge_list(listid, self.date.year, self.date.month) - # Resolve own thread - curs.execute("SELECT id, messageid, threadid FROM messages WHERE messageid=ANY(%(parents)s)", { - 'parents': self.parents, - }) - all_parents = curs.fetchall() - if len(all_parents): - # At least one of the parents exist. Now try to figure out which one - best_parent = len(self.parents)+1 - best_threadid = -1 - best_parentid = None - for i in range(0,len(all_parents)): - for j in range(0,len(self.parents)): - if self.parents[j] == all_parents[i][1]: - # This messageid found. Better than the last one? 
- if j < best_parent: - best_parent = j - best_parentid = all_parents[i][0] - best_threadid = all_parents[i][2] - if best_threadid == -1: - raise Exception("Message %s, resolve failed in a way it shouldn't :P" % selg.msgid) - self.parentid = best_parentid - self.threadid = best_threadid - # Slice away all matches that are worse than the one we wanted - self.parents = self.parents[:best_parent] + # Resolve own thread + curs.execute("SELECT id, messageid, threadid FROM messages WHERE messageid=ANY(%(parents)s)", { + 'parents': self.parents, + }) + all_parents = curs.fetchall() + if len(all_parents): + # At least one of the parents exist. Now try to figure out which one + best_parent = len(self.parents)+1 + best_threadid = -1 + best_parentid = None + for i in range(0,len(all_parents)): + for j in range(0,len(self.parents)): + if self.parents[j] == all_parents[i][1]: + # This messageid found. Better than the last one? + if j < best_parent: + best_parent = j + best_parentid = all_parents[i][0] + best_threadid = all_parents[i][2] + if best_threadid == -1: + raise Exception("Message %s, resolve failed in a way it shouldn't :P" % selg.msgid) + self.parentid = best_parentid + self.threadid = best_threadid + # Slice away all matches that are worse than the one we wanted + self.parents = self.parents[:best_parent] - log.status("Message %s resolved to existing thread %s, waiting for %s better messages" % (self.msgid, self.threadid, len(self.parents))) - else: - # No parent exist. But don't create the threadid just yet, since - # it's possible that we're somebody elses parent! - self.parentid = None - self.threadid = None + log.status("Message %s resolved to existing thread %s, waiting for %s better messages" % (self.msgid, self.threadid, len(self.parents))) + else: + # No parent exist. But don't create the threadid just yet, since + # it's possible that we're somebody elses parent! + self.parentid = None + self.threadid = None - # Now see if we are somebody elses *parent*... - curs.execute("SELECT message, priority, threadid FROM unresolved_messages INNER JOIN messages ON messages.id=unresolved_messages.message WHERE unresolved_messages.msgid=%(msgid)s ORDER BY threadid", { - 'msgid': self.msgid, - }) - childrows = curs.fetchall() - if len(childrows): - # We are some already existing message's parent (meaning the - # messages arrived out of order) - # In the best case, the threadid is the same for all threads. - # But it might be different if this it the "glue message" that's - # holding other threads together. - if self.threadid: - # Already have a threadid, means that we have a glue message - print("Message %s resolved to existing thread %s, while being somebodys parent" % (self.msgid, self.threadid)) - else: - print("Message %s did not resolve to existing thread, but is somebodys parent" % self.msgid) - # In this case, just pick the first thread from the list and merge into that - # one. - self.threadid = childrows[0][2] + # Now see if we are somebody elses *parent*... + curs.execute("SELECT message, priority, threadid FROM unresolved_messages INNER JOIN messages ON messages.id=unresolved_messages.message WHERE unresolved_messages.msgid=%(msgid)s ORDER BY threadid", { + 'msgid': self.msgid, + }) + childrows = curs.fetchall() + if len(childrows): + # We are some already existing message's parent (meaning the + # messages arrived out of order) + # In the best case, the threadid is the same for all threads. 
+ # But it might be different if this it the "glue message" that's + # holding other threads together. + if self.threadid: + # Already have a threadid, means that we have a glue message + print("Message %s resolved to existing thread %s, while being somebodys parent" % (self.msgid, self.threadid)) + else: + print("Message %s did not resolve to existing thread, but is somebodys parent" % self.msgid) + # In this case, just pick the first thread from the list and merge into that + # one. + self.threadid = childrows[0][2] - # Get a unique list (set) of all threads *except* the primary one, - # because we'll be merging into that one. - mergethreads = set([r[2] for r in childrows]).difference(set((self.threadid,))) - if len(mergethreads): - # We have one or more merge threads - log.status("Merging threads %s into thread %s" % (",".join(str(s) for s in mergethreads), self.threadid)) - curs.execute("UPDATE messages SET threadid=%(threadid)s WHERE threadid=ANY(%(oldthreadids)s)", { - 'threadid': self.threadid, - 'oldthreadids': list(mergethreads), - }) - # Insert any lists that were tagged on the merged threads - curs.execute("INSERT INTO list_threads (threadid, listid) SELECT DISTINCT %(threadid)s,listid FROM list_threads lt2 WHERE lt2.threadid=ANY(%(oldthreadids)s) AND listid NOT IN (SELECT listid FROM list_threads lt3 WHERE lt3.threadid=%(threadid)s)", { - 'threadid': self.threadid, - 'oldthreadids': list(mergethreads), - }) - # Remove all old leftovers - curs.execute("DELETE FROM list_threads WHERE threadid=ANY(%(oldthreadids)s)", { - 'oldthreadids': list(mergethreads), - }) - # Purge varnish records for all the threads we just removed - for t in mergethreads: - self.purge_thread(t) + # Get a unique list (set) of all threads *except* the primary one, + # because we'll be merging into that one. + mergethreads = set([r[2] for r in childrows]).difference(set((self.threadid,))) + if len(mergethreads): + # We have one or more merge threads + log.status("Merging threads %s into thread %s" % (",".join(str(s) for s in mergethreads), self.threadid)) + curs.execute("UPDATE messages SET threadid=%(threadid)s WHERE threadid=ANY(%(oldthreadids)s)", { + 'threadid': self.threadid, + 'oldthreadids': list(mergethreads), + }) + # Insert any lists that were tagged on the merged threads + curs.execute("INSERT INTO list_threads (threadid, listid) SELECT DISTINCT %(threadid)s,listid FROM list_threads lt2 WHERE lt2.threadid=ANY(%(oldthreadids)s) AND listid NOT IN (SELECT listid FROM list_threads lt3 WHERE lt3.threadid=%(threadid)s)", { + 'threadid': self.threadid, + 'oldthreadids': list(mergethreads), + }) + # Remove all old leftovers + curs.execute("DELETE FROM list_threads WHERE threadid=ANY(%(oldthreadids)s)", { + 'oldthreadids': list(mergethreads), + }) + # Purge varnish records for all the threads we just removed + for t in mergethreads: + self.purge_thread(t) - # Batch all the children for repointing. We can't do the actual - # repointing until later, since we don't know our own id yet. - self.children = [r[0] for r in childrows] - log.status("Children set to %s with mergethreads being %s (from childrows %s and threadid %s)" % ( - self.children, mergethreads, childrows, self.threadid)) + # Batch all the children for repointing. We can't do the actual + # repointing until later, since we don't know our own id yet. 
- # Batch all the children for repointing. We can't do the actual
- # repointing until later, since we don't know our own id yet.
- self.children = [r[0] for r in childrows]
- log.status("Children set to %s with mergethreads being %s (from childrows %s and threadid %s)" % (
- self.children, mergethreads, childrows, self.threadid))
+ # Batch all the children for repointing. We can't do the actual
+ # repointing until later, since we don't know our own id yet.
+ self.children = [r[0] for r in childrows]
+ log.status("Children set to %s with mergethreads being %s (from childrows %s and threadid %s)" % (
+ self.children, mergethreads, childrows, self.threadid))

- # Finally, remove all the pending messages that had a higher
- # priority value (meaning less important) than us
- curs.executemany("DELETE FROM unresolved_messages WHERE message=%(msg)s AND priority >= %(prio)s", [{
- 'msg': msg,
- 'prio': prio,
- } for msg, prio, tid in childrows])
- else:
- self.children = []
+ # Finally, remove all the pending messages that had a higher
+ # priority value (meaning less important) than us
+ curs.executemany("DELETE FROM unresolved_messages WHERE message=%(msg)s AND priority >= %(prio)s", [{
+ 'msg': msg,
+ 'prio': prio,
+ } for msg, prio, tid in childrows])
+ else:
+ self.children = []

- if not self.threadid:
- # No parent and no child exists - create a new threadid, just for us!
- curs.execute("SELECT nextval('threadid_seq')")
- self.threadid = curs.fetchall()[0][0]
- log.status("Message %s resolved to no parent (out of %s) and no child, new thread %s" % (self.msgid, len(self.parents), self.threadid))
- else:
- # We have a threadid already, so we're not a new thread. Thus,
- # we need to purge the old thread
- self.purge_thread(self.threadid)
+ if not self.threadid:
+ # No parent and no child exists - create a new threadid, just for us!
+ curs.execute("SELECT nextval('threadid_seq')")
+ self.threadid = curs.fetchall()[0][0]
+ log.status("Message %s resolved to no parent (out of %s) and no child, new thread %s" % (self.msgid, len(self.parents), self.threadid))
+ else:
+ # We have a threadid already, so we're not a new thread. Thus,
+ # we need to purge the old thread
+ self.purge_thread(self.threadid)

- # Insert a thread tag if we're on a new list
- curs.execute("INSERT INTO list_threads (threadid, listid) SELECT %(threadid)s, %(listid)s WHERE NOT EXISTS (SELECT * FROM list_threads t2 WHERE t2.threadid=%(threadid)s AND t2.listid=%(listid)s) RETURNING threadid", {
- 'threadid': self.threadid,
- 'listid': listid,
- })
- if len(curs.fetchall()):
- log.status("Tagged thread %s with listid %s" % (self.threadid, listid))
+ # Insert a thread tag if we're on a new list
+ curs.execute("INSERT INTO list_threads (threadid, listid) SELECT %(threadid)s, %(listid)s WHERE NOT EXISTS (SELECT * FROM list_threads t2 WHERE t2.threadid=%(threadid)s AND t2.listid=%(listid)s) RETURNING threadid", {
+ 'threadid': self.threadid,
+ 'listid': listid,
+ })
+ if len(curs.fetchall()):
+ log.status("Tagged thread %s with listid %s" % (self.threadid, listid))
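
The thread-tagging statement above uses an INSERT ... SELECT ... WHERE NOT EXISTS ... RETURNING idiom: the row is only inserted when the (threadid, listid) pair is new, and the presence of a RETURNING row tells the caller whether tagging actually happened. A sketch of the same idiom, assuming a psycopg2-style cursor and hypothetical ids:

    curs.execute(
        "INSERT INTO list_threads (threadid, listid) "
        "SELECT %(threadid)s, %(listid)s "
        "WHERE NOT EXISTS (SELECT 1 FROM list_threads t2 "
        "    WHERE t2.threadid=%(threadid)s AND t2.listid=%(listid)s) "
        "RETURNING threadid",
        {'threadid': 3, 'listid': 42})
    newly_tagged = bool(curs.fetchall())       # True only the first time

Worth noting: without a unique constraint on (threadid, listid), this pattern is not race-free under concurrent writers; it relies on a single loader doing the inserts.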
- curs.execute("INSERT INTO messages (parentid, threadid, _from, _to, cc, subject, date, has_attachment, messageid, bodytxt, rawtxt) VALUES (%(parentid)s, %(threadid)s, %(from)s, %(to)s, %(cc)s, %(subject)s, %(date)s, %(has_attachment)s, %(messageid)s, %(bodytxt)s, %(rawtxt)s) RETURNING id", {
- 'parentid': self.parentid,
- 'threadid': self.threadid,
- 'from': self._from,
- 'to': self.to or '',
- 'cc': self.cc or '',
- 'subject': self.subject or '',
- 'date': self.date,
- 'has_attachment': len(self.attachments) > 0,
- 'messageid': self.msgid,
- 'bodytxt': self.bodytxt,
- 'rawtxt': bytearray(self.rawtxt),
- })
- id = curs.fetchall()[0][0]
- log.status("Message %s, got id %s, set thread %s, parent %s" % (
- self.msgid, id, self.threadid, self.parentid))
- if len(self.attachments):
- # Insert attachments
- curs.executemany("INSERT INTO attachments (message, filename, contenttype, attachment) VALUES (%(message)s, %(filename)s, %(contenttype)s, %(attachment)s)",[ {
- 'message': id,
- 'filename': a[0] or 'unknown_filename',
- 'contenttype': a[1],
- 'attachment': bytearray(a[2]),
- } for a in self.attachments])
+ curs.execute("INSERT INTO messages (parentid, threadid, _from, _to, cc, subject, date, has_attachment, messageid, bodytxt, rawtxt) VALUES (%(parentid)s, %(threadid)s, %(from)s, %(to)s, %(cc)s, %(subject)s, %(date)s, %(has_attachment)s, %(messageid)s, %(bodytxt)s, %(rawtxt)s) RETURNING id", {
+ 'parentid': self.parentid,
+ 'threadid': self.threadid,
+ 'from': self._from,
+ 'to': self.to or '',
+ 'cc': self.cc or '',
+ 'subject': self.subject or '',
+ 'date': self.date,
+ 'has_attachment': len(self.attachments) > 0,
+ 'messageid': self.msgid,
+ 'bodytxt': self.bodytxt,
+ 'rawtxt': bytearray(self.rawtxt),
+ })
+ id = curs.fetchall()[0][0]
+ log.status("Message %s, got id %s, set thread %s, parent %s" % (
+ self.msgid, id, self.threadid, self.parentid))
+ if len(self.attachments):
+ # Insert attachments
+ curs.executemany("INSERT INTO attachments (message, filename, contenttype, attachment) VALUES (%(message)s, %(filename)s, %(contenttype)s, %(attachment)s)",[ {
+ 'message': id,
+ 'filename': a[0] or 'unknown_filename',
+ 'contenttype': a[1],
+ 'attachment': bytearray(a[2]),
+ } for a in self.attachments])

- if len(self.children):
- log.status("Setting %s other messages to children of %s" % (len(self.children), self.msgid))
- curs.executemany("UPDATE messages SET parentid=%(parent)s WHERE id=%(id)s",
- [{'parent': id, 'id': c} for c in self.children])
- if len(self.parents):
- # There are remaining parents we'd rather have, to get ourselves
- # properly threaded - so store them in the db.
- curs.executemany("INSERT INTO unresolved_messages (message, priority, msgid) VALUES (%(id)s, %(priority)s, %(msgid)s)",
- [{'id': id, 'priority': i, 'msgid': self.parents[i]} for i in range(0, len(self.parents))])
+ if len(self.children):
+ log.status("Setting %s other messages to children of %s" % (len(self.children), self.msgid))
+ curs.executemany("UPDATE messages SET parentid=%(parent)s WHERE id=%(id)s",
+ [{'parent': id, 'id': c} for c in self.children])
+ if len(self.parents):
+ # There are remaining parents we'd rather have, to get ourselves
+ # properly threaded - so store them in the db.
+ curs.executemany("INSERT INTO unresolved_messages (message, priority, msgid) VALUES (%(id)s, %(priority)s, %(msgid)s)",
+ [{'id': id, 'priority': i, 'msgid': self.parents[i]} for i in range(0, len(self.parents))])
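
The priority column stored here is simply the parent's index in self.parents, so a lower value means a more-preferred parent; the DELETE ... priority >= ... earlier in this function is what lets a better-matching message displace these rows later. A small sketch with hypothetical values:

    # Hypothetical: message id 55 is still waiting for two possible parents.
    parents = ['<best@example.org>', '<fallback@example.org>']
    id = 55
    rows = [{'id': id, 'priority': i, 'msgid': parents[i]}
            for i in range(0, len(parents))]
    assert rows[0]['priority'] == 0            # most-preferred parent
    assert rows[1]['msgid'] == '<fallback@example.org>'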
- opstatus.stored += 1
- return True
+ opstatus.stored += 1
+ return True

- def diff(self, conn, f, fromonlyf, oldid):
- curs = conn.cursor()
+ def diff(self, conn, f, fromonlyf, oldid):
+ curs = conn.cursor()

- # Fetch the old one so we have something to diff against
- curs.execute("SELECT id, _from, _to, cc, subject, date, has_attachment, bodytxt FROM messages WHERE messageid=%(msgid)s", {
- 'msgid': self.msgid,
- })
- try:
- id, _from, to, cc, subject, date, has_attachment, bodytxt = curs.fetchone()
- except TypeError as e:
- f.write("---- %s ----\n" % self.msgid)
- f.write("Could not re-find in archives (old id was %s): %s\n" % (oldid, e))
- f.write("\n-------------------------------\n\n")
- return
+ # Fetch the old one so we have something to diff against
+ curs.execute("SELECT id, _from, _to, cc, subject, date, has_attachment, bodytxt FROM messages WHERE messageid=%(msgid)s", {
+ 'msgid': self.msgid,
+ })
+ try:
+ id, _from, to, cc, subject, date, has_attachment, bodytxt = curs.fetchone()
+ except TypeError as e:
+ f.write("---- %s ----\n" % self.msgid)
+ f.write("Could not re-find in archives (old id was %s): %s\n" % (oldid, e))
+ f.write("\n-------------------------------\n\n")
+ return

- if (_from.rstrip(), to.rstrip(), cc.rstrip(), subject.rstrip()) != (self._from, self.to, self.cc, self.subject):
- log.status("Message %s has header changes " % self.msgid)
- f.write("==== %s ====\n" % self.msgid)
- for fn in ['_from', 'to', 'cc', 'subject']:
- if getattr(self, fn) != eval(fn):
- s = "- {0}: {1}\n".format(fn, eval(fn))
- d = "+ {0}: {1}\n".format(fn, getattr(self, fn))
- f.write(s)
- f.write(d)
- f.write("\n\n")
+ if (_from.rstrip(), to.rstrip(), cc.rstrip(), subject.rstrip()) != (self._from, self.to, self.cc, self.subject):
+ log.status("Message %s has header changes " % self.msgid)
+ f.write("==== %s ====\n" % self.msgid)
+ for fn in ['_from', 'to', 'cc', 'subject']:
+ if getattr(self, fn) != eval(fn):
+ s = "- {0}: {1}\n".format(fn, eval(fn))
+ d = "+ {0}: {1}\n".format(fn, getattr(self, fn))
+ f.write(s)
+ f.write(d)
+ f.write("\n\n")

- if bodytxt != self.bodytxt:
- log.status("Message %s has body changes " % self.msgid)
- tempdiff = list(difflib.unified_diff(bodytxt.splitlines(),
- self.bodytxt.splitlines(),
- fromfile='old',
- tofile='new',
- n=0,
- lineterm=''))
- if (len(tempdiff)-2) % 3 == 0:
- # 3 rows to a diff, two header rows.
- # Then verify that each slice of 3 contains one @@ row (header), one -From and one +>From,
- # which indicates the only change is in the From.
- ok = True
- tempdiff = tempdiff[2:]
- while tempdiff:
- a,b,c = (tempdiff.pop(0), tempdiff.pop(0), tempdiff.pop(0))
- if not (a.startswith('@@ ') and b.startswith('-From ') and c.startswith('+>From ')):
- ok=False
- break
- if ok:
- fromonlyf.write("%s\n" % self.msgid)
- return
+ if bodytxt != self.bodytxt:
+ log.status("Message %s has body changes " % self.msgid)
+ tempdiff = list(difflib.unified_diff(bodytxt.splitlines(),
+ self.bodytxt.splitlines(),
+ fromfile='old',
+ tofile='new',
+ n=0,
+ lineterm=''))
+ if (len(tempdiff)-2) % 3 == 0:
+ # 3 rows to a diff, two header rows.
+ # Then verify that each slice of 3 contains one @@ row (header), one -From and one +>From,
+ # which indicates the only change is in the From.
+ ok = True
+ tempdiff = tempdiff[2:]
+ while tempdiff:
+ a,b,c = (tempdiff.pop(0), tempdiff.pop(0), tempdiff.pop(0))
+ if not (a.startswith('@@ ') and b.startswith('-From ') and c.startswith('+>From ')):
+ ok=False
+ break
+ if ok:
+ fromonlyf.write("%s\n" % self.msgid)
+ return
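
The (len(tempdiff)-2) % 3 check above works because a unified diff with n=0 context produces two header rows followed by groups of three rows whenever each change is a single replaced line. A self-contained illustration of the From-escaping case the code is looking for, using a hypothetical message body:

    import difflib

    old = ["From sender@example.org Mon Jan  1 00:00:00 2001", "Hello"]
    new = [">From sender@example.org Mon Jan  1 00:00:00 2001", "Hello"]
    tempdiff = list(difflib.unified_diff(old, new, fromfile='old', tofile='new',
                                         n=0, lineterm=''))
    # ['--- old', '+++ new', '@@ -1 +1 @@', '-From ...', '+>From ...']
    assert (len(tempdiff) - 2) % 3 == 0
    a, b, c = tempdiff[2], tempdiff[3], tempdiff[4]
    assert a.startswith('@@ ') and b.startswith('-From ') and c.startswith('+>From ')

That is exactly the change mbox ">From " escaping introduces, which is why such messages are recorded in fromonlyf rather than in the real diff output.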
- # Generate a nicer diff
- d = list(difflib.unified_diff(bodytxt.splitlines(),
- self.bodytxt.splitlines(),
- fromfile='old',
- tofile='new',
- n=0,
- lineterm=''))
- if len(d) > 0:
- f.write("---- %s ----\n" % self.msgid)
- f.write("\n".join(d))
- f.write("\n\n")
- else:
- log.status("Message %s unchanged." % self.msgid)
+ # Generate a nicer diff
+ d = list(difflib.unified_diff(bodytxt.splitlines(),
+ self.bodytxt.splitlines(),
+ fromfile='old',
+ tofile='new',
+ n=0,
+ lineterm=''))
+ if len(d) > 0:
+ f.write("---- %s ----\n" % self.msgid)
+ f.write("\n".join(d))
+ f.write("\n\n")
+ else:
+ log.status("Message %s unchanged." % self.msgid)
diff --git a/loader/lib/varnish.py b/loader/lib/varnish.py
index f2a06c3..99d2d50 100644
--- a/loader/lib/varnish.py
+++ b/loader/lib/varnish.py
@@ -3,31 +3,31 @@ import requests
from lib.log import log
class VarnishPurger(object):
- def __init__(self, cfg):
- self.cfg = cfg
+ def __init__(self, cfg):
+ self.cfg = cfg

- def purge(self, purges):
- if not len(purges):
- return
+ def purge(self, purges):
+ if not len(purges):
+ return

- if not self.cfg.has_option('varnish', 'purgeurl'):
- return
+ if not self.cfg.has_option('varnish', 'purgeurl'):
+ return

- purgeurl = self.cfg.get('varnish', 'purgeurl')
- exprlist = []
- for p in purges:
- if isinstance(p, tuple):
- # Purging a list
- exprlist.append('obj.http.x-pglm ~ :%s/%s/%s:' % p)
- else:
- # Purging an individual thread
- exprlist.append('obj.http.x-pgthread ~ :%s:' % p)
- purgedict = dict(list(zip(['p%s' % n for n in range(0, len(exprlist))], exprlist)))
- purgedict['n'] = len(exprlist)
- r = requests.post(purgeurl, data=purgedict, headers={
- 'Content-type': 'application/x-www-form-urlencoded',
- 'Host': 'www.postgresql.org',
- })
- if r.status_code != 200:
- log.error("Failed to send purge request!")
+ purgeurl = self.cfg.get('varnish', 'purgeurl')
+ exprlist = []
+ for p in purges:
+ if isinstance(p, tuple):
+ # Purging a list
+ exprlist.append('obj.http.x-pglm ~ :%s/%s/%s:' % p)
+ else:
+ # Purging an individual thread
+ exprlist.append('obj.http.x-pgthread ~ :%s:' % p)
+ purgedict = dict(list(zip(['p%s' % n for n in range(0, len(exprlist))], exprlist)))
+ purgedict['n'] = len(exprlist)
+ r = requests.post(purgeurl, data=purgedict, headers={
+ 'Content-type': 'application/x-www-form-urlencoded',
+ 'Host': 'www.postgresql.org',
+ })
+ if r.status_code != 200:
+ log.error("Failed to send purge request!")
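
For reference, a hypothetical usage sketch of the purger: tuples (which appear to be (listid, year, month)) purge a whole list page via the x-pglm header, while bare values purge a single thread via x-pgthread. Here cfg is assumed to be a configparser-style object with purgeurl set in its [varnish] section:

    purger = VarnishPurger(cfg)
    purger.purge([
        (12, 2019, 1),  # list 12, Jan 2019 -> 'obj.http.x-pglm ~ :12/2019/1:'
        4711,           # one thread        -> 'obj.http.x-pgthread ~ :4711:'
    ])

Both expressions are posted as p0, p1, ... form fields plus a count n, which is what the ban handler on the Varnish side is expected to read back.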