author     Magnus Hagander    2019-01-03 20:15:38 +0000
committer  Magnus Hagander    2019-01-04 11:24:06 +0000
commit     3fb227230c145c828888aa2e7c5d8b9a8c0760a0 (patch)
tree       2eeba42da158e6e00c06d537dc6da3a895e39b6c /loader/lib
parent     1e173c362aa105ab4397fb77f8c693a1e01efa11 (diff)
Tabs to 4 spaces

PEP 8 standard for indentation.
Diffstat (limited to 'loader/lib')
-rw-r--r--  loader/lib/exception.py      2
-rw-r--r--  loader/lib/log.py           44
-rw-r--r--  loader/lib/mbox.py          68
-rw-r--r--  loader/lib/parser.py      1102
-rw-r--r--  loader/lib/storage.py      556
-rw-r--r--  loader/lib/varnish.py       48
6 files changed, 910 insertions, 910 deletions
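
The change itself is mechanical: every level of tab indentation becomes four spaces, per PEP 8. As a rough sketch of how such a conversion can be scripted (an illustration only; the commit does not record what tool was actually used):

    import sys

    def retab(line, width=4):
        # Turn each leading tab into a four-space indent and leave the
        # rest of the line untouched.
        stripped = line.lstrip('\t')
        return ' ' * (width * (len(line) - len(stripped))) + stripped

    for fn in sys.argv[1:]:
        with open(fn) as f:
            lines = [retab(l) for l in f]
        with open(fn, 'w') as f:
            f.writelines(lines)
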
diff --git a/loader/lib/exception.py b/loader/lib/exception.py
index 02172a2..54a9efc 100644
--- a/loader/lib/exception.py
+++ b/loader/lib/exception.py
@@ -1,2 +1,2 @@
class IgnorableException(Exception):
- pass
+ pass
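
IgnorableException carries no behavior of its own; it marks mail that is malformed but safe to skip, so callers can log it and continue instead of aborting the whole run. A minimal sketch of a hypothetical call site:

    from lib.exception import IgnorableException

    def load_one(parser, stream):
        try:
            parser.parse(stream)
            parser.analyze()
        except IgnorableException as e:
            # Broken message: note it and move on to the next one.
            print("Skipping message: %s" % e)
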
diff --git a/loader/lib/log.py b/loader/lib/log.py
index 5b6379a..48722c9 100644
--- a/loader/lib/log.py
+++ b/loader/lib/log.py
@@ -1,33 +1,33 @@
class Log(object):
- def __init__(self):
- self.verbose = False
+ def __init__(self):
+ self.verbose = False
- def set(self, verbose):
- self.verbose = verbose
+ def set(self, verbose):
+ self.verbose = verbose
- def status(self, msg):
- if self.verbose:
- print(msg)
+ def status(self, msg):
+ if self.verbose:
+ print(msg)
- def log(self, msg):
- print(msg)
+ def log(self, msg):
+ print(msg)
- def error(self, msg):
- print(msg)
+ def error(self, msg):
+ print(msg)
- def print_status(self):
- opstatus.print_status()
+ def print_status(self):
+ opstatus.print_status()
class OpStatus(object):
- def __init__(self):
- self.stored = 0
- self.dupes = 0
- self.tagged = 0
- self.failed = 0
- self.overwritten = 0
-
- def print_status(self):
- print("%s stored, %s new-list tagged, %s dupes, %s failed, %s overwritten" % (self.stored, self.tagged, self.dupes, self.failed, self.overwritten))
+ def __init__(self):
+ self.stored = 0
+ self.dupes = 0
+ self.tagged = 0
+ self.failed = 0
+ self.overwritten = 0
+
+ def print_status(self):
+ print("%s stored, %s new-list tagged, %s dupes, %s failed, %s overwritten" % (self.stored, self.tagged, self.dupes, self.failed, self.overwritten))
log = Log()
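
Both classes are used through module-level singletons: log, created above, and an opstatus instance that storage.py imports alongside it. A short usage sketch, assuming that layout:

    from lib.log import log, opstatus

    log.set(True)                # enable verbose output
    log.status("loading mbox")   # printed only in verbose mode
    opstatus.stored += 1
    opstatus.print_status()      # "1 stored, 0 new-list tagged, 0 dupes, ..."
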
diff --git a/loader/lib/mbox.py b/loader/lib/mbox.py
index 77c83b0..c097e72 100644
--- a/loader/lib/mbox.py
+++ b/loader/lib/mbox.py
@@ -11,41 +11,41 @@ SEPARATOR = "ABCARCHBREAK123" * 50
bSEPARATOR = bytes(SEPARATOR, 'ascii')
class MailboxBreakupParser(object):
- def __init__(self, fn):
- self.EOF = False
+ def __init__(self, fn):
+ self.EOF = False
- if fn.endswith(".gz"):
- cat = "zcat"
- else:
- cat = "cat"
- cmd = "%s %s | formail -s /bin/sh -c 'cat && echo %s'" % (cat, fn, SEPARATOR)
- self.pipe = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
+ if fn.endswith(".gz"):
+ cat = "zcat"
+ else:
+ cat = "cat"
+ cmd = "%s %s | formail -s /bin/sh -c 'cat && echo %s'" % (cat, fn, SEPARATOR)
+ self.pipe = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
- def returncode(self):
- self.pipe.wait()
- return self.pipe.returncode
+ def returncode(self):
+ self.pipe.wait()
+ return self.pipe.returncode
- def stderr_output(self):
- return self.pipe.stderr.read()
+ def stderr_output(self):
+ return self.pipe.stderr.read()
- def __next__(self):
- sio = BytesIO()
- while True:
- try:
- l = next(self.pipe.stdout)
- except StopIteration:
- # End of file!
- self.EOF = True
- if sio.tell() == 0:
- # Nothing read yet, so return None instead of an empty
- # bytesio
- return None
- sio.seek(0)
- return sio
- if l.rstrip() == bSEPARATOR:
- # Reached a separator. Meaning we're not at end of file,
- # but we're at end of message.
- sio.seek(0)
- return sio
- # Otherwise, append it to where we are now
- sio.write(l)
+ def __next__(self):
+ sio = BytesIO()
+ while True:
+ try:
+ l = next(self.pipe.stdout)
+ except StopIteration:
+ # End of file!
+ self.EOF = True
+ if sio.tell() == 0:
+ # Nothing read yet, so return None instead of an empty
+ # bytesio
+ return None
+ sio.seek(0)
+ return sio
+ if l.rstrip() == bSEPARATOR:
+ # Reached a separator. Meaning we're not at end of file,
+ # but we're at end of message.
+ sio.seek(0)
+ return sio
+ # Otherwise, append it to where we are now
+ sio.write(l)
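
The splitting itself is delegated to formail, which emits each message followed by the synthetic separator line; __next__ then accumulates stdout into a BytesIO until it sees that separator, yielding one raw message at a time. A usage sketch (handle_message is a hypothetical per-message callback, and the file name is an assumption):

    from lib.mbox import MailboxBreakupParser

    mbp = MailboxBreakupParser("archive.mbox.gz")
    while not mbp.EOF:
        sio = next(mbp)         # BytesIO with one raw message, or None at EOF
        if sio is None:
            break
        handle_message(sio)     # hypothetical: parse and store the message
    if mbp.returncode() != 0:
        print(mbp.stderr_output())
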
diff --git a/loader/lib/parser.py b/loader/lib/parser.py
index b97c8b3..81192da 100644
--- a/loader/lib/parser.py
+++ b/loader/lib/parser.py
@@ -14,560 +14,560 @@ from lib.exception import IgnorableException
from lib.log import log
class ArchivesParser(object):
- def __init__(self):
- self.parser = BytesParser(policy=compat32)
-
- def parse(self, stream):
- self.rawtxt = stream.read()
- self.msg = self.parser.parse(io.BytesIO(self.rawtxt))
-
- def is_msgid(self, msgid):
- # Look for a specific messageid. This means we might parse it twice,
- # but so be it. Any exception means we know it's not this one...
- try:
- if self.clean_messageid(self.decode_mime_header(self.get_mandatory('Message-ID'))) == msgid:
- return True
- except Exception as e:
- return False
-
- def analyze(self, date_override=None):
- self.msgid = self.clean_messageid(self.decode_mime_header(self.get_mandatory('Message-ID')))
- self._from = self.decode_mime_header(self.get_mandatory('From'), True)
- self.to = self.decode_mime_header(self.get_optional('To'), True)
- self.cc = self.decode_mime_header(self.get_optional('CC'), True)
- self.subject = self.decode_mime_header(self.get_optional('Subject'))
- if date_override:
- self.date = self.forgiving_date_decode(date_override)
- else:
- self.date = self.forgiving_date_decode(self.decode_mime_header(self.get_mandatory('Date')))
- self.bodytxt = self.get_body()
- self.attachments = []
- self.get_attachments()
- if len(self.attachments) > 0:
- log.status("Found %s attachments" % len(self.attachments))
-
- # Build an list of the message id's we are interested in
- self.parents = []
- # The first one is in-reply-to, if it exists
- if self.get_optional('in-reply-to'):
- m = self.clean_messageid(self.decode_mime_header(self.get_optional('in-reply-to')), True)
- if m:
- self.parents.append(m)
-
- # Then we add all References values, in backwards order
- if self.get_optional('references'):
- cleaned_msgids = [self.clean_messageid(x, True) for x in reversed(self.decode_mime_header(self.get_optional('references')).split())]
- # Can't do this with a simple self.parents.extend() due to broken
- # mailers that add the same reference more than once. And we can't
- # use a set() to make it unique, because order is very important
- for m in cleaned_msgids:
- if m and not m in self.parents:
- self.parents.append(m)
-
-
- def clean_charset(self, charset):
- lcharset = charset.lower()
- if lcharset == 'unknown-8bit' or lcharset == 'x-unknown' or lcharset == 'unknown':
- # Special case where we don't know... We'll assume
- # us-ascii and use replacements
- return 'us-ascii'
- if lcharset == '0' or lcharset == 'x-user-defined' or lcharset == '_autodetect_all' or lcharset == 'default_charset':
- # Seriously broken charset definitions, map to us-ascii
- # and throw away the rest with replacements
- return 'us-ascii'
- if lcharset == 'x-gbk':
- # Some MUAs set it to x-gbk, but there is a valid
- # declaratoin as gbk...
- return 'gbk'
- if lcharset == 'iso-8859-8-i':
- # -I is a special logical version, but should be the
- # same charset
- return 'iso-8859-8'
- if lcharset == 'windows-874':
- # This is an alias for iso-8859-11
- return 'iso-8859-11'
- if lcharset == 'iso-88-59-1' or lcharset == 'iso-8858-1':
- # Strange way of saying 8859....
- return 'iso-8859-1'
- if lcharset == 'iso885915':
- return 'iso-8859-15'
- if lcharset == 'iso-latin-2':
- return 'iso-8859-2'
- if lcharset == 'iso-850':
- # Strange spelling of cp850 (windows charset)
- return 'cp850'
- if lcharset == 'koi8r':
- return 'koi8-r'
- if lcharset == 'cp 1252':
- return 'cp1252'
- if lcharset == 'iso-8859-1,iso-8859-2' or lcharset == 'iso-8859-1:utf8:us-ascii':
- # Why did this show up more than once?!
- return 'iso-8859-1'
- if lcharset == 'x-windows-949':
- return 'ms949'
- if lcharset == 'pt_pt' or lcharset == 'de_latin' or lcharset == 'de':
- # This is a locale, and not a charset, but most likely it's this one
- return 'iso-8859-1'
- if lcharset == 'iso-8858-15':
- # How is this a *common* mistake?
- return 'iso-8859-15'
- if lcharset == 'macintosh':
- return 'mac_roman'
- if lcharset == 'cn-big5':
- return 'big5'
- if lcharset == 'x-unicode-2-0-utf-7':
- return 'utf-7'
- if lcharset == 'tscii':
- # No support for this charset :S Map it down to ascii
- # and throw away all the rest. sucks, but we have to
- return 'us-ascii'
- return charset
-
- def get_payload_as_unicode(self, msg):
- try:
- b = msg.get_payload(decode=True)
- except AssertionError:
- # Badly encoded data can throw an exception here, where the python
- # libraries fail to handle it and enters a cannot-happen path.
- # In which case we just ignore it and hope for a better MIME part later.
- b = None
-
- if b:
- # Find out if there is a charset
- charset = None
- params = msg.get_params()
- if not params:
- # No content-type, so we assume us-ascii
- return str(b, 'us-ascii', errors='ignore')
- for k,v in params:
- if k.lower() == 'charset':
- charset = v
- break
- if charset:
- try:
- return str(b, self.clean_charset(charset), errors='ignore')
- except LookupError as e:
- raise IgnorableException("Failed to get unicode payload: %s" % e)
- else:
- # XXX: reasonable default?
- return str(b, errors='ignore')
- # Return None or empty string, depending on what we got back
- return b
-
- # Regular expression matching the PostgreSQL custom mail footer that
- # is appended to all emails.
- _re_footer = re.compile('(.*)--\s+\nSent via [^\s]+ mailing list \([^\)]+\)\nTo make changes to your subscription:\nhttp://www\.postgresql\.org/mailpref/[^\s]+\s*$', re.DOTALL)
- def get_body(self):
- b = self._get_body()
- if b:
- # Python bug 9133, allows unicode surrogate pairs - which PostgreSQL will
- # later reject..
- if b.find('\udbff\n\udef8'):
- b = b.replace('\udbff\n\udef8', '')
-
- # Remove postgres specific mail footer - if it's there
- m = self._re_footer.match(b)
- if m:
- b = m.group(1)
-
- # Sometimes we end up with a trailing \0 when decoding long strings, so
- # replace it if it's there.
- # In fact, replace it everywhere, since it can also turn up in the middle
- # of a text when it's a really broken decoding.
- b = b.replace('\0', '')
-
- return b
-
- def _get_body(self):
- # This is where the magic happens - try to figure out what the body
- # of this message should render as.
- hasempty = False
-
- # First see if this is a single-part message that we can just
- # decode and go.
- b = self.get_payload_as_unicode(self.msg)
- if b: return b
- if b == '':
- # We found something, but it was empty. We'll keep looking as
- # there might be something better available, but make a note
- # that empty exists.
- hasempty = True
-
- # Ok, it's multipart. Find the first part that is text/plain,
- # and use that one. Do this recursively, since we may have something
- # like:
- # multipart/mixed:
- # multipart/alternative:
- # text/plain
- # text/html
- # application/octet-stream (attachment)
- b = self.recursive_first_plaintext(self.msg)
- if b: return b
- if b == '':
- hasempty = True
-
- # Couldn't find a plaintext. Look for the first HTML in that case.
- # Fallback, but what can we do at this point...
- b = self.recursive_first_plaintext(self.msg, True)
- if b:
- b = self.html_clean(b)
- if b: return b
- if b == '' or b is None:
- hasempty = True
-
- if hasempty:
- log.status('Found empty body in %s' % self.msgid)
- return ''
- raise IgnorableException("Don't know how to read the body from %s" % self.msgid)
-
- def recursive_first_plaintext(self, container, html_instead=False):
- pl = container.get_payload()
- if isinstance(pl, str):
- # This was not a multipart, but it leaked... Give up!
- return None
- for p in pl:
- if p.get_params() == None:
- # MIME multipart/mixed, but no MIME type on the part
- log.status("Found multipart/mixed in message '%s', but no MIME type on part. Trying text/plain." % self.msgid)
- return self.get_payload_as_unicode(p)
- if p.get_params()[0][0].lower() == 'text/plain':
- # Don't include it if it looks like an attachment
- if 'Content-Disposition' in p and p['Content-Disposition'].startswith('attachment'):
- continue
- return self.get_payload_as_unicode(p)
- if html_instead and p.get_params()[0][0].lower() == 'text/html':
- # Don't include it if it looks like an attachment
- if 'Content-Disposition' in p and p['Content-Disposition'].startswith('attachment'):
- continue
- return self.get_payload_as_unicode(p)
- if p.is_multipart():
- b = self.recursive_first_plaintext(p, html_instead)
- if b or b == '': return b
-
- # Yikes, nothing here! Hopefully we'll find something when
- # we continue looping at a higher level.
- return None
-
- def get_attachments(self):
- self.attachments_found_first_plaintext = False
- self.recursive_get_attachments(self.msg)
-
- # Clean a filenames encoding and return it as a unicode string
- def _clean_filename_encoding(self, filename):
- # If this is a header-encoded filename, start by decoding that
- if filename.startswith('=?'):
- decoded, encoding = decode_header(filename)[0]
- return str(decoded, encoding, errors='ignore')
-
- # If it's already unicode, just return it
- if isinstance(filename, str):
- return filename
-
- # Anything that's not UTF8, we just get rid of. We can live with
- # filenames slightly mangled in this case.
- return str(filename, 'utf-8', errors='ignore')
-
- def _extract_filename(self, container):
- # Try to get the filename for an attachment in the container.
- # If the standard library can figure one out, use that one.
- f = container.get_filename()
- if f: return self._clean_filename_encoding(f)
-
- # Failing that, some mailers set Content-Description to the
- # filename
- if 'Content-Description' in container:
- return self._clean_filename_encoding(container['Content-Description'])
- return None
-
- def recursive_get_attachments(self, container):
- # We start recursion in the "multipart" container if any
- if container.get_content_type() == 'multipart/mixed' or container.get_content_type() == 'multipart/signed':
- # Multipart - worth scanning into
- if not container.is_multipart():
- # Wow, this is broken. It's multipart/mixed, but doesn't
- # contain multiple parts.
- # Since we're just looking for attachments, let's just
- # ignore it...
- return
- for p in container.get_payload():
- if p.get_params() == None:
- continue
- self.recursive_get_attachments(p)
- elif container.get_content_type() == 'multipart/alternative':
- # Alternative is not an attachment (we decide)
- # It's typilcally plantext + html
- self.attachments_found_first_plaintext = True
- return
- elif container.is_multipart():
- # Other kinds of multipart, such as multipart/signed...
- return
- else:
- # Not a multipart.
- # Exclude specific contenttypes
- if container.get_content_type() == 'application/pgp-signature':
- return
- if container.get_content_type() in ('application/pkcs7-signature', 'application/x-pkcs7-signature'):
- return
- # For now, accept anything not text/plain
- if container.get_content_type() != 'text/plain':
- try:
- self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True)))
- except AssertionError:
- # Badly encoded data can throw an exception here, where the python
- # libraries fail to handle it and enters a cannot-happen path.
- # In which case we just ignore this attachment.
- return
- return
-
- # It's a text/plain, it might be worthwhile.
- # If it has a name, we consider it an attachments
- if not container.get_params():
- return
- for k,v in container.get_params():
- if k=='name' and v != '':
- # Yes, it has a name
- try:
- self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True)))
- except AssertionError:
- # Badly encoded data can throw an exception here, where the python
- # libraries fail to handle it and enters a cannot-happen path.
- # In which case we just ignore this attachment.
- return
-
- return
-
- # If it's content-disposition=attachment, we also want to save it
- if 'Content-Disposition' in container and container['Content-Disposition'].startswith('attachment'):
- try:
- self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True)))
- except AssertionError:
- # Badly encoded data can throw an exception here, where the python
- # libraries fail to handle it and enters a cannot-happen path.
- # In which case we just ignore this attachment.
- return
-
- return
-
- # If we have already found one text/plain part, make all
- # further text/plain parts attachments
- if self.attachments_found_first_plaintext:
- # However, this will also *always* catch the MIME part added
- # by majordomo with the footer. So if that one is present,
- # we need to explicitly exclude it again.
- try:
- b = container.get_payload(decode=True)
- except AssertionError:
- # Badly encoded data can throw an exception here, where the python
- # libraries fail to handle it and enters a cannot-happen path.
- # In which case we just ignore this attachment.
- return
-
- if isinstance(b, str) and not self._re_footer.match(b):
- # We know there is no name for this one
- self.attachments.append((None, container.get_content_type(), b))
- return
-
- # Ok, so this was a plaintext that we ignored. Set the flag
- # that we have now ignored one, so we'll make the next one
- # an attachment.
- self.attachments_found_first_plaintext = True
- # No name, and text/plain, so ignore it
-
- re_msgid = re.compile('^\s*<(.*)>\s*')
- def clean_messageid(self, messageid, ignorebroken=False):
- m = self.re_msgid.match(messageid)
- if not m:
- if ignorebroken:
- log.status("Could not parse messageid '%s', ignoring it" % messageid)
- return None
- raise IgnorableException("Could not parse message id '%s'" % messageid)
- return m.groups(1)[0].replace(' ','')
-
-# _date_multi_re = re.compile(' \((\w+\s\w+(\s+\w+)*|)\)$')
- # Now using [^\s] instead of \w, to work with japanese chars
- _date_multi_re = re.compile(' \(([^\s]+\s[^\s]+(\s+[^\s]+)*|)\)$')
- _date_multi_re2 = re.compile(' ([\+-]\d{4}) \([^)]+\)$')
- _date_multiminus_re = re.compile(' -(-\d+)$')
- _date_offsetnoplus_re = re.compile(' (\d{4})$')
- def forgiving_date_decode(self, d):
- if d.strip() == '':
- raise IgnorableException("Failed to parse empty date")
- # Strange timezones requiring manual adjustments
- if d.endswith('-7700 (EST)'):
- d = d.replace('-7700 (EST)', 'EST')
- if d.endswith('+6700 (EST)'):
- d = d.replace('+6700 (EST)', 'EST')
- if d.endswith('+-4-30'):
- d = d.replace('+-4-30', '+0430')
- if d.endswith('+1.00'):
- d = d.replace('+1.00', '+0100')
- if d.endswith('+-100'):
- d = d.replace('+-100', '+0100')
- if d.endswith('+500'):
- d = d.replace('+500', '+0500')
- if d.endswith('-500'):
- d = d.replace('-500', '-0500')
- if d.endswith('-700'):
- d = d.replace('-700', '-0700')
- if d.endswith('-800'):
- d = d.replace('-800', '-0800')
- if d.endswith('+05-30'):
- d = d.replace('+05-30', '+0530')
- if d.endswith('+0-900'):
- d = d.replace('+0-900', '-0900')
- if d.endswith('Mexico/General'):
- d = d.replace('Mexico/General','CDT')
- if d.endswith('Pacific Daylight Time'):
- d = d.replace('Pacific Daylight Time', 'PDT')
- if d.endswith(' ZE2'):
- d = d.replace(' ZE2',' +0200')
- if d.find('-Juin-') > 0:
- d = d.replace('-Juin-','-Jun-')
- if d.find('-Juil-') > 0:
- d = d.replace('-Juil-','-Jul-')
- if d.find(' 0 (GMT)'):
- d = d.replace(' 0 (GMT)',' +0000')
-
- if self._date_multiminus_re.search(d):
- d = self._date_multiminus_re.sub(' \\1', d)
-
- if self._date_offsetnoplus_re.search(d):
- d = self._date_offsetnoplus_re.sub('+\\1', d)
-
-
- # We have a number of dates in the format
- # "<full datespace> +0200 (MET DST)"
- # or similar. The problem coming from the space within the
- # parenthesis, or if the contents of the parenthesis is
- # completely empty
- if self._date_multi_re.search(d):
- d = self._date_multi_re.sub('', d)
-
- # If the spec is instead
- # "<full datespace> +0200 (...)"
- # of any kind, we can just remove what's in the (), because the
- # parser is just going to rely on the fixed offset anyway.
- if self._date_multi_re2.search(d):
- d = self._date_multi_re2.sub(' \\1', d)
-
- try:
- dp = dateutil.parser.parse(d, fuzzy=True)
-
- # Some offsets are >16 hours, which postgresql will not
- # (for good reasons) accept
- if dp.utcoffset() and abs(dp.utcoffset().days * (24*60*60) + dp.utcoffset().seconds) > 60*60*16-1:
- # Convert it to a UTC timestamp using Python. It will give
- # us the right time, but the wrong timezone. Should be
- # enough...
- dp = datetime.datetime(*dp.utctimetuple()[:6])
- return dp
- except Exception as e:
- raise IgnorableException("Failed to parse date '%s': %s" % (d, e))
-
- def _maybe_decode(self, s, charset):
- if isinstance(s, str):
- return s.strip(' ')
- return str(s, charset and self.clean_charset(charset) or 'us-ascii', errors='ignore').strip(' ')
-
- # Workaround for broken quoting in some MUAs (see below)
- _re_mailworkaround = re.compile('"(=\?[^\?]+\?[QB]\?[^\?]+\?=)"', re.IGNORECASE)
- def _decode_mime_header(self, hdr, email_workaround):
- if hdr == None:
- return None
-
- # Per http://bugs.python.org/issue504152 (and lots of testing), it seems
- # we must get rid of the sequence \n\t at least in the header. If we
- # do this *before* doing any MIME decoding, we should be safe against
- # anybody *actually* putting that sequence in the header (since we
- # won't match the encoded contents)
- hdr = hdr.replace("\n\t"," ")
-
- # In at least some cases, at least gmail (and possibly other MUAs)
- # incorrectly put double quotes in the name/email field even when
- # it's encoded. That's not allowed - they have to be escaped - but
- # since there's a fair amount of those, we apply a regex to get
- # rid of them.
- m = self._re_mailworkaround.search(hdr)
- if m:
- hdr = self._re_mailworkaround.sub(r'\1', hdr)
-
- try:
- return " ".join([self._maybe_decode(s, charset) for s, charset in decode_header(hdr)])
- except HeaderParseError as e:
- # Parser error is typically someone specifying an encoding,
- # but then not actually using that encoding. We'll do the best
- # we can, which is cut it down to ascii and ignore errors
- return str(hdr, 'us-ascii', errors='ignore').strip(' ')
-
- def decode_mime_header(self, hdr, email_workaround=False):
- try:
- if isinstance(hdr, Header):
- hdr = hdr.encode()
-
- h = self._decode_mime_header(hdr, email_workaround)
- if h:
- return h.replace("\0", "")
- return ''
- except LookupError as e:
- raise IgnorableException("Failed to decode header value '%s': %s" % (hdr, e))
- except ValueError as ve:
- raise IgnorableException("Failed to decode header value '%s': %s" % (hdr, ve))
-
- def get_mandatory(self, fieldname):
- try:
- x = self.msg[fieldname]
- if x==None:
- raise Exception()
- return x
- except:
- raise IgnorableException("Mandatory field '%s' is missing" % fieldname)
-
- def get_optional(self, fieldname):
- try:
- return self.msg[fieldname]
- except:
- return ''
-
- def html_clean(self, html):
- # First we pass it through tidy
- (html, errors) = tidylib.tidy_document(html,
- options={
- 'drop-proprietary-attributes': 1,
- 'alt-text': '',
- 'hide-comments': 1,
- 'output-xhtml': 1,
- 'show-body-only': 1,
- 'clean': 1,
- 'char-encoding': 'utf8',
- 'show-warnings': 0,
- 'show-info': 0,
- })
- if errors:
- print(("HTML tidy failed for %s!" % self.msgid))
- print(errors)
- return None
-
- try:
- cleaner = HTMLCleaner()
- cleaner.feed(html)
- return cleaner.get_text()
- except Exception as e:
- # Failed to parse the html, thus failed to clean it. so we must
- # give up...
- return None
+ def __init__(self):
+ self.parser = BytesParser(policy=compat32)
+
+ def parse(self, stream):
+ self.rawtxt = stream.read()
+ self.msg = self.parser.parse(io.BytesIO(self.rawtxt))
+
+ def is_msgid(self, msgid):
+ # Look for a specific messageid. This means we might parse it twice,
+ # but so be it. Any exception means we know it's not this one...
+ try:
+ if self.clean_messageid(self.decode_mime_header(self.get_mandatory('Message-ID'))) == msgid:
+ return True
+ except Exception as e:
+ return False
+
+ def analyze(self, date_override=None):
+ self.msgid = self.clean_messageid(self.decode_mime_header(self.get_mandatory('Message-ID')))
+ self._from = self.decode_mime_header(self.get_mandatory('From'), True)
+ self.to = self.decode_mime_header(self.get_optional('To'), True)
+ self.cc = self.decode_mime_header(self.get_optional('CC'), True)
+ self.subject = self.decode_mime_header(self.get_optional('Subject'))
+ if date_override:
+ self.date = self.forgiving_date_decode(date_override)
+ else:
+ self.date = self.forgiving_date_decode(self.decode_mime_header(self.get_mandatory('Date')))
+ self.bodytxt = self.get_body()
+ self.attachments = []
+ self.get_attachments()
+ if len(self.attachments) > 0:
+ log.status("Found %s attachments" % len(self.attachments))
+
+        # Build a list of the message ids we are interested in
+ self.parents = []
+ # The first one is in-reply-to, if it exists
+ if self.get_optional('in-reply-to'):
+ m = self.clean_messageid(self.decode_mime_header(self.get_optional('in-reply-to')), True)
+ if m:
+ self.parents.append(m)
+
+ # Then we add all References values, in backwards order
+ if self.get_optional('references'):
+ cleaned_msgids = [self.clean_messageid(x, True) for x in reversed(self.decode_mime_header(self.get_optional('references')).split())]
+ # Can't do this with a simple self.parents.extend() due to broken
+ # mailers that add the same reference more than once. And we can't
+ # use a set() to make it unique, because order is very important
+ for m in cleaned_msgids:
+ if m and not m in self.parents:
+ self.parents.append(m)
+
+
+ def clean_charset(self, charset):
+ lcharset = charset.lower()
+ if lcharset == 'unknown-8bit' or lcharset == 'x-unknown' or lcharset == 'unknown':
+ # Special case where we don't know... We'll assume
+ # us-ascii and use replacements
+ return 'us-ascii'
+ if lcharset == '0' or lcharset == 'x-user-defined' or lcharset == '_autodetect_all' or lcharset == 'default_charset':
+ # Seriously broken charset definitions, map to us-ascii
+ # and throw away the rest with replacements
+ return 'us-ascii'
+ if lcharset == 'x-gbk':
+ # Some MUAs set it to x-gbk, but there is a valid
+            # declaration as gbk...
+ return 'gbk'
+ if lcharset == 'iso-8859-8-i':
+ # -I is a special logical version, but should be the
+ # same charset
+ return 'iso-8859-8'
+ if lcharset == 'windows-874':
+ # This is an alias for iso-8859-11
+ return 'iso-8859-11'
+ if lcharset == 'iso-88-59-1' or lcharset == 'iso-8858-1':
+ # Strange way of saying 8859....
+ return 'iso-8859-1'
+ if lcharset == 'iso885915':
+ return 'iso-8859-15'
+ if lcharset == 'iso-latin-2':
+ return 'iso-8859-2'
+ if lcharset == 'iso-850':
+ # Strange spelling of cp850 (windows charset)
+ return 'cp850'
+ if lcharset == 'koi8r':
+ return 'koi8-r'
+ if lcharset == 'cp 1252':
+ return 'cp1252'
+ if lcharset == 'iso-8859-1,iso-8859-2' or lcharset == 'iso-8859-1:utf8:us-ascii':
+ # Why did this show up more than once?!
+ return 'iso-8859-1'
+ if lcharset == 'x-windows-949':
+ return 'ms949'
+ if lcharset == 'pt_pt' or lcharset == 'de_latin' or lcharset == 'de':
+ # This is a locale, and not a charset, but most likely it's this one
+ return 'iso-8859-1'
+ if lcharset == 'iso-8858-15':
+ # How is this a *common* mistake?
+ return 'iso-8859-15'
+ if lcharset == 'macintosh':
+ return 'mac_roman'
+ if lcharset == 'cn-big5':
+ return 'big5'
+ if lcharset == 'x-unicode-2-0-utf-7':
+ return 'utf-7'
+ if lcharset == 'tscii':
+ # No support for this charset :S Map it down to ascii
+ # and throw away all the rest. sucks, but we have to
+ return 'us-ascii'
+ return charset
+
+ def get_payload_as_unicode(self, msg):
+ try:
+ b = msg.get_payload(decode=True)
+ except AssertionError:
+ # Badly encoded data can throw an exception here, where the python
+            # libraries fail to handle it and enter a cannot-happen path.
+ # In which case we just ignore it and hope for a better MIME part later.
+ b = None
+
+ if b:
+ # Find out if there is a charset
+ charset = None
+ params = msg.get_params()
+ if not params:
+ # No content-type, so we assume us-ascii
+ return str(b, 'us-ascii', errors='ignore')
+ for k,v in params:
+ if k.lower() == 'charset':
+ charset = v
+ break
+ if charset:
+ try:
+ return str(b, self.clean_charset(charset), errors='ignore')
+ except LookupError as e:
+ raise IgnorableException("Failed to get unicode payload: %s" % e)
+ else:
+ # XXX: reasonable default?
+ return str(b, errors='ignore')
+ # Return None or empty string, depending on what we got back
+ return b
+
+ # Regular expression matching the PostgreSQL custom mail footer that
+ # is appended to all emails.
+ _re_footer = re.compile('(.*)--\s+\nSent via [^\s]+ mailing list \([^\)]+\)\nTo make changes to your subscription:\nhttp://www\.postgresql\.org/mailpref/[^\s]+\s*$', re.DOTALL)
+ def get_body(self):
+ b = self._get_body()
+ if b:
+ # Python bug 9133, allows unicode surrogate pairs - which PostgreSQL will
+ # later reject..
+            if b.find('\udbff\n\udef8') != -1:
+ b = b.replace('\udbff\n\udef8', '')
+
+ # Remove postgres specific mail footer - if it's there
+ m = self._re_footer.match(b)
+ if m:
+ b = m.group(1)
+
+ # Sometimes we end up with a trailing \0 when decoding long strings, so
+ # replace it if it's there.
+ # In fact, replace it everywhere, since it can also turn up in the middle
+ # of a text when it's a really broken decoding.
+ b = b.replace('\0', '')
+
+ return b
+
+ def _get_body(self):
+ # This is where the magic happens - try to figure out what the body
+ # of this message should render as.
+ hasempty = False
+
+ # First see if this is a single-part message that we can just
+ # decode and go.
+ b = self.get_payload_as_unicode(self.msg)
+ if b: return b
+ if b == '':
+ # We found something, but it was empty. We'll keep looking as
+ # there might be something better available, but make a note
+ # that empty exists.
+ hasempty = True
+
+ # Ok, it's multipart. Find the first part that is text/plain,
+ # and use that one. Do this recursively, since we may have something
+ # like:
+ # multipart/mixed:
+ # multipart/alternative:
+ # text/plain
+ # text/html
+ # application/octet-stream (attachment)
+ b = self.recursive_first_plaintext(self.msg)
+ if b: return b
+ if b == '':
+ hasempty = True
+
+ # Couldn't find a plaintext. Look for the first HTML in that case.
+ # Fallback, but what can we do at this point...
+ b = self.recursive_first_plaintext(self.msg, True)
+ if b:
+ b = self.html_clean(b)
+ if b: return b
+ if b == '' or b is None:
+ hasempty = True
+
+ if hasempty:
+ log.status('Found empty body in %s' % self.msgid)
+ return ''
+ raise IgnorableException("Don't know how to read the body from %s" % self.msgid)
+
+ def recursive_first_plaintext(self, container, html_instead=False):
+ pl = container.get_payload()
+ if isinstance(pl, str):
+ # This was not a multipart, but it leaked... Give up!
+ return None
+ for p in pl:
+ if p.get_params() == None:
+ # MIME multipart/mixed, but no MIME type on the part
+ log.status("Found multipart/mixed in message '%s', but no MIME type on part. Trying text/plain." % self.msgid)
+ return self.get_payload_as_unicode(p)
+ if p.get_params()[0][0].lower() == 'text/plain':
+ # Don't include it if it looks like an attachment
+ if 'Content-Disposition' in p and p['Content-Disposition'].startswith('attachment'):
+ continue
+ return self.get_payload_as_unicode(p)
+ if html_instead and p.get_params()[0][0].lower() == 'text/html':
+ # Don't include it if it looks like an attachment
+ if 'Content-Disposition' in p and p['Content-Disposition'].startswith('attachment'):
+ continue
+ return self.get_payload_as_unicode(p)
+ if p.is_multipart():
+ b = self.recursive_first_plaintext(p, html_instead)
+ if b or b == '': return b
+
+ # Yikes, nothing here! Hopefully we'll find something when
+ # we continue looping at a higher level.
+ return None
+
+ def get_attachments(self):
+ self.attachments_found_first_plaintext = False
+ self.recursive_get_attachments(self.msg)
+
+    # Clean a filename's encoding and return it as a unicode string
+ def _clean_filename_encoding(self, filename):
+ # If this is a header-encoded filename, start by decoding that
+ if filename.startswith('=?'):
+ decoded, encoding = decode_header(filename)[0]
+ return str(decoded, encoding, errors='ignore')
+
+ # If it's already unicode, just return it
+ if isinstance(filename, str):
+ return filename
+
+ # Anything that's not UTF8, we just get rid of. We can live with
+ # filenames slightly mangled in this case.
+ return str(filename, 'utf-8', errors='ignore')
+
+ def _extract_filename(self, container):
+ # Try to get the filename for an attachment in the container.
+ # If the standard library can figure one out, use that one.
+ f = container.get_filename()
+ if f: return self._clean_filename_encoding(f)
+
+ # Failing that, some mailers set Content-Description to the
+ # filename
+ if 'Content-Description' in container:
+ return self._clean_filename_encoding(container['Content-Description'])
+ return None
+
+ def recursive_get_attachments(self, container):
+ # We start recursion in the "multipart" container if any
+ if container.get_content_type() == 'multipart/mixed' or container.get_content_type() == 'multipart/signed':
+ # Multipart - worth scanning into
+ if not container.is_multipart():
+ # Wow, this is broken. It's multipart/mixed, but doesn't
+ # contain multiple parts.
+ # Since we're just looking for attachments, let's just
+ # ignore it...
+ return
+ for p in container.get_payload():
+ if p.get_params() == None:
+ continue
+ self.recursive_get_attachments(p)
+ elif container.get_content_type() == 'multipart/alternative':
+ # Alternative is not an attachment (we decide)
+            # It's typically plaintext + html
+ self.attachments_found_first_plaintext = True
+ return
+ elif container.is_multipart():
+            # Other kinds of multipart - nothing to collect from them
+ return
+ else:
+ # Not a multipart.
+ # Exclude specific contenttypes
+ if container.get_content_type() == 'application/pgp-signature':
+ return
+ if container.get_content_type() in ('application/pkcs7-signature', 'application/x-pkcs7-signature'):
+ return
+ # For now, accept anything not text/plain
+ if container.get_content_type() != 'text/plain':
+ try:
+ self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True)))
+ except AssertionError:
+ # Badly encoded data can throw an exception here, where the python
+                    # libraries fail to handle it and enter a cannot-happen path.
+ # In which case we just ignore this attachment.
+ return
+ return
+
+ # It's a text/plain, it might be worthwhile.
+            # If it has a name, we consider it an attachment
+ if not container.get_params():
+ return
+ for k,v in container.get_params():
+ if k=='name' and v != '':
+ # Yes, it has a name
+ try:
+ self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True)))
+ except AssertionError:
+ # Badly encoded data can throw an exception here, where the python
+                        # libraries fail to handle it and enter a cannot-happen path.
+ # In which case we just ignore this attachment.
+ return
+
+ return
+
+ # If it's content-disposition=attachment, we also want to save it
+ if 'Content-Disposition' in container and container['Content-Disposition'].startswith('attachment'):
+ try:
+ self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True)))
+ except AssertionError:
+ # Badly encoded data can throw an exception here, where the python
+                # libraries fail to handle it and enter a cannot-happen path.
+ # In which case we just ignore this attachment.
+ return
+
+ return
+
+ # If we have already found one text/plain part, make all
+ # further text/plain parts attachments
+ if self.attachments_found_first_plaintext:
+ # However, this will also *always* catch the MIME part added
+ # by majordomo with the footer. So if that one is present,
+ # we need to explicitly exclude it again.
+ try:
+ b = container.get_payload(decode=True)
+ except AssertionError:
+ # Badly encoded data can throw an exception here, where the python
+                # libraries fail to handle it and enter a cannot-happen path.
+ # In which case we just ignore this attachment.
+ return
+
+ if isinstance(b, str) and not self._re_footer.match(b):
+ # We know there is no name for this one
+ self.attachments.append((None, container.get_content_type(), b))
+ return
+
+ # Ok, so this was a plaintext that we ignored. Set the flag
+ # that we have now ignored one, so we'll make the next one
+ # an attachment.
+ self.attachments_found_first_plaintext = True
+ # No name, and text/plain, so ignore it
+
+ re_msgid = re.compile('^\s*<(.*)>\s*')
+ def clean_messageid(self, messageid, ignorebroken=False):
+ m = self.re_msgid.match(messageid)
+ if not m:
+ if ignorebroken:
+ log.status("Could not parse messageid '%s', ignoring it" % messageid)
+ return None
+ raise IgnorableException("Could not parse message id '%s'" % messageid)
+ return m.groups(1)[0].replace(' ','')
+
+# _date_multi_re = re.compile(' \((\w+\s\w+(\s+\w+)*|)\)$')
+    # Now using [^\s] instead of \w, to work with Japanese chars
+ _date_multi_re = re.compile(' \(([^\s]+\s[^\s]+(\s+[^\s]+)*|)\)$')
+ _date_multi_re2 = re.compile(' ([\+-]\d{4}) \([^)]+\)$')
+ _date_multiminus_re = re.compile(' -(-\d+)$')
+ _date_offsetnoplus_re = re.compile(' (\d{4})$')
+ def forgiving_date_decode(self, d):
+ if d.strip() == '':
+ raise IgnorableException("Failed to parse empty date")
+ # Strange timezones requiring manual adjustments
+ if d.endswith('-7700 (EST)'):
+ d = d.replace('-7700 (EST)', 'EST')
+ if d.endswith('+6700 (EST)'):
+ d = d.replace('+6700 (EST)', 'EST')
+ if d.endswith('+-4-30'):
+ d = d.replace('+-4-30', '+0430')
+ if d.endswith('+1.00'):
+ d = d.replace('+1.00', '+0100')
+ if d.endswith('+-100'):
+ d = d.replace('+-100', '+0100')
+ if d.endswith('+500'):
+ d = d.replace('+500', '+0500')
+ if d.endswith('-500'):
+ d = d.replace('-500', '-0500')
+ if d.endswith('-700'):
+ d = d.replace('-700', '-0700')
+ if d.endswith('-800'):
+ d = d.replace('-800', '-0800')
+ if d.endswith('+05-30'):
+ d = d.replace('+05-30', '+0530')
+ if d.endswith('+0-900'):
+ d = d.replace('+0-900', '-0900')
+ if d.endswith('Mexico/General'):
+ d = d.replace('Mexico/General','CDT')
+ if d.endswith('Pacific Daylight Time'):
+ d = d.replace('Pacific Daylight Time', 'PDT')
+ if d.endswith(' ZE2'):
+ d = d.replace(' ZE2',' +0200')
+ if d.find('-Juin-') > 0:
+ d = d.replace('-Juin-','-Jun-')
+ if d.find('-Juil-') > 0:
+ d = d.replace('-Juil-','-Jul-')
+        if d.find(' 0 (GMT)') != -1:
+ d = d.replace(' 0 (GMT)',' +0000')
+
+ if self._date_multiminus_re.search(d):
+ d = self._date_multiminus_re.sub(' \\1', d)
+
+ if self._date_offsetnoplus_re.search(d):
+ d = self._date_offsetnoplus_re.sub('+\\1', d)
+
+
+ # We have a number of dates in the format
+ # "<full datespace> +0200 (MET DST)"
+        # or similar. The problem comes from the space within the
+        # parentheses, or from the contents of the parentheses being
+        # completely empty
+ if self._date_multi_re.search(d):
+ d = self._date_multi_re.sub('', d)
+
+ # If the spec is instead
+ # "<full datespace> +0200 (...)"
+ # of any kind, we can just remove what's in the (), because the
+ # parser is just going to rely on the fixed offset anyway.
+ if self._date_multi_re2.search(d):
+ d = self._date_multi_re2.sub(' \\1', d)
+
+ try:
+ dp = dateutil.parser.parse(d, fuzzy=True)
+
+ # Some offsets are >16 hours, which postgresql will not
+ # (for good reasons) accept
+ if dp.utcoffset() and abs(dp.utcoffset().days * (24*60*60) + dp.utcoffset().seconds) > 60*60*16-1:
+ # Convert it to a UTC timestamp using Python. It will give
+ # us the right time, but the wrong timezone. Should be
+ # enough...
+ dp = datetime.datetime(*dp.utctimetuple()[:6])
+ return dp
+ except Exception as e:
+ raise IgnorableException("Failed to parse date '%s': %s" % (d, e))
+
+ def _maybe_decode(self, s, charset):
+ if isinstance(s, str):
+ return s.strip(' ')
+ return str(s, charset and self.clean_charset(charset) or 'us-ascii', errors='ignore').strip(' ')
+
+ # Workaround for broken quoting in some MUAs (see below)
+ _re_mailworkaround = re.compile('"(=\?[^\?]+\?[QB]\?[^\?]+\?=)"', re.IGNORECASE)
+ def _decode_mime_header(self, hdr, email_workaround):
+ if hdr == None:
+ return None
+
+ # Per http://bugs.python.org/issue504152 (and lots of testing), it seems
+ # we must get rid of the sequence \n\t at least in the header. If we
+ # do this *before* doing any MIME decoding, we should be safe against
+ # anybody *actually* putting that sequence in the header (since we
+ # won't match the encoded contents)
+ hdr = hdr.replace("\n\t"," ")
+
+        # In at least some cases, Gmail (and possibly other MUAs)
+        # incorrectly puts double quotes in the name/email field even when
+ # it's encoded. That's not allowed - they have to be escaped - but
+ # since there's a fair amount of those, we apply a regex to get
+ # rid of them.
+ m = self._re_mailworkaround.search(hdr)
+ if m:
+ hdr = self._re_mailworkaround.sub(r'\1', hdr)
+
+ try:
+ return " ".join([self._maybe_decode(s, charset) for s, charset in decode_header(hdr)])
+ except HeaderParseError as e:
+ # Parser error is typically someone specifying an encoding,
+ # but then not actually using that encoding. We'll do the best
+            # we can, which is to cut it down to ascii and ignore errors
+ return str(hdr, 'us-ascii', errors='ignore').strip(' ')
+
+ def decode_mime_header(self, hdr, email_workaround=False):
+ try:
+ if isinstance(hdr, Header):
+ hdr = hdr.encode()
+
+ h = self._decode_mime_header(hdr, email_workaround)
+ if h:
+ return h.replace("\0", "")
+ return ''
+ except LookupError as e:
+ raise IgnorableException("Failed to decode header value '%s': %s" % (hdr, e))
+ except ValueError as ve:
+ raise IgnorableException("Failed to decode header value '%s': %s" % (hdr, ve))
+
+ def get_mandatory(self, fieldname):
+ try:
+ x = self.msg[fieldname]
+ if x==None:
+ raise Exception()
+ return x
+ except:
+ raise IgnorableException("Mandatory field '%s' is missing" % fieldname)
+
+ def get_optional(self, fieldname):
+ try:
+ return self.msg[fieldname]
+ except:
+ return ''
+
+ def html_clean(self, html):
+ # First we pass it through tidy
+ (html, errors) = tidylib.tidy_document(html,
+ options={
+ 'drop-proprietary-attributes': 1,
+ 'alt-text': '',
+ 'hide-comments': 1,
+ 'output-xhtml': 1,
+ 'show-body-only': 1,
+ 'clean': 1,
+ 'char-encoding': 'utf8',
+ 'show-warnings': 0,
+ 'show-info': 0,
+ })
+ if errors:
+ print(("HTML tidy failed for %s!" % self.msgid))
+ print(errors)
+ return None
+
+ try:
+ cleaner = HTMLCleaner()
+ cleaner.feed(html)
+ return cleaner.get_text()
+ except Exception as e:
+            # Failed to parse the HTML, and thus failed to clean it, so we
+            # must give up...
+ return None
class HTMLCleaner(HTMLParser):
- def __init__(self):
- HTMLParser.__init__(self)
- self.io = io.StringIO()
+ def __init__(self):
+ HTMLParser.__init__(self)
+ self.io = io.StringIO()
- def get_text(self):
- return self.io.getvalue()
+ def get_text(self):
+ return self.io.getvalue()
- def handle_data(self, data):
- self.io.write(data)
+ def handle_data(self, data):
+ self.io.write(data)
- def handle_starttag(self, tag, attrs):
- if tag == "p" or tag == "br":
- self.io.write("\n")
+ def handle_starttag(self, tag, attrs):
+ if tag == "p" or tag == "br":
+ self.io.write("\n")
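
Taken together, ArchivesParser turns one raw message into the fields the loader stores: msgid, _from, to, cc, subject, date, bodytxt, attachments, and the parents list used for threading. A minimal end-to-end sketch (the input file name is hypothetical):

    from lib.parser import ArchivesParser

    p = ArchivesParser()
    with open("message.eml", "rb") as f:
        p.parse(f)
    p.analyze()
    print(p.msgid, p.date, p.subject)
    print("%d attachments, %d candidate parents" % (len(p.attachments), len(p.parents)))
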
diff --git a/loader/lib/storage.py b/loader/lib/storage.py
index 8962b87..2303fee 100644
--- a/loader/lib/storage.py
+++ b/loader/lib/storage.py
@@ -5,307 +5,307 @@ from .parser import ArchivesParser
from lib.log import log, opstatus
class ArchivesParserStorage(ArchivesParser):
- def __init__(self):
- super(ArchivesParserStorage, self).__init__()
- self.purges = set()
+ def __init__(self):
+ super(ArchivesParserStorage, self).__init__()
+ self.purges = set()
- def purge_list(self, listid, year, month):
- self.purges.add((int(listid), int(year), int(month)))
+ def purge_list(self, listid, year, month):
+ self.purges.add((int(listid), int(year), int(month)))
- def purge_thread(self, threadid):
- self.purges.add(int(threadid))
+ def purge_thread(self, threadid):
+ self.purges.add(int(threadid))
- def store(self, conn, listid, overwrite=False):
- curs = conn.cursor()
+ def store(self, conn, listid, overwrite=False):
+ curs = conn.cursor()
- # Potentially add the information that there exists a mail for
- # this month. We do that this early since we're always going to
- # make the check anyway, and this keeps the code in one place..
- if not overwrite:
- curs.execute("INSERT INTO list_months (listid, year, month) SELECT %(listid)s, %(year)s, %(month)s WHERE NOT EXISTS (SELECT listid FROM list_months WHERE listid=%(listid)s AND year=%(year)s AND month=%(month)s)", {
- 'listid': listid,
- 'year': self.date.year,
- 'month': self.date.month,
- })
+ # Potentially add the information that there exists a mail for
+ # this month. We do that this early since we're always going to
+ # make the check anyway, and this keeps the code in one place..
+ if not overwrite:
+ curs.execute("INSERT INTO list_months (listid, year, month) SELECT %(listid)s, %(year)s, %(month)s WHERE NOT EXISTS (SELECT listid FROM list_months WHERE listid=%(listid)s AND year=%(year)s AND month=%(month)s)", {
+ 'listid': listid,
+ 'year': self.date.year,
+ 'month': self.date.month,
+ })
- curs.execute("SELECT threadid, EXISTS(SELECT threadid FROM list_threads lt WHERE lt.listid=%(listid)s AND lt.threadid=m.threadid), id FROM messages m WHERE m.messageid=%(messageid)s", {
- 'messageid': self.msgid,
- 'listid': listid,
- })
- r = curs.fetchall()
- if len(r) > 0:
- # Has to be 1 row, since we have a unique index on id
- if not r[0][1] and not overwrite:
- log.status("Tagging message %s with list %s" % (self.msgid, listid))
- curs.execute("INSERT INTO list_threads (threadid, listid) VALUES (%(threadid)s, %(listid)s)", {
- 'threadid': r[0][0],
- 'listid': listid,
- })
- opstatus.tagged += 1
- self.purge_list(listid, self.date.year, self.date.month)
- self.purge_thread(r[0][0])
- else:
- opstatus.dupes += 1
+ curs.execute("SELECT threadid, EXISTS(SELECT threadid FROM list_threads lt WHERE lt.listid=%(listid)s AND lt.threadid=m.threadid), id FROM messages m WHERE m.messageid=%(messageid)s", {
+ 'messageid': self.msgid,
+ 'listid': listid,
+ })
+ r = curs.fetchall()
+ if len(r) > 0:
+ # Has to be 1 row, since we have a unique index on id
+ if not r[0][1] and not overwrite:
+ log.status("Tagging message %s with list %s" % (self.msgid, listid))
+ curs.execute("INSERT INTO list_threads (threadid, listid) VALUES (%(threadid)s, %(listid)s)", {
+ 'threadid': r[0][0],
+ 'listid': listid,
+ })
+ opstatus.tagged += 1
+ self.purge_list(listid, self.date.year, self.date.month)
+ self.purge_thread(r[0][0])
+ else:
+ opstatus.dupes += 1
- if overwrite:
- pk = r[0][2]
- self.purge_thread(r[0][0])
- # Overwrite an existing message. We do not attempt to
- # "re-thread" a message, we just update the contents. We
- # do remove all attachments and rewrite them. Of course, we
- # don't change the messageid (since it's our primary
- # identifyer), and we don't update the raw text of the message.
- # (since we are expected to have used that raw text to do
- # the re-parsing initially)
- # We update bodytext as a separate step so as not to rewrite
- # the TOAST table unnecessarily...
- curs.execute("UPDATE messages SET bodytxt=%(bodytxt)s WHERE id=%(id)s AND NOT (bodytxt=%(bodytxt)s) RETURNING id", {
- 'id': pk,
- 'bodytxt': self.bodytxt,
- })
- rc = curs.rowcount
- curs.execute("UPDATE messages SET _from=%(from)s, _to=%(to)s, cc=%(cc)s, subject=%(subject)s, date=%(date)s, has_attachment=%(has_attachment)s WHERE id=%(id)s AND NOT (_from=%(from)s AND _to=%(to)s AND cc=%(cc)s AND subject=%(subject)s AND date=%(date)s AND has_attachment=%(has_attachment)s) RETURNING id", {
- 'id': pk,
- 'from': self._from,
- 'to': self.to or '',
- 'cc': self.cc or '',
- 'subject': self.subject or '',
- 'date': self.date,
- 'has_attachment': len(self.attachments) > 0,
- })
- rc += curs.rowcount
- if rc == 0:
- log.status("Message %s unchanged" % self.msgid)
- return False
+ if overwrite:
+ pk = r[0][2]
+ self.purge_thread(r[0][0])
+ # Overwrite an existing message. We do not attempt to
+ # "re-thread" a message, we just update the contents. We
+ # do remove all attachments and rewrite them. Of course, we
+ # don't change the messageid (since it's our primary
+                # identifier), and we don't update the raw text of the message.
+ # (since we are expected to have used that raw text to do
+ # the re-parsing initially)
+ # We update bodytext as a separate step so as not to rewrite
+ # the TOAST table unnecessarily...
+ curs.execute("UPDATE messages SET bodytxt=%(bodytxt)s WHERE id=%(id)s AND NOT (bodytxt=%(bodytxt)s) RETURNING id", {
+ 'id': pk,
+ 'bodytxt': self.bodytxt,
+ })
+ rc = curs.rowcount
+ curs.execute("UPDATE messages SET _from=%(from)s, _to=%(to)s, cc=%(cc)s, subject=%(subject)s, date=%(date)s, has_attachment=%(has_attachment)s WHERE id=%(id)s AND NOT (_from=%(from)s AND _to=%(to)s AND cc=%(cc)s AND subject=%(subject)s AND date=%(date)s AND has_attachment=%(has_attachment)s) RETURNING id", {
+ 'id': pk,
+ 'from': self._from,
+ 'to': self.to or '',
+ 'cc': self.cc or '',
+ 'subject': self.subject or '',
+ 'date': self.date,
+ 'has_attachment': len(self.attachments) > 0,
+ })
+ rc += curs.rowcount
+ if rc == 0:
+ log.status("Message %s unchanged" % self.msgid)
+ return False
- curs.execute("DELETE FROM attachments WHERE message=%(message)s", {
- 'message': pk,
- })
- if len(self.attachments):
- curs.executemany("INSERT INTO attachments (message, filename, contenttype, attachment) VALUES (%(message)s, %(filename)s, %(contenttype)s, %(attachment)s)",[ {
- 'message': pk,
- 'filename': a[0] or 'unknown_filename',
- 'contenttype': a[1],
- 'attachment': bytearray(a[2]),
- } for a in self.attachments])
- opstatus.overwritten += 1
- log.status("Message %s overwritten" % self.msgid)
- else:
- log.status("Message %s already stored" % self.msgid)
- return True
+ curs.execute("DELETE FROM attachments WHERE message=%(message)s", {
+ 'message': pk,
+ })
+ if len(self.attachments):
+ curs.executemany("INSERT INTO attachments (message, filename, contenttype, attachment) VALUES (%(message)s, %(filename)s, %(contenttype)s, %(attachment)s)",[ {
+ 'message': pk,
+ 'filename': a[0] or 'unknown_filename',
+ 'contenttype': a[1],
+ 'attachment': bytearray(a[2]),
+ } for a in self.attachments])
+ opstatus.overwritten += 1
+ log.status("Message %s overwritten" % self.msgid)
+ else:
+ log.status("Message %s already stored" % self.msgid)
+ return True
- if overwrite:
- raise Exception("Attempt to overwrite message (%s) that doesn't exist on list %s!" % (self.msgid, listid))
- # Always purge the primary list for this thread
- self.purge_list(listid, self.date.year, self.date.month)
+ if overwrite:
+ raise Exception("Attempt to overwrite message (%s) that doesn't exist on list %s!" % (self.msgid, listid))
+ # Always purge the primary list for this thread
+ self.purge_list(listid, self.date.year, self.date.month)
- # Resolve own thread
- curs.execute("SELECT id, messageid, threadid FROM messages WHERE messageid=ANY(%(parents)s)", {
- 'parents': self.parents,
- })
- all_parents = curs.fetchall()
- if len(all_parents):
- # At least one of the parents exist. Now try to figure out which one
- best_parent = len(self.parents)+1
- best_threadid = -1
- best_parentid = None
- for i in range(0,len(all_parents)):
- for j in range(0,len(self.parents)):
- if self.parents[j] == all_parents[i][1]:
- # This messageid found. Better than the last one?
- if j < best_parent:
- best_parent = j
- best_parentid = all_parents[i][0]
- best_threadid = all_parents[i][2]
- if best_threadid == -1:
- raise Exception("Message %s, resolve failed in a way it shouldn't :P" % selg.msgid)
- self.parentid = best_parentid
- self.threadid = best_threadid
- # Slice away all matches that are worse than the one we wanted
- self.parents = self.parents[:best_parent]
+ # Resolve own thread
+ curs.execute("SELECT id, messageid, threadid FROM messages WHERE messageid=ANY(%(parents)s)", {
+ 'parents': self.parents,
+ })
+ all_parents = curs.fetchall()
+ if len(all_parents):
+            # At least one of the parents exists. Now try to figure out which one
+ best_parent = len(self.parents)+1
+ best_threadid = -1
+ best_parentid = None
+ for i in range(0,len(all_parents)):
+ for j in range(0,len(self.parents)):
+ if self.parents[j] == all_parents[i][1]:
+ # This messageid found. Better than the last one?
+ if j < best_parent:
+ best_parent = j
+ best_parentid = all_parents[i][0]
+ best_threadid = all_parents[i][2]
+ if best_threadid == -1:
+                raise Exception("Message %s, resolve failed in a way it shouldn't :P" % self.msgid)
+ self.parentid = best_parentid
+ self.threadid = best_threadid
+ # Slice away all matches that are worse than the one we wanted
+ self.parents = self.parents[:best_parent]
- log.status("Message %s resolved to existing thread %s, waiting for %s better messages" % (self.msgid, self.threadid, len(self.parents)))
- else:
- # No parent exist. But don't create the threadid just yet, since
- # it's possible that we're somebody elses parent!
- self.parentid = None
- self.threadid = None
+ log.status("Message %s resolved to existing thread %s, waiting for %s better messages" % (self.msgid, self.threadid, len(self.parents)))
+ else:
+            # No parent exists. But don't create the threadid just yet, since
+            # it's possible that we're somebody else's parent!
+ self.parentid = None
+ self.threadid = None
- # Now see if we are somebody elses *parent*...
- curs.execute("SELECT message, priority, threadid FROM unresolved_messages INNER JOIN messages ON messages.id=unresolved_messages.message WHERE unresolved_messages.msgid=%(msgid)s ORDER BY threadid", {
- 'msgid': self.msgid,
- })
- childrows = curs.fetchall()
- if len(childrows):
- # We are some already existing message's parent (meaning the
- # messages arrived out of order)
- # In the best case, the threadid is the same for all threads.
- # But it might be different if this it the "glue message" that's
- # holding other threads together.
- if self.threadid:
- # Already have a threadid, means that we have a glue message
- print("Message %s resolved to existing thread %s, while being somebodys parent" % (self.msgid, self.threadid))
- else:
- print("Message %s did not resolve to existing thread, but is somebodys parent" % self.msgid)
- # In this case, just pick the first thread from the list and merge into that
- # one.
- self.threadid = childrows[0][2]
+        # Now see if we are somebody else's *parent*...
+ curs.execute("SELECT message, priority, threadid FROM unresolved_messages INNER JOIN messages ON messages.id=unresolved_messages.message WHERE unresolved_messages.msgid=%(msgid)s ORDER BY threadid", {
+ 'msgid': self.msgid,
+ })
+ childrows = curs.fetchall()
+ if len(childrows):
+ # We are some already existing message's parent (meaning the
+ # messages arrived out of order)
+ # In the best case, the threadid is the same for all threads.
+            # But it might be different if this is the "glue message" that's
+ # holding other threads together.
+ if self.threadid:
+ # Already have a threadid, means that we have a glue message
+                print("Message %s resolved to existing thread %s, while being somebody's parent" % (self.msgid, self.threadid))
+ else:
+                print("Message %s did not resolve to existing thread, but is somebody's parent" % self.msgid)
+ # In this case, just pick the first thread from the list and merge into that
+ # one.
+ self.threadid = childrows[0][2]
- # Get a unique list (set) of all threads *except* the primary one,
- # because we'll be merging into that one.
- mergethreads = set([r[2] for r in childrows]).difference(set((self.threadid,)))
- if len(mergethreads):
- # We have one or more merge threads
- log.status("Merging threads %s into thread %s" % (",".join(str(s) for s in mergethreads), self.threadid))
- curs.execute("UPDATE messages SET threadid=%(threadid)s WHERE threadid=ANY(%(oldthreadids)s)", {
- 'threadid': self.threadid,
- 'oldthreadids': list(mergethreads),
- })
- # Insert any lists that were tagged on the merged threads
- curs.execute("INSERT INTO list_threads (threadid, listid) SELECT DISTINCT %(threadid)s,listid FROM list_threads lt2 WHERE lt2.threadid=ANY(%(oldthreadids)s) AND listid NOT IN (SELECT listid FROM list_threads lt3 WHERE lt3.threadid=%(threadid)s)", {
- 'threadid': self.threadid,
- 'oldthreadids': list(mergethreads),
- })
- # Remove all old leftovers
- curs.execute("DELETE FROM list_threads WHERE threadid=ANY(%(oldthreadids)s)", {
- 'oldthreadids': list(mergethreads),
- })
- # Purge varnish records for all the threads we just removed
- for t in mergethreads:
- self.purge_thread(t)
+ # Get a unique list (set) of all threads *except* the primary one,
+ # because we'll be merging into that one.
+ mergethreads = set([r[2] for r in childrows]).difference(set((self.threadid,)))
+ if len(mergethreads):
+ # We have one or more merge threads
+ log.status("Merging threads %s into thread %s" % (",".join(str(s) for s in mergethreads), self.threadid))
+ curs.execute("UPDATE messages SET threadid=%(threadid)s WHERE threadid=ANY(%(oldthreadids)s)", {
+ 'threadid': self.threadid,
+ 'oldthreadids': list(mergethreads),
+ })
+ # Insert any lists that were tagged on the merged threads
+ curs.execute("INSERT INTO list_threads (threadid, listid) SELECT DISTINCT %(threadid)s,listid FROM list_threads lt2 WHERE lt2.threadid=ANY(%(oldthreadids)s) AND listid NOT IN (SELECT listid FROM list_threads lt3 WHERE lt3.threadid=%(threadid)s)", {
+ 'threadid': self.threadid,
+ 'oldthreadids': list(mergethreads),
+ })
+ # Remove all old leftovers
+ curs.execute("DELETE FROM list_threads WHERE threadid=ANY(%(oldthreadids)s)", {
+ 'oldthreadids': list(mergethreads),
+ })
+ # Purge varnish records for all the threads we just removed
+ for t in mergethreads:
+ self.purge_thread(t)
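
When the children found above span more than one thread, this message is the glue holding them together: one thread is kept as primary and the rest are folded into it. A rough pure-Python sketch of the set arithmetic, with made-up row values:

    # childrows are (message, priority, threadid) tuples ordered by threadid;
    # the primary thread is the already-resolved one, or the first child's.
    childrows = [(10, 0, 5), (11, 1, 5), (12, 0, 9)]
    threadid = None
    if threadid is None:
        threadid = childrows[0][2]            # -> 5
    mergethreads = set(r[2] for r in childrows) - {threadid}
    assert mergethreads == {9}                # thread 9 merges into thread 5
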
- # Batch all the children for repointing. We can't do the actual
- # repointing until later, since we don't know our own id yet.
- self.children = [r[0] for r in childrows]
- log.status("Children set to %s with mergethreads being %s (from childrows %s and threadid %s)" % (
- self.children, mergethreads, childrows, self.threadid))
+ # Batch all the children for repointing. We can't do the actual
+ # repointing until later, since we don't know our own id yet.
+ self.children = [r[0] for r in childrows]
+ log.status("Children set to %s with mergethreads being %s (from childrows %s and threadid %s)" % (
+ self.children, mergethreads, childrows, self.threadid))
-            # Finally, remove all the pending entries whose priority value is
-            # equal to or higher than ours (higher meaning less important)
- curs.executemany("DELETE FROM unresolved_messages WHERE message=%(msg)s AND priority >= %(prio)s", [{
- 'msg': msg,
- 'prio': prio,
- } for msg, prio, tid in childrows])
- else:
- self.children = []
+            # Finally, remove all the pending entries whose priority value is
+            # equal to or higher than ours (higher meaning less important)
+ curs.executemany("DELETE FROM unresolved_messages WHERE message=%(msg)s AND priority >= %(prio)s", [{
+ 'msg': msg,
+ 'prio': prio,
+ } for msg, prio, tid in childrows])
+ else:
+ self.children = []
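
The DELETE above runs once per matched child: since that child resolved to us at some priority, any of its remaining unresolved candidates at that priority or worse (higher number = less preferred) are now moot. A hedged pure-Python illustration with invented rows:

    # unresolved rows are (message, priority, msgid); childrows carry the
    # priority at which each child matched us.
    unresolved = [(10, 0, '<us@x>'), (10, 1, '<other@x>'), (11, 2, '<us@x>')]
    childrows = [(10, 0), (11, 2)]
    for msg, prio in childrows:
        unresolved = [r for r in unresolved
                      if not (r[0] == msg and r[1] >= prio)]
    assert unresolved == []
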
- if not self.threadid:
- # No parent and no child exists - create a new threadid, just for us!
- curs.execute("SELECT nextval('threadid_seq')")
- self.threadid = curs.fetchall()[0][0]
- log.status("Message %s resolved to no parent (out of %s) and no child, new thread %s" % (self.msgid, len(self.parents), self.threadid))
- else:
- # We have a threadid already, so we're not a new thread. Thus,
- # we need to purge the old thread
- self.purge_thread(self.threadid)
+ if not self.threadid:
+ # No parent and no child exists - create a new threadid, just for us!
+ curs.execute("SELECT nextval('threadid_seq')")
+ self.threadid = curs.fetchall()[0][0]
+ log.status("Message %s resolved to no parent (out of %s) and no child, new thread %s" % (self.msgid, len(self.parents), self.threadid))
+ else:
+ # We have a threadid already, so we're not a new thread. Thus,
+ # we need to purge the old thread
+ self.purge_thread(self.threadid)
- # Insert a thread tag if we're on a new list
- curs.execute("INSERT INTO list_threads (threadid, listid) SELECT %(threadid)s, %(listid)s WHERE NOT EXISTS (SELECT * FROM list_threads t2 WHERE t2.threadid=%(threadid)s AND t2.listid=%(listid)s) RETURNING threadid", {
- 'threadid': self.threadid,
- 'listid': listid,
- })
- if len(curs.fetchall()):
- log.status("Tagged thread %s with listid %s" % (self.threadid, listid))
+ # Insert a thread tag if we're on a new list
+ curs.execute("INSERT INTO list_threads (threadid, listid) SELECT %(threadid)s, %(listid)s WHERE NOT EXISTS (SELECT * FROM list_threads t2 WHERE t2.threadid=%(threadid)s AND t2.listid=%(listid)s) RETURNING threadid", {
+ 'threadid': self.threadid,
+ 'listid': listid,
+ })
+ if len(curs.fetchall()):
+ log.status("Tagged thread %s with listid %s" % (self.threadid, listid))
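
The tagging statement relies on INSERT ... SELECT ... WHERE NOT EXISTS ... RETURNING, so a single round trip both inserts the missing tag and reports whether it did. A hedged sketch of the same idiom, assuming a psycopg2 cursor and a hypothetical toy table tags(thread int, list int):

    # RETURNING yields a row only when the INSERT actually fired, so
    # fetchall() doubles as the "was this tag new?" test.
    def tag_once(curs, threadid, listid):
        curs.execute(
            "INSERT INTO tags (thread, list) "
            "SELECT %(t)s, %(l)s "
            "WHERE NOT EXISTS (SELECT 1 FROM tags "
            "                  WHERE thread=%(t)s AND list=%(l)s) "
            "RETURNING thread",
            {'t': threadid, 'l': listid})
        return bool(curs.fetchall())
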
- curs.execute("INSERT INTO messages (parentid, threadid, _from, _to, cc, subject, date, has_attachment, messageid, bodytxt, rawtxt) VALUES (%(parentid)s, %(threadid)s, %(from)s, %(to)s, %(cc)s, %(subject)s, %(date)s, %(has_attachment)s, %(messageid)s, %(bodytxt)s, %(rawtxt)s) RETURNING id", {
- 'parentid': self.parentid,
- 'threadid': self.threadid,
- 'from': self._from,
- 'to': self.to or '',
- 'cc': self.cc or '',
- 'subject': self.subject or '',
- 'date': self.date,
- 'has_attachment': len(self.attachments) > 0,
- 'messageid': self.msgid,
- 'bodytxt': self.bodytxt,
- 'rawtxt': bytearray(self.rawtxt),
- })
- id = curs.fetchall()[0][0]
- log.status("Message %s, got id %s, set thread %s, parent %s" % (
- self.msgid, id, self.threadid, self.parentid))
- if len(self.attachments):
- # Insert attachments
-            curs.executemany("INSERT INTO attachments (message, filename, contenttype, attachment) VALUES (%(message)s, %(filename)s, %(contenttype)s, %(attachment)s)", [{
- 'message': id,
- 'filename': a[0] or 'unknown_filename',
- 'contenttype': a[1],
- 'attachment': bytearray(a[2]),
- } for a in self.attachments])
+ curs.execute("INSERT INTO messages (parentid, threadid, _from, _to, cc, subject, date, has_attachment, messageid, bodytxt, rawtxt) VALUES (%(parentid)s, %(threadid)s, %(from)s, %(to)s, %(cc)s, %(subject)s, %(date)s, %(has_attachment)s, %(messageid)s, %(bodytxt)s, %(rawtxt)s) RETURNING id", {
+ 'parentid': self.parentid,
+ 'threadid': self.threadid,
+ 'from': self._from,
+ 'to': self.to or '',
+ 'cc': self.cc or '',
+ 'subject': self.subject or '',
+ 'date': self.date,
+ 'has_attachment': len(self.attachments) > 0,
+ 'messageid': self.msgid,
+ 'bodytxt': self.bodytxt,
+ 'rawtxt': bytearray(self.rawtxt),
+ })
+ id = curs.fetchall()[0][0]
+ log.status("Message %s, got id %s, set thread %s, parent %s" % (
+ self.msgid, id, self.threadid, self.parentid))
+ if len(self.attachments):
+ # Insert attachments
+            curs.executemany("INSERT INTO attachments (message, filename, contenttype, attachment) VALUES (%(message)s, %(filename)s, %(contenttype)s, %(attachment)s)", [{
+ 'message': id,
+ 'filename': a[0] or 'unknown_filename',
+ 'contenttype': a[1],
+ 'attachment': bytearray(a[2]),
+ } for a in self.attachments])
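
Both the attachments insert above and the child-repointing below use psycopg2's executemany with one mapping per row. A small sketch of the pattern, assuming an open psycopg2 cursor curs and made-up data:

    # One SQL template, one dict per attachment; bytearray makes the binary
    # payload adaptable as bytea.
    attachments = [('a.txt', 'text/plain', b'hi'), (None, 'image/png', b'...')]
    curs.executemany(
        "INSERT INTO attachments (message, filename, contenttype, attachment) "
        "VALUES (%(message)s, %(filename)s, %(contenttype)s, %(attachment)s)",
        [{
            'message': 42,                        # hypothetical message id
            'filename': fn or 'unknown_filename',
            'contenttype': ct,
            'attachment': bytearray(blob),
        } for fn, ct, blob in attachments])
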
- if len(self.children):
- log.status("Setting %s other messages to children of %s" % (len(self.children), self.msgid))
- curs.executemany("UPDATE messages SET parentid=%(parent)s WHERE id=%(id)s",
- [{'parent': id, 'id': c} for c in self.children])
- if len(self.parents):
-            # There are remaining parents we'd rather have, to get ourselves
-            # properly threaded - so store them in the db.
- curs.executemany("INSERT INTO unresolved_messages (message, priority, msgid) VALUES (%(id)s, %(priority)s, %(msgid)s)",
- [{'id': id, 'priority': i, 'msgid': self.parents[i]} for i in range(0, len(self.parents))])
+ if len(self.children):
+ log.status("Setting %s other messages to children of %s" % (len(self.children), self.msgid))
+ curs.executemany("UPDATE messages SET parentid=%(parent)s WHERE id=%(id)s",
+ [{'parent': id, 'id': c} for c in self.children])
+ if len(self.parents):
+            # There are remaining parents we'd rather have, to get ourselves
+            # properly threaded - so store them in the db.
+ curs.executemany("INSERT INTO unresolved_messages (message, priority, msgid) VALUES (%(id)s, %(priority)s, %(msgid)s)",
+ [{'id': id, 'priority': i, 'msgid': self.parents[i]} for i in range(0, len(self.parents))])
- opstatus.stored += 1
- return True
+ opstatus.stored += 1
+ return True
- def diff(self, conn, f, fromonlyf, oldid):
- curs = conn.cursor()
+ def diff(self, conn, f, fromonlyf, oldid):
+ curs = conn.cursor()
- # Fetch the old one so we have something to diff against
- curs.execute("SELECT id, _from, _to, cc, subject, date, has_attachment, bodytxt FROM messages WHERE messageid=%(msgid)s", {
- 'msgid': self.msgid,
- })
- try:
- id, _from, to, cc, subject, date, has_attachment, bodytxt = curs.fetchone()
- except TypeError as e:
- f.write("---- %s ----\n" % self.msgid)
- f.write("Could not re-find in archives (old id was %s): %s\n" % (oldid, e))
- f.write("\n-------------------------------\n\n")
- return
+ # Fetch the old one so we have something to diff against
+ curs.execute("SELECT id, _from, _to, cc, subject, date, has_attachment, bodytxt FROM messages WHERE messageid=%(msgid)s", {
+ 'msgid': self.msgid,
+ })
+ try:
+ id, _from, to, cc, subject, date, has_attachment, bodytxt = curs.fetchone()
+ except TypeError as e:
+ f.write("---- %s ----\n" % self.msgid)
+ f.write("Could not re-find in archives (old id was %s): %s\n" % (oldid, e))
+ f.write("\n-------------------------------\n\n")
+ return
- if (_from.rstrip(), to.rstrip(), cc.rstrip(), subject.rstrip()) != (self._from, self.to, self.cc, self.subject):
- log.status("Message %s has header changes " % self.msgid)
- f.write("==== %s ====\n" % self.msgid)
- for fn in ['_from', 'to', 'cc', 'subject']:
- if getattr(self, fn) != eval(fn):
- s = "- {0}: {1}\n".format(fn, eval(fn))
- d = "+ {0}: {1}\n".format(fn, getattr(self, fn))
- f.write(s)
- f.write(d)
- f.write("\n\n")
+ if (_from.rstrip(), to.rstrip(), cc.rstrip(), subject.rstrip()) != (self._from, self.to, self.cc, self.subject):
+ log.status("Message %s has header changes " % self.msgid)
+ f.write("==== %s ====\n" % self.msgid)
+ for fn in ['_from', 'to', 'cc', 'subject']:
+ if getattr(self, fn) != eval(fn):
+ s = "- {0}: {1}\n".format(fn, eval(fn))
+ d = "+ {0}: {1}\n".format(fn, getattr(self, fn))
+ f.write(s)
+ f.write(d)
+ f.write("\n\n")
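
The loop above reaches the old header values through eval(fn) on local variable names. An eval-free equivalent (a hypothetical rewrite, not what the loader ships) is to collect the old values in a dict up front:

    # Same comparison without eval(): key the old values like the attributes.
    old = {'_from': _from, 'to': to, 'cc': cc, 'subject': subject}
    for fn in ['_from', 'to', 'cc', 'subject']:
        if getattr(self, fn) != old[fn]:
            f.write("- {0}: {1}\n".format(fn, old[fn]))
            f.write("+ {0}: {1}\n".format(fn, getattr(self, fn)))
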
- if bodytxt != self.bodytxt:
- log.status("Message %s has body changes " % self.msgid)
- tempdiff = list(difflib.unified_diff(bodytxt.splitlines(),
- self.bodytxt.splitlines(),
- fromfile='old',
- tofile='new',
- n=0,
- lineterm=''))
- if (len(tempdiff)-2) % 3 == 0:
-                # 3 rows per hunk, plus two header rows.
-                # Then verify that each slice of 3 contains one @@ row (header), one -From and one +>From,
-                # which indicates the only change is in the From line.
- ok = True
- tempdiff = tempdiff[2:]
- while tempdiff:
-                    a, b, c = (tempdiff.pop(0), tempdiff.pop(0), tempdiff.pop(0))
-                    if not (a.startswith('@@ ') and b.startswith('-From ') and c.startswith('+>From ')):
-                        ok = False
- break
- if ok:
- fromonlyf.write("%s\n" % self.msgid)
- return
+ if bodytxt != self.bodytxt:
+ log.status("Message %s has body changes " % self.msgid)
+ tempdiff = list(difflib.unified_diff(bodytxt.splitlines(),
+ self.bodytxt.splitlines(),
+ fromfile='old',
+ tofile='new',
+ n=0,
+ lineterm=''))
+ if (len(tempdiff)-2) % 3 == 0:
+                # 3 rows per hunk, plus two header rows.
+                # Then verify that each slice of 3 contains one @@ row (header), one -From and one +>From,
+                # which indicates the only change is in the From line.
+ ok = True
+ tempdiff = tempdiff[2:]
+ while tempdiff:
+                    a, b, c = (tempdiff.pop(0), tempdiff.pop(0), tempdiff.pop(0))
+                    if not (a.startswith('@@ ') and b.startswith('-From ') and c.startswith('+>From ')):
+                        ok = False
+ break
+ if ok:
+ fromonlyf.write("%s\n" % self.msgid)
+ return
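
The triple check above matches diffs that consist purely of mbox From lines being escaped to >From. A self-contained illustration of why such a diff is two header rows plus a multiple of three:

    import difflib

    # With n=0, each escaped From line yields exactly one '@@' hunk header,
    # one '-From ' row and one '+>From ' row.
    old = "From hacker\nhello\n".splitlines()
    new = ">From hacker\nhello\n".splitlines()
    d = list(difflib.unified_diff(old, new, fromfile='old', tofile='new',
                                  n=0, lineterm=''))
    assert (len(d) - 2) % 3 == 0
    a, b, c = d[2], d[3], d[4]
    assert a.startswith('@@ ') and b.startswith('-From ') and c.startswith('+>From ')
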
- # Generate a nicer diff
- d = list(difflib.unified_diff(bodytxt.splitlines(),
- self.bodytxt.splitlines(),
- fromfile='old',
- tofile='new',
- n=0,
- lineterm=''))
- if len(d) > 0:
- f.write("---- %s ----\n" % self.msgid)
- f.write("\n".join(d))
- f.write("\n\n")
- else:
- log.status("Message %s unchanged." % self.msgid)
+ # Generate a nicer diff
+ d = list(difflib.unified_diff(bodytxt.splitlines(),
+ self.bodytxt.splitlines(),
+ fromfile='old',
+ tofile='new',
+ n=0,
+ lineterm=''))
+ if len(d) > 0:
+ f.write("---- %s ----\n" % self.msgid)
+ f.write("\n".join(d))
+ f.write("\n\n")
+ else:
+ log.status("Message %s unchanged." % self.msgid)
diff --git a/loader/lib/varnish.py b/loader/lib/varnish.py
index f2a06c3..99d2d50 100644
--- a/loader/lib/varnish.py
+++ b/loader/lib/varnish.py
@@ -3,31 +3,31 @@ import requests
from lib.log import log
class VarnishPurger(object):
- def __init__(self, cfg):
- self.cfg = cfg
+ def __init__(self, cfg):
+ self.cfg = cfg
- def purge(self, purges):
- if not len(purges):
- return
+ def purge(self, purges):
+ if not len(purges):
+ return
- if not self.cfg.has_option('varnish', 'purgeurl'):
- return
+ if not self.cfg.has_option('varnish', 'purgeurl'):
+ return
- purgeurl = self.cfg.get('varnish', 'purgeurl')
- exprlist = []
- for p in purges:
- if isinstance(p, tuple):
- # Purging a list
- exprlist.append('obj.http.x-pglm ~ :%s/%s/%s:' % p)
- else:
- # Purging individual thread
- exprlist.append('obj.http.x-pgthread ~ :%s:' % p)
-        purgedict = dict(zip(['p%s' % n for n in range(len(exprlist))], exprlist))
- purgedict['n'] = len(exprlist)
- r = requests.post(purgeurl, data=purgedict, headers={
- 'Content-type': 'application/x-www-form-urlencoded',
- 'Host': 'www.postgresql.org',
- })
- if r.status_code != 200:
- log.error("Failed to send purge request!")
+ purgeurl = self.cfg.get('varnish', 'purgeurl')
+ exprlist = []
+ for p in purges:
+ if isinstance(p, tuple):
+ # Purging a list
+ exprlist.append('obj.http.x-pglm ~ :%s/%s/%s:' % p)
+ else:
+ # Purging individual thread
+ exprlist.append('obj.http.x-pgthread ~ :%s:' % p)
+        purgedict = dict(zip(['p%s' % n for n in range(len(exprlist))], exprlist))
+ purgedict['n'] = len(exprlist)
+ r = requests.post(purgeurl, data=purgedict, headers={
+ 'Content-type': 'application/x-www-form-urlencoded',
+ 'Host': 'www.postgresql.org',
+ })
+ if r.status_code != 200:
+ log.error("Failed to send purge request!")
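
For reference, the purge request is a plain form POST of numbered ban expressions plus a count. A hedged example of the payload built above, with made-up ids (a tuple purges a list/year/month, a bare value a single thread):

    purges = [(12, 2019, 1), 5571]
    exprlist = []
    for p in purges:
        if isinstance(p, tuple):
            exprlist.append('obj.http.x-pglm ~ :%s/%s/%s:' % p)
        else:
            exprlist.append('obj.http.x-pgthread ~ :%s:' % p)
    purgedict = dict(zip(['p%s' % n for n in range(len(exprlist))], exprlist))
    purgedict['n'] = len(exprlist)
    # -> {'p0': 'obj.http.x-pglm ~ :12/2019/1:',
    #     'p1': 'obj.http.x-pgthread ~ :5571:', 'n': 2}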