summaryrefslogtreecommitdiff
path: root/loader
diff options
context:
space:
mode:
authorMagnus Hagander2019-01-08 10:05:14 +0000
committerMagnus Hagander2019-01-08 10:05:14 +0000
commit2bc1f82f8f8efea5b26fb28b987579b98d7ee171 (patch)
treee6470918c5834f55a4b4cf0e4ba6e2063944f708 /loader
parent420333a64ce22694f7899b2bacb54074dd66fa9b (diff)
Fix date parsing to be even more forgiving
In particular, if parsing the date either fails or if it results in a date that's in the future, fall back to parsing the dates out of the Received: headers instead, because at some point there we will find a parsable date for sure (if not before then when it hit one of our own servers)
Diffstat (limited to 'loader')
-rw-r--r--loader/lib/parser.py22
1 files changed, 22 insertions, 0 deletions
diff --git a/loader/lib/parser.py b/loader/lib/parser.py
index d22d8b0..c6ee98f 100644
--- a/loader/lib/parser.py
+++ b/loader/lib/parser.py
@@ -13,6 +13,8 @@ import io
from lib.exception import IgnorableException
from lib.log import log
+_re_received = re.compile('^from .*;([^(]+)(\s*\(envelope-from.*)?', re.I | re.DOTALL)
+
class ArchivesParser(object):
def __init__(self):
@@ -41,6 +43,24 @@ class ArchivesParser(object):
self.date = self.forgiving_date_decode(date_override)
else:
self.date = self.forgiving_date_decode(self.decode_mime_header(self.get_mandatory('Date')))
+
+ # Accept times up to 4 hours in the future, for badly synced clocks
+ maxdate = datetime.datetime.now(datetime.timezone.utc) + datetime.timedelta(hours=4)
+ if self.date > maxdate:
+ # Date is in the future, we don't trust that. Instead, let's see if we can find
+ # it in the raw text of the message.
+ def _extract_date(d):
+ m = _re_received.match(d)
+ if m:
+ try:
+ return self.forgiving_date_decode(m.group(1).strip())
+ except IgnorableException:
+ pass
+
+ lowdate = min((x for x in map(_extract_date, self.msg.get_all('Received')) if x and x < maxdate))
+ if lowdate:
+ self.date = lowdate
+ # Else we're going to go with what we found
self.bodytxt = self.get_body()
self.attachments = []
self.get_attachments()
@@ -467,6 +487,8 @@ class ArchivesParser(object):
# us the right time, but the wrong timezone. Should be
# enough...
dp = datetime.datetime(*dp.utctimetuple()[:6])
+ if not dp.tzinfo:
+ dp = dp.replace(tzinfo=datetime.timezone.utc)
return dp
except Exception as e:
raise IgnorableException("Failed to parse date '%s': %s" % (d, e))