diff options
| author | Marko Kreen | 2011-06-20 13:03:33 +0000 |
|---|---|---|
| committer | Marko Kreen | 2011-06-20 13:03:33 +0000 |
| commit | 35a9539921a067c9a579a1b4b5bd46d7b3371752 (patch) | |
| tree | fa70e5f59b4bad803418d6ad1dce303f2524d830 /python | |
| parent | 65003af1895cfdc004ad7abc09e3d4e02da6ac3a (diff) | |
encodingvalidator: fix utf16 surrogate pairs
seems like python does not think of them as errors
Diffstat (limited to 'python')
| -rw-r--r-- | python/londiste/handlers/dispatch.py | 23 |
1 files changed, 23 insertions, 0 deletions
```diff
diff --git a/python/londiste/handlers/dispatch.py b/python/londiste/handlers/dispatch.py
index 56fc964c..f861af72 100644
--- a/python/londiste/handlers/dispatch.py
+++ b/python/londiste/handlers/dispatch.py
@@ -144,6 +144,7 @@ import sys
 import datetime
 import new
 import codecs
+import re
 import skytools
 from londiste.handler import BaseHandler
 from skytools import quote_ident, quote_fqident, UsageError
@@ -578,6 +579,12 @@ ROW_HANDLERS = {'plain': RowHandler,
 # stores current EncodingValidator
 FIXENC_DATA = None
 
+# find UTF16 surrogate pairs
+_sgrc = re.compile(u"""
+    [\uD800-\uDBFF] [\uDC00-\uDFFF] ?
+    | [\uDC00-\uDFFF]
+    """, re.X)
+
 class EncodingValidator:
     def __init__(self, log, encoding = 'utf-8', replacement = u'\ufffd'):
         """validates the correctness of given encoding. when data contains
@@ -597,12 +604,28 @@ class EncodingValidator:
         self.columns = columns
         self.error_count = 0
         _unicode = data.decode(self.encoding, "fixenc_error_handler")
+        # python does not tag surrogate pairs as error, fix them explicitly
+        _unicode = _sgrc.sub(self.sgfix, _unicode)
         # when no erros then return input data as is, else re-encode fixed data
         if self.error_count == 0:
             return data
         else:
             return _unicode.encode(self.encoding)
 
+    def sgfix(self, m):
+        """Fix UTF16 surrogate pair"""
+        self.error_count += 1
+        val = m.group()
+        if len(val) == 2:
+            self.log.warning('combining utf16 surrogate pair')
+            c1 = ord(val[0])
+            c2 = ord(val[1])
+            c = 0x10000 + ((c1 & 0x3FF) << 10) + (c2 & 0x3FF)
+            return unichr(c)
+        else:
+            self.log.warning('replacing utf16 surrogate code')
+            return self.replacement
+
     def validate_dict(self, data):
         """validates data in dict"""
         for _key, _val in data.items():
```
