summaryrefslogtreecommitdiff
path: root/python
diff options
context:
space:
mode:
authorMarko Kreen2011-06-20 13:03:33 +0000
committerMarko Kreen2011-06-20 13:03:33 +0000
commit35a9539921a067c9a579a1b4b5bd46d7b3371752 (patch)
treefa70e5f59b4bad803418d6ad1dce303f2524d830 /python
parent65003af1895cfdc004ad7abc09e3d4e02da6ac3a (diff)
encodingvalidator: fix utf16 surrogate pairs
seems like python does not think of them as errors
Diffstat (limited to 'python')
-rw-r--r--python/londiste/handlers/dispatch.py23
1 files changed, 23 insertions, 0 deletions
diff --git a/python/londiste/handlers/dispatch.py b/python/londiste/handlers/dispatch.py
index 56fc964c..f861af72 100644
--- a/python/londiste/handlers/dispatch.py
+++ b/python/londiste/handlers/dispatch.py
@@ -144,6 +144,7 @@ import sys
import datetime
import new
import codecs
+import re
import skytools
from londiste.handler import BaseHandler
from skytools import quote_ident, quote_fqident, UsageError
@@ -578,6 +579,12 @@ ROW_HANDLERS = {'plain': RowHandler,
# stores current EncodingValidator
FIXENC_DATA = None
+# find UTF-16 surrogate pairs and lone (unpaired) surrogate code units
+_sgrc = re.compile(u"""
+ [\uD800-\uDBFF] [\uDC00-\uDFFF] ?
+ | [\uDC00-\uDFFF]
+ """, re.X)
+
class EncodingValidator:
def __init__(self, log, encoding = 'utf-8', replacement = u'\ufffd'):
"""validates the correctness of given encoding. when data contains
@@ -597,12 +604,28 @@ class EncodingValidator:
self.columns = columns
self.error_count = 0
_unicode = data.decode(self.encoding, "fixenc_error_handler")
+ # Python does not flag surrogates as errors when decoding, so fix them explicitly
+ _unicode = _sgrc.sub(self.sgfix, _unicode)
# when no errors then return input data as is, else re-encode fixed data
if self.error_count == 0:
return data
else:
return _unicode.encode(self.encoding)
+ def sgfix(self, m):
+ """Fix UTF16 surrogate pair"""
+ self.error_count += 1
+ val = m.group()
+ if len(val) == 2:
+ self.log.warning('combining utf16 surrogate pair')
+ c1 = ord(val[0])
+ c2 = ord(val[1])
+ c = 0x10000 + ((c1 & 0x3FF) << 10) + (c2 & 0x3FF)
+ return unichr(c)
+ else:
+ self.log.warning('replacing utf16 surrogate code')
+ return self.replacement
+
def validate_dict(self, data):
"""validates data in dict"""
for _key, _val in data.items():