encodingvalidator: fix utf16 surrogate pairs

Python's decoder apparently does not flag them as errors, so they are fixed explicitly after decoding.
author: Marko Kreen 2011-06-20 13:03:33 +0000
committer: Marko Kreen 2011-06-20 13:03:33 +0000
commit: 35a9539921a067c9a579a1b4b5bd46d7b3371752 (patch)
tree: fa70e5f59b4bad803418d6ad1dce303f2524d830 /python
parent: 65003af1895cfdc004ad7abc09e3d4e02da6ac3a (diff)
1 files changed, 23 insertions, 0 deletions
diff --git a/python/londiste/handlers/dispatch.py b/python/londiste/handlers/dispatch.py
index 56fc964c..f861af72 100644
--- a/python/londiste/handlers/dispatch.py
+++ b/python/londiste/handlers/dispatch.py
@@ -144,6 +144,7 @@ import sys
 import datetime
 import new
 import codecs
+import re
 import skytools
 from londiste.handler import BaseHandler
 from skytools import quote_ident, quote_fqident, UsageError
@@ -578,6 +579,12 @@ ROW_HANDLERS = {'plain': RowHandler,
 # stores current EncodingValidator
 FIXENC_DATA = None
 
+# match UTF16 surrogates: a high surrogate with an optional low partner, or a lone low surrogate
+_sgrc = re.compile(u"""
+            [\uD800-\uDBFF] [\uDC00-\uDFFF] ?
+          | [\uDC00-\uDFFF]
+      """, re.X)
+
 class EncodingValidator:
     def __init__(self, log, encoding = 'utf-8', replacement = u'\ufffd'):
         """validates the correctness of given encoding. when data contains 
@@ -597,12 +604,28 @@ class EncodingValidator:
         self.columns = columns
         self.error_count = 0
         _unicode = data.decode(self.encoding, "fixenc_error_handler")
+        # python does not tag surrogate pairs as error, fix them explicitly
+        _unicode = _sgrc.sub(self.sgfix, _unicode)
         # when no erros then return input data as is, else re-encode fixed data
         if self.error_count == 0:
             return data
         else:
             return _unicode.encode(self.encoding)
 
+    def sgfix(self, m):
+        """re.sub() callback for _sgrc: merge a surrogate pair into one character, replace a lone surrogate."""
+        self.error_count += 1  # any match counts as an error, so the caller re-encodes instead of returning raw input
+        val = m.group()
+        if len(val) == 2:  # high surrogate followed by its low surrogate
+            self.log.warning('combining utf16 surrogate pair')
+            c1 = ord(val[0])
+            c2 = ord(val[1])
+            c = 0x10000 + ((c1 & 0x3FF) << 10) + (c2 & 0x3FF)  # 10 payload bits from each half, offset past the BMP
+            return unichr(c)  # NOTE(review): unichr() rejects values > 0xFFFF on narrow (UCS-2) Python 2 builds -- confirm
+        else:  # lone surrogate: nothing to pair it with
+            self.log.warning('replacing utf16 surrogate code')
+            return self.replacement
+
     def validate_dict(self, data):
         """validates data in dict"""
         for _key, _val in data.items():
author	Marko Kreen	2011-06-20 13:03:33 +0000
committer	Marko Kreen	2011-06-20 13:03:33 +0000
commit	35a9539921a067c9a579a1b4b5bd46d7b3371752 (patch)
tree	fa70e5f59b4bad803418d6ad1dce303f2524d830 /python
parent	65003af1895cfdc004ad7abc09e3d4e02da6ac3a (diff)