r"""UTF-8 sanitizer.
Python's UTF-8 parser is quite relaxed, this creates problems when
talking with other software that uses stricter parsers.
>>> safe_utf8_decode("foobar")
(True, u'foobar')
>>> safe_utf8_decode('X\xed\xa0\x80Y\xed\xb0\x89Z')
(False, u'X\ufffdY\ufffdZ')
>>> safe_utf8_decode('X\xed\xa0\x80\xed\xb0\x89Z')
(False, u'X\U00010009Z')
>>> safe_utf8_decode('X\0Z')
(False, u'X\ufffdZ')
>>> safe_utf8_decode('OK')
(True, u'OK')
"""
import re
__all__ = ['safe_utf8_decode']
# use the same replacement symbol (U+FFFD) as the 'replace' error handler
REPLACEMENT_SYMBOL = unichr(0xFFFD)
def _fix_utf8(m):
    """Merge UTF-16 surrogate pairs, replace other invalid symbols."""
    u = m.group()
    if len(u) == 2:
        # high + low surrogate pair: merge into a single code point
        c1 = ord(u[0])
        c2 = ord(u[1])
        c = 0x10000 + ((c1 & 0x3FF) << 10) + (c2 & 0x3FF)
        return unichr(c)
    else:
        # lone surrogate or NUL: use replacement symbol
        return REPLACEMENT_SYMBOL
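
# Worked example (illustration only; it mirrors the module doctest above):
# Python 2's relaxed decoder turns the bytes '\xed\xa0\x80\xed\xb0\x89' into
# the surrogate pair u'\ud800\udc09'; merging it gives
#   0x10000 + ((0xD800 & 0x3FF) << 10) + (0xDC09 & 0x3FF) == 0x10009
# which is the u'\U00010009' seen in the doctest output.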
# lazily compiled regex, shared by all sanitize_unicode() calls
_urc = None
def sanitize_unicode(u):
    """Fix invalid symbols in unicode string."""
    global _urc
    assert isinstance(u, unicode)

    # regex for finding invalid chars, works on unicode string
    if not _urc:
        rx = u"[\uD800-\uDBFF] [\uDC00-\uDFFF]? | [\0\uDC00-\uDFFF]"
        _urc = re.compile(rx, re.X)

    # now find and fix UTF16 surrogates
    m = _urc.search(u)
    if m:
        u = _urc.sub(_fix_utf8, u)
    return u
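
# Illustration (not part of the module's public API): a lone surrogate is
# replaced, a high+low pair is merged back into one code point:
#   sanitize_unicode(u'X\ud800Y')       -> u'X\ufffdY'
#   sanitize_unicode(u'X\ud800\udc09Y') -> u'X\U00010009Y'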
def safe_utf8_decode(s):
    """Decode UTF-8 safely.

    Acts like str.decode('utf8', 'replace') but also fixes
    UTF16 surrogates and NUL bytes, which Python's default
    decoder does not do.

    @param s: utf8-encoded byte string
    @return: tuple of (was_valid_utf8, unicode_string)
    """
    # decode with error detection
    ok = True
    try:
        # expect no errors by default
        u = s.decode('utf8')
    except UnicodeDecodeError:
        u = s.decode('utf8', 'replace')
        ok = False

    u2 = sanitize_unicode(u)
    if u is not u2:
        ok = False

    return (ok, u2)
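
# Caller sketch (an assumption, not part of this module): code that must hand
# the result to a stricter UTF-8 parser would typically re-encode the
# sanitized unicode string, e.g.:
#   ok, ustr = safe_utf8_decode(raw_bytes)   # raw_bytes is a hypothetical input
#   clean = ustr.encode('utf8')              # well-formed UTF-8, no surrogates or NULs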
if __name__ == '__main__':
    import doctest
    doctest.testmod()