| author | Marko Kreen | 2008-02-28 09:27:25 +0000 |
|---|---|---|
| committer | Marko Kreen | 2008-02-28 09:27:25 +0000 |
| commit | 012e25634c81aaa5ad32084d4d588ec43fd5c838 (patch) | |
| tree | 5f9f8cdb66b37cc079fab43f7232c4b59cad3e07 /python/skytools | |
| parent | 8408b4ae8498b96695e4c730054beb092a2c7967 (diff) | |
bring new quoting & parsing code to head
Diffstat (limited to 'python/skytools')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | python/skytools/__init__.py | 2 |
| -rw-r--r-- | python/skytools/_pyquoting.py | 153 |
| -rw-r--r-- | python/skytools/parsing.py | 272 |
| -rw-r--r-- | python/skytools/quoting.py | 247 |
4 files changed, 435 insertions, 239 deletions
```diff
diff --git a/python/skytools/__init__.py b/python/skytools/__init__.py
index 7b7dd126..89884095 100644
--- a/python/skytools/__init__.py
+++ b/python/skytools/__init__.py
@@ -9,6 +9,7 @@ from gzlog import *
 from scripting import *
 from sqltools import *
 from quoting import *
+from parsing import *
 
 __all__ = (psycopgwrapper.__all__
     + config.__all__
@@ -16,5 +17,6 @@ __all__ = (psycopgwrapper.__all__
     + gzlog.__all__
     + scripting.__all__
     + sqltools.__all__
+    + parsing.__all__
     + quoting.__all__
     )
diff --git a/python/skytools/_pyquoting.py b/python/skytools/_pyquoting.py
new file mode 100644
index 00000000..28a57577
--- /dev/null
+++ b/python/skytools/_pyquoting.py
@@ -0,0 +1,153 @@
+# _pyquoting.py
+
+"""Various helpers for string quoting/unquoting.
+
+Here is pure Python that should match C code in _cquoting.
+"""
+
+import urllib, re
+
+__all__ = [
+    "quote_literal", "quote_copy", "quote_bytea_raw",
+    "db_urlencode", "db_urldecode", "unescape",
+]
+
+#
+# SQL quoting
+#
+
+def quote_literal(s):
+    """Quote a literal value for SQL.
+
+    If string contains '\\', it is quoted and result is prefixed with E.
+    Input value of None results in string "null" without quotes.
+
+    Python implementation.
+    """
+
+    if s == None:
+        return "null"
+    s = str(s).replace("'", "''")
+    s2 = s.replace("\\", "\\\\")
+    if len(s) != len(s2):
+        return "E'" + s2 + "'"
+    return "'" + s2 + "'"
+
+def quote_copy(s):
+    """Quoting for copy command.  None is converted to \\N.
+
+    Python implementation.
+    """
+
+    if s == None:
+        return "\\N"
+    s = str(s)
+    s = s.replace("\\", "\\\\")
+    s = s.replace("\t", "\\t")
+    s = s.replace("\n", "\\n")
+    s = s.replace("\r", "\\r")
+    return s
+
+_bytea_map = None
+def quote_bytea_raw(s):
+    """Quoting for bytea parser.  Returns None as None.
+
+    Python implementation.
+    """
+    global _bytea_map
+    if s == None:
+        return None
+    if 1 and _bytea_map is None:
+        _bytea_map = {}
+        for i in xrange(256):
+            c = chr(i)
+            if i < 0x20 or i >= 0x7F:
+                _bytea_map[c] = "\\%03o" % i
+            elif c == "\\":
+                _bytea_map[c] = r"\\"
+            else:
+                _bytea_map[c] = c
+    return "".join([_bytea_map[c] for c in s])
+    # faster but does not match c code
+    #return s.replace("\\", "\\\\").replace("\0", "\\000")
+
+#
+# Database specific urlencode and urldecode.
+#
+
+def db_urlencode(dict):
+    """Database specific urlencode.
+
+    Encode None as key without '='.  That means that in "foo&bar=",
+    foo is NULL and bar is empty string.
+
+    Python implementation.
+    """
+
+    elem_list = []
+    for k, v in dict.items():
+        if v is None:
+            elem = urllib.quote_plus(str(k))
+        else:
+            elem = urllib.quote_plus(str(k)) + '=' + urllib.quote_plus(str(v))
+        elem_list.append(elem)
+    return '&'.join(elem_list)
+
+def db_urldecode(qs):
+    """Database specific urldecode.
+
+    Decode key without '=' as None.
+    This also does not support one key several times.
+
+    Python implementation.
+    """
+
+    res = {}
+    for elem in qs.split('&'):
+        if not elem:
+            continue
+        pair = elem.split('=', 1)
+        name = urllib.unquote_plus(pair[0])
+
+        # keep only one instance around
+        name = intern(str(name))
+
+        if len(pair) == 1:
+            res[name] = None
+        else:
+            res[name] = urllib.unquote_plus(pair[1])
+    return res
+
+#
+# Remove C-like backslash escapes
+#
+
+_esc_re = r"\\([0-7]{1,3}|.)"
+_esc_rc = re.compile(_esc_re)
+_esc_map = {
+    't': '\t',
+    'n': '\n',
+    'r': '\r',
+    'a': '\a',
+    'b': '\b',
+    "'": "'",
+    '"': '"',
+    '\\': '\\',
+}
+
+def _sub_unescape(m):
+    v = m.group(1)
+    if (len(v) == 1) and (v < '0' or v > '7'):
+        try:
+            return _esc_map[v]
+        except KeyError:
+            return v
+    else:
+        return chr(int(v, 8))
+
+def unescape(val):
+    """Removes C-style escapes from string.
+    Python implementation.
+    """
+    return _esc_rc.sub(_sub_unescape, val)
+
diff --git a/python/skytools/parsing.py b/python/skytools/parsing.py
new file mode 100644
index 00000000..1f4dd781
--- /dev/null
+++ b/python/skytools/parsing.py
@@ -0,0 +1,272 @@
+
+"""Various parsers for Postgres-specific data formats."""
+
+import re
+
+from skytools.quoting import unescape
+
+__all__ = ["parse_pgarray", "parse_logtriga_sql", "parse_tabbed_table", "parse_statements"]
+
+_rc_listelem = re.compile(r'( [^,"}]+ | ["] ( [^"\\]+ | [\\]. )* ["] )', re.X)
+
+# _parse_pgarray
+def parse_pgarray(array):
+    """ Parse Postgres array and return list of items inside it
+        Used to deserialize data recived from service layer parameters
+    """
+    if not array or array[0] != "{":
+        raise Exception("bad array format: must start with {")
+    res = []
+    pos = 1
+    while 1:
+        m = _rc_listelem.search(array, pos)
+        if not m:
+            break
+        pos2 = m.end()
+        item = array[pos:pos2]
+        if len(item) > 0 and item[0] == '"':
+            item = item[1:-1]
+        item = unescape(item)
+        res.append(item)
+
+        pos = pos2 + 1
+        if array[pos2] == "}":
+            break
+        elif array[pos2] != ",":
+            raise Exception("bad array format: expected ,} got " + array[pos2])
+    return res
+
+#
+# parse logtriga partial sql
+#
+
+class _logtriga_parser:
+    token_re = r"""
+        [ \t\r\n]*
+        ( [a-z][a-z0-9_]*
+        | ["] ( [^"\\]+ | \\. )* ["]
+        | ['] ( [^'\\]+ | \\. | [']['] )* [']
+        | [^ \t\r\n]
+        )"""
+    token_rc = None
+
+    def tokenizer(self, sql):
+        if not _logtriga_parser.token_rc:
+            _logtriga_parser.token_rc = re.compile(self.token_re, re.X | re.I)
+        rc = self.token_rc
+
+        pos = 0
+        while 1:
+            m = rc.match(sql, pos)
+            if not m:
+                break
+            pos = m.end()
+            yield m.group(1)
+
+    def unquote_data(self, fields, values):
+        # unquote data and column names
+        data = {}
+        for k, v in zip(fields, values):
+            if k[0] == '"':
+                k = unescape(k[1:-1])
+            if len(v) == 4 and v.lower() == "null":
+                v = None
+            elif v[0] == "'":
+                v = unescape(v[1:-1])
+            data[k] = v
+        return data
+
+    def parse_insert(self, tk, fields, values):
+        # (col1, col2) values ('data', null)
+        if tk.next() != "(":
+            raise Exception("syntax error")
+        while 1:
+            fields.append(tk.next())
+            t = tk.next()
+            if t == ")":
+                break
+            elif t != ",":
+                raise Exception("syntax error")
+        if tk.next().lower() != "values":
+            raise Exception("syntax error")
+        if tk.next() != "(":
+            raise Exception("syntax error")
+        while 1:
+            t = tk.next()
+            if t == ")":
+                break
+            if t == ",":
+                continue
+            values.append(t)
+        tk.next()
+
+    def parse_update(self, tk, fields, values):
+        # col1 = 'data1', col2 = null where pk1 = 'pk1' and pk2 = 'pk2'
+        while 1:
+            fields.append(tk.next())
+            if tk.next() != "=":
+                raise Exception("syntax error")
+            values.append(tk.next())
+
+            t = tk.next()
+            if t == ",":
+                continue
+            elif t.lower() == "where":
+                break
+            else:
+                raise Exception("syntax error")
+        while 1:
+            t = tk.next()
+            fields.append(t)
+            if tk.next() != "=":
+                raise Exception("syntax error")
+            values.append(tk.next())
+            t = tk.next()
+            if t.lower() != "and":
+                raise Exception("syntax error")
+
+    def parse_delete(self, tk, fields, values):
+        # pk1 = 'pk1' and pk2 = 'pk2'
+        while 1:
+            t = tk.next()
+            if t == "and":
+                continue
+            fields.append(t)
+            if tk.next() != "=":
+                raise Exception("syntax error")
+            values.append(tk.next())
+
+    def parse_sql(self, op, sql):
+        tk = self.tokenizer(sql)
+        fields = []
+        values = []
+        try:
+            if op == "I":
+                self.parse_insert(tk, fields, values)
+            elif op == "U":
+                self.parse_update(tk, fields, values)
+            elif op == "D":
+                self.parse_delete(tk, fields, values)
+            raise Exception("syntax error")
+        except StopIteration:
+            # last sanity check
+            if len(fields) == 0 or len(fields) != len(values):
+                raise Exception("syntax error")
+
+        return self.unquote_data(fields, values)
+
+def parse_logtriga_sql(op, sql):
+    """Parse partial SQL used by logtriga() back to data values.
+
+    Parser has following limitations:
+     - Expects standard_quoted_strings = off
+     - Does not support dollar quoting.
+     - Does not support complex expressions anywhere. (hashtext(col1) = hashtext(val1))
+     - WHERE expression must not contain IS (NOT) NULL
+     - Does not support updateing pk value.
+
+    Returns dict of col->data pairs.
+    """
+    return _logtriga_parser().parse_sql(op, sql)
+
+
+def parse_tabbed_table(txt):
+    """Parse a tab-separated table into list of dicts.
+
+    Expect first row to be column names.
+
+    Very primitive.
+    """
+
+    txt = txt.replace("\r\n", "\n")
+    fields = None
+    data = []
+    for ln in txt.split("\n"):
+        if not ln:
+            continue
+        if not fields:
+            fields = ln.split("\t")
+            continue
+        cols = ln.split("\t")
+        if len(cols) != len(fields):
+            continue
+        row = dict(zip(fields, cols))
+        data.append(row)
+    return data
+
+
+_sql_token_re = r"""
+    ( [a-z][a-z0-9_$]*
+    | ["] ( [^"\\]+ | \\. )* ["]
+    | ['] ( [^'\\]+ | \\. | [']['] )* [']
+    | [$] ([_a-z][_a-z0-9]*)? [$]
+    | (?P<ws> \s+ | [/][*] | [-][-][^\n]* )
+    | .
+    )"""
+_sql_token_rc = None
+_copy_from_stdin_re = "copy.*from\s+stdin"
+_copy_from_stdin_rc = None
+
+def _sql_tokenizer(sql):
+    global _sql_token_rc, _copy_from_stdin_rc
+    if not _sql_token_rc:
+        _sql_token_rc = re.compile(_sql_token_re, re.X | re.I)
+        _copy_from_stdin_rc = re.compile(_copy_from_stdin_re, re.X | re.I)
+    rc = _sql_token_rc
+
+    pos = 0
+    while 1:
+        m = rc.match(sql, pos)
+        if not m:
+            break
+        pos = m.end()
+        tok = m.group(1)
+        ws = m.start('ws') >= 0 # it tok empty?
+        if tok == "/*":
+            end = sql.find("*/", pos)
+            if end < 0:
+                raise Exception("unterminated c comment")
+            pos = end + 2
+            tok = sql[ m.start() : pos]
+        elif len(tok) > 1 and tok[0] == "$" and tok[-1] == "$":
+            end = sql.find(tok, pos)
+            if end < 0:
+                raise Exception("unterminated dollar string")
+            pos = end + len(tok)
+            tok = sql[ m.start() : pos]
+        yield (ws, tok)
+
+def parse_statements(sql):
+    """Parse multi-statement string into separate statements.
+
+    Returns list of statements.
+    """
+
+    tk = _sql_tokenizer(sql)
+    tokens = []
+    pcount = 0 # '(' level
+    while 1:
+        try:
+            ws, t = tk.next()
+        except StopIteration:
+            break
+        # skip whitespace and comments before statement
+        if len(tokens) == 0 and ws:
+            continue
+        # keep the rest
+        tokens.append(t)
+        if t == "(":
+            pcount += 1
+        elif t == ")":
+            pcount -= 1
+        elif t == ";" and pcount == 0:
+            sql = "".join(tokens)
+            if _copy_from_stdin_rc.match(sql):
+                raise Exception("copy from stdin not supported")
+            yield ("".join(tokens))
+            tokens = []
+    if len(tokens) > 0:
+        yield ("".join(tokens))
+    if pcount != 0:
+        raise Exception("syntax error - unbalanced parenthesis")
+
diff --git a/python/skytools/quoting.py b/python/skytools/quoting.py
index 594646a4..10d4626a 100644
--- a/python/skytools/quoting.py
+++ b/python/skytools/quoting.py
@@ -4,49 +4,23 @@
 
 import urllib, re
 
-from skytools.psycopgwrapper import QuotedString
-
 __all__ = [ "quote_literal", "quote_copy", "quote_bytea_raw",
+    "db_urlencode", "db_urldecode", "unescape",
+    "quote_bytea_literal", "quote_bytea_copy", "quote_statement",
-    "quote_ident", "quote_fqident", "quote_json",
-    "db_urlencode", "db_urldecode", "unescape", "unescape_copy"
+    "quote_ident", "quote_fqident", "quote_json", "unescape_copy"
 ]
 
+try:
+    from _cquoting import *
+except ImportError:
+    from _pyquoting import *
+
 #
 # SQL quoting
 #
 
-def quote_literal(s):
-    """Quote a literal value for SQL.
-
-    Surronds it with single-quotes.
-    """
-
-    if s == None:
-        return "null"
-    s = QuotedString(str(s))
-    return str(s)
-
-def quote_copy(s):
-    """Quoting for copy command."""
-
-    if s == None:
-        return "\\N"
-    s = str(s)
-    s = s.replace("\\", "\\\\")
-    s = s.replace("\t", "\\t")
-    s = s.replace("\n", "\\n")
-    s = s.replace("\r", "\\r")
-    return s
-
-def quote_bytea_raw(s):
-    """Quoting for bytea parser."""
-
-    if s == None:
-        return None
-    return s.replace("\\", "\\\\").replace("\0", "\\000")
-
 def quote_bytea_literal(s):
     """Quote bytea for regular SQL."""
@@ -125,214 +99,9 @@ def quote_json(s):
         return "null"
     return '"%s"' % _jsre.sub(_json_quote_char, s)
 
-#
-# Database specific urlencode and urldecode.
-#
-
-def db_urlencode(dict):
-    """Database specific urlencode.
-
-    Encode None as key without '='.  That means that in "foo&bar=",
-    foo is NULL and bar is empty string.
-    """
-
-    elem_list = []
-    for k, v in dict.items():
-        if v is None:
-            elem = urllib.quote_plus(str(k))
-        else:
-            elem = urllib.quote_plus(str(k)) + '=' + urllib.quote_plus(str(v))
-        elem_list.append(elem)
-    return '&'.join(elem_list)
-
-def db_urldecode(qs):
-    """Database specific urldecode.
-
-    Decode key without '=' as None.
-    This also does not support one key several times.
-    """
-
-    res = {}
-    for elem in qs.split('&'):
-        if not elem:
-            continue
-        pair = elem.split('=', 1)
-        name = urllib.unquote_plus(pair[0])
-
-        # keep only one instance around
-        name = intern(name)
-
-        if len(pair) == 1:
-            res[name] = None
-        else:
-            res[name] = urllib.unquote_plus(pair[1])
-    return res
-
-#
-# Remove C-like backslash escapes
-#
-
-_esc_re = r"\\([0-7][0-7][0-7]|.)"
-_esc_rc = re.compile(_esc_re)
-_esc_map = {
-    't': '\t',
-    'n': '\n',
-    'r': '\r',
-    'a': '\a',
-    'b': '\b',
-    "'": "'",
-    '"': '"',
-    '\\': '\\',
-}
-
-def _sub_unescape(m):
-    v = m.group(1)
-    if len(v) == 1:
-        return _esc_map[v]
-    else:
-        return chr(int(v, 8))
-
-def unescape(val):
-    """Removes C-style escapes from string."""
-    return _esc_rc.sub(_sub_unescape, val)
-
 def unescape_copy(val):
     """Removes C-style escapes, also converts "\N" to None."""
 
     if val == r"\N":
         return None
     return unescape(val)
-
-#
-# parse logtriga partial sql
-#
-
-class _logtriga_parser:
-    token_re = r"""
-        [ \t\r\n]*
-        ( [a-z][a-z0-9_]*
-        | ["] ( [^"\\]+ | \\. )* ["]
-        | ['] ( [^'\\]+ | \\. | [']['] )* [']
-        | [^ \t\r\n]
-        )"""
-    token_rc = None
-
-    def tokenizer(self, sql):
-        if not _logtriga_parser.token_rc:
-            _logtriga_parser.token_rc = re.compile(self.token_re, re.X | re.I)
-        rc = self.token_rc
-
-        pos = 0
-        while 1:
-            m = rc.match(sql, pos)
-            if not m:
-                break
-            pos = m.end()
-            yield m.group(1)
-
-    def unquote_data(self, fields, values):
-        # unquote data and column names
-        data = {}
-        for k, v in zip(fields, values):
-            if k[0] == '"':
-                k = unescape(k[1:-1])
-            if len(v) == 4 and v.lower() == "null":
-                v = None
-            elif v[0] == "'":
-                v = unescape(v[1:-1])
-            data[k] = v
-        return data
-
-    def parse_insert(self, tk, fields, values):
-        # (col1, col2) values ('data', null)
-        if tk.next() != "(":
-            raise Exception("syntax error")
-        while 1:
-            fields.append(tk.next())
-            t = tk.next()
-            if t == ")":
-                break
-            elif t != ",":
-                raise Exception("syntax error")
-        if tk.next().lower() != "values":
-            raise Exception("syntax error")
-        if tk.next() != "(":
-            raise Exception("syntax error")
-        while 1:
-            t = tk.next()
-            if t == ")":
-                break
-            if t == ",":
-                continue
-            values.append(t)
-        tk.next()
-
-    def parse_update(self, tk, fields, values):
-        # col1 = 'data1', col2 = null where pk1 = 'pk1' and pk2 = 'pk2'
-        while 1:
-            fields.append(tk.next())
-            if tk.next() != "=":
-                raise Exception("syntax error")
-            values.append(tk.next())
-
-            t = tk.next()
-            if t == ",":
-                continue
-            elif t.lower() == "where":
-                break
-            else:
-                raise Exception("syntax error")
-        while 1:
-            t = tk.next()
-            fields.append(t)
-            if tk.next() != "=":
-                raise Exception("syntax error")
-            values.append(tk.next())
-            t = tk.next()
-            if t.lower() != "and":
-                raise Exception("syntax error")
-
-    def parse_delete(self, tk, fields, values):
-        # pk1 = 'pk1' and pk2 = 'pk2'
-        while 1:
-            t = tk.next()
-            if t == "and":
-                continue
-            fields.append(t)
-            if tk.next() != "=":
-                raise Exception("syntax error")
-            values.append(tk.next())
-
-    def parse_sql(self, op, sql):
-        tk = self.tokenizer(sql)
-        fields = []
-        values = []
-        try:
-            if op == "I":
-                self.parse_insert(tk, fields, values)
-            elif op == "U":
-                self.parse_update(tk, fields, values)
-            elif op == "D":
-                self.parse_delete(tk, fields, values)
-            raise Exception("syntax error")
-        except StopIteration:
-            # last sanity check
-            if len(fields) == 0 or len(fields) != len(values):
-                raise Exception("syntax error")
-
-        return self.unquote_data(fields, values)
-
-def parse_logtriga_sql(op, sql):
-    """Parse partial SQL used by logtriga() back to data values.
-
-    Parser has following limitations:
-     - Expects standard_quoted_strings = off
-     - Does not support dollar quoting.
-     - Does not support complex expressions anywhere. (hashtext(col1) = hashtext(val1))
-     - WHERE expression must not contain IS (NOT) NULL
-     - Does not support updateing pk value.
-
-    Returns dict of col->data pairs.
-    """
-    return _logtriga_parser().parse_sql(op, sql)
-
```
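For reference, the behaviour described in the docstrings of the new quoting helpers can be exercised directly. A minimal sketch, assuming this skytools tree (and the psycopg dependency pulled in by the package `__init__`) is importable under Python 2; the sample values are illustrative only:

```python
# Sketch only (Python 2): exercising the new quoting helpers.
# Assumes this skytools tree and its psycopg dependency are importable.
import skytools

# quote_literal: None -> bare null, a backslash switches to the E'' form.
print skytools.quote_literal(None)       # null
print skytools.quote_literal("it's")     # 'it''s'
print skytools.quote_literal("a\\b")     # E'a\\b'

# quote_copy: escaping for COPY text format, None -> \N.
print skytools.quote_copy("a\tb")        # a\tb  (tab written as backslash-t)
print skytools.quote_copy(None)          # \N

# db_urlencode / db_urldecode: NULL round-trips as a key without '='.
row = {'id': '1', 'note': None, 'name': ''}
enc = skytools.db_urlencode(row)         # e.g. id=1&note&name= (key order may vary)
print skytools.db_urldecode(enc) == row  # True
```

With this commit quoting.py itself only re-exports these names: it tries the C implementation in `_cquoting` first and falls back to `_pyquoting`, so callers keep importing from `skytools.quoting` unchanged.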
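`parse_pgarray()` deserializes the text form of a Postgres array; double-quoted elements are stripped of their quotes and backslash-unescaped. A small sketch under the same assumptions:

```python
# Sketch only (Python 2): text-form Postgres arrays back to Python lists.
import skytools

print skytools.parse_pgarray('{1,2,3}')            # ['1', '2', '3']
print skytools.parse_pgarray('{a,"b c","q\\"q"}')  # ['a', 'b c', 'q"q']
```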
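`parse_logtriga_sql()` reverses the partial SQL produced by logtriga() into a column-to-value dict, within the limitations listed in its docstring (no dollar quoting, standard_quoted_strings = off). A sketch with made-up payloads:

```python
# Sketch only (Python 2): logtriga partial SQL back to column/value dicts.
# The I/U/D payloads below are made-up samples in the format logtriga emits.
import skytools

print skytools.parse_logtriga_sql("I", "(id, name) values ('1', 'foo')")
# {'id': '1', 'name': 'foo'}   (key order may vary)
print skytools.parse_logtriga_sql("U", "name = 'bar' where id = '1'")
# {'id': '1', 'name': 'bar'}
print skytools.parse_logtriga_sql("D", "id = '1'")
# {'id': '1'}
```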
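`parse_statements()` is a generator that splits a script on top-level semicolons, skipping leading whitespace and comments and refusing COPY ... FROM STDIN, while `parse_tabbed_table()` turns tab-separated output (first row = column names) into dicts. A sketch of both:

```python
# Sketch only (Python 2): statement splitting and tab-separated table parsing.
import skytools

script = """
-- schema setup
create table t (a int, b text);
insert into t values (1, 'x;y');
"""
for stmt in skytools.parse_statements(script):
    print repr(stmt)
# 'create table t (a int, b text);'
# "insert into t values (1, 'x;y');"

print skytools.parse_tabbed_table("id\tname\n1\tfoo\n2\tbar\n")
# [{'id': '1', 'name': 'foo'}, {'id': '2', 'name': 'bar'}]  (key order may vary)
```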
