diff options
author | Marko Kreen | 2008-02-28 09:27:25 +0000 |
---|---|---|
committer | Marko Kreen | 2008-02-28 09:27:25 +0000 |
commit | 012e25634c81aaa5ad32084d4d588ec43fd5c838 (patch) | |
tree | 5f9f8cdb66b37cc079fab43f7232c4b59cad3e07 /python/skytools/parsing.py | |
parent | 8408b4ae8498b96695e4c730054beb092a2c7967 (diff) |
bring new quoting & parsing code to head
Diffstat (limited to 'python/skytools/parsing.py')
-rw-r--r-- | python/skytools/parsing.py | 272 |
1 files changed, 272 insertions, 0 deletions
# Contents of python/skytools/parsing.py, reconstructed from the
# whitespace-collapsed "new file" diff.

"""Various parsers for Postgres-specific data formats."""

import re

from skytools.quoting import unescape

__all__ = ["parse_pgarray", "parse_logtriga_sql", "parse_tabbed_table", "parse_statements"]

# One array element: either a bare run of characters up to the next
# separator, or a double-quoted string with backslash escapes.
_rc_listelem = re.compile(r'( [^,"}]+ | ["] ( [^"\\]+ | [\\]. )* ["] )', re.X)


def parse_pgarray(array):
    """Parse a Postgres array literal and return the list of items in it.

    Used to deserialize data received from service layer parameters.

    Double-quoted items have their quotes stripped; every item is then
    passed through ``unescape()``.  An empty array (``{}``) gives ``[]``.
    Anything after the closing ``}`` is ignored.

    NOTE(review): an unquoted NULL element is returned as the string
    ``"NULL"``, not ``None`` - confirm that is what callers expect.

    Raises Exception when the text does not start with ``{``, when an
    element is followed by something other than ``,`` or ``}``, when an
    element position holds unparseable data, or when the closing ``}``
    is missing.
    """
    if not array or array[0] != "{":
        raise Exception("bad array format: must start with {")
    res = []
    pos = 1
    while True:
        # Anchored match: a plain search() would skip over stray
        # separators and silently glue them onto the next item.
        m = _rc_listelem.match(array, pos)
        if not m:
            # No element here: acceptable only at the closing brace
            # (empty array or trailing comma).
            if array[pos:pos + 1] == "}":
                break
            raise Exception("bad array format: unexpected data at position %d" % pos)
        pos2 = m.end()
        item = m.group(1)
        if item[0] == '"':
            item = item[1:-1]
        res.append(unescape(item))

        if pos2 >= len(array):
            # Ran off the end without seeing the closing brace.
            raise Exception("bad array format: missing }")
        sep = array[pos2]
        if sep == "}":
            break
        if sep != ",":
            raise Exception("bad array format: expected ,} got " + sep)
        pos = pos2 + 1
    return res


#
# parse logtriga partial sql
#

class _logtriga_parser:
    """Parser for the partial SQL fragments handled by parse_logtriga_sql().

    The parse_* methods consume tokens from a tokenizer generator and
    append to the caller-supplied ``fields`` / ``values`` lists.  They
    signal "end of input reached" by letting StopIteration from the
    tokenizer propagate; parse_sql() relies on that.
    """

    # Token: optional leading whitespace, then an identifier, a
    # double-quoted name, a single-quoted string, or any single
    # non-whitespace character.
    token_re = r"""
        [ \t\r\n]*
        ( [a-z][a-z0-9_]*
        | ["] ( [^"\\]+ | \\. )* ["]
        | ['] ( [^'\\]+ | \\. | [']['] )* [']
        | [^ \t\r\n]
        )"""
    # Compiled lazily on first use and shared through the class.
    token_rc = None

    def tokenizer(self, sql):
        """Yield the tokens of ``sql`` one by one, whitespace dropped."""
        if not _logtriga_parser.token_rc:
            _logtriga_parser.token_rc = re.compile(self.token_re, re.X | re.I)
        rc = self.token_rc

        pos = 0
        while True:
            m = rc.match(sql, pos)
            if not m:
                break
            pos = m.end()
            yield m.group(1)

    def unquote_data(self, fields, values):
        """Unquote column names and values, return them as a dict.

        A bare ``null`` (any case) becomes None; single-quoted values
        and double-quoted names are stripped of quotes and unescaped.
        """
        data = {}
        for k, v in zip(fields, values):
            if k[0] == '"':
                k = unescape(k[1:-1])
            if len(v) == 4 and v.lower() == "null":
                v = None
            elif v[0] == "'":
                v = unescape(v[1:-1])
            data[k] = v
        return data

    def parse_insert(self, tk, fields, values):
        """Parse: (col1, col2) values ('data', null)"""
        if next(tk) != "(":
            raise Exception("syntax error")
        while True:
            fields.append(next(tk))
            t = next(tk)
            if t == ")":
                break
            elif t != ",":
                raise Exception("syntax error")
        if next(tk).lower() != "values":
            raise Exception("syntax error")
        if next(tk) != "(":
            raise Exception("syntax error")
        while True:
            t = next(tk)
            if t == ")":
                break
            if t == ",":
                continue
            values.append(t)
        # Input must end here: this raises StopIteration on success.
        # If a token is returned instead, parse_sql() reports an error.
        next(tk)

    def parse_update(self, tk, fields, values):
        """Parse: col1 = 'data1', col2 = null where pk1 = 'pk1' and pk2 = 'pk2'"""
        # SET part
        while True:
            fields.append(next(tk))
            if next(tk) != "=":
                raise Exception("syntax error")
            values.append(next(tk))

            t = next(tk)
            if t == ",":
                continue
            elif t.lower() == "where":
                break
            else:
                raise Exception("syntax error")
        # WHERE part; the loop is left only via StopIteration or an error.
        while True:
            t = next(tk)
            fields.append(t)
            if next(tk) != "=":
                raise Exception("syntax error")
            values.append(next(tk))
            t = next(tk)
            if t.lower() != "and":
                raise Exception("syntax error")

    def parse_delete(self, tk, fields, values):
        """Parse: pk1 = 'pk1' and pk2 = 'pk2'"""
        # The loop is left only via StopIteration or an error.
        while True:
            t = next(tk)
            # Case-insensitive, to agree with parse_update().
            if t.lower() == "and":
                continue
            fields.append(t)
            if next(tk) != "=":
                raise Exception("syntax error")
            values.append(next(tk))

    def parse_sql(self, op, sql):
        """Parse ``sql`` for operation ``op`` ("I", "U" or "D").

        Returns dict of col->data pairs; raises Exception("syntax error")
        on an unknown op or malformed input.
        """
        tk = self.tokenizer(sql)
        fields = []
        values = []
        try:
            if op == "I":
                self.parse_insert(tk, fields, values)
            elif op == "U":
                self.parse_update(tk, fields, values)
            elif op == "D":
                self.parse_delete(tk, fields, values)
            # A successful parse always ends with StopIteration, so
            # getting here means an unknown op or trailing tokens.
            raise Exception("syntax error")
        except StopIteration:
            pass

        # last sanity check
        if len(fields) == 0 or len(fields) != len(values):
            raise Exception("syntax error")

        return self.unquote_data(fields, values)


def parse_logtriga_sql(op, sql):
    """Parse partial SQL used by logtriga() back to data values.

    Parser has following limitations:
    - Expects standard_conforming_strings = off
    - Does not support dollar quoting.
    - Does not support complex expressions anywhere. (hashtext(col1) = hashtext(val1))
    - WHERE expression must not contain IS (NOT) NULL
    - Does not support updating pk value.

    Returns dict of col->data pairs.
    """
    return _logtriga_parser().parse_sql(op, sql)


def parse_tabbed_table(txt):
    """Parse a tab-separated table into list of dicts.

    Expect first row to be column names.  Empty lines are skipped, and
    so are rows whose column count differs from the header's.

    Very primitive.
    """

    txt = txt.replace("\r\n", "\n")
    fields = None
    data = []
    for ln in txt.split("\n"):
        if not ln:
            continue
        if not fields:
            fields = ln.split("\t")
            continue
        cols = ln.split("\t")
        if len(cols) != len(fields):
            continue
        data.append(dict(zip(fields, cols)))
    return data


# SQL token: identifier, quoted name, quoted string, dollar-quote tag,
# whitespace / comment start (named group "ws"), or any single character.
_sql_token_re = r"""
    ( [a-z][a-z0-9_$]*
    | ["] ( [^"\\]+ | \\. )* ["]
    | ['] ( [^'\\]+ | \\. | [']['] )* [']
    | [$] ([_a-z][_a-z0-9]*)? [$]
    | (?P<ws> \s+ | [/][*] | [-][-][^\n]* )
    | .
    )"""
_sql_token_rc = re.compile(_sql_token_re, re.X | re.I)
# re.S lets ".*" cross newlines, so a COPY statement that is split
# over several lines is detected as well.
_copy_from_stdin_re = r"copy.*from\s+stdin"
_copy_from_stdin_rc = re.compile(_copy_from_stdin_re, re.X | re.I | re.S)


def _sql_tokenizer(sql):
    """Yield ``(ws, tok)`` pairs covering ``sql``.

    ``ws`` is true when the token is whitespace or a comment.  C-style
    comments and dollar-quoted strings are returned whole, as a single
    token.  Raises Exception when either of them is unterminated.
    """
    rc = _sql_token_rc

    pos = 0
    while True:
        m = rc.match(sql, pos)
        if not m:
            break
        pos = m.end()
        tok = m.group(1)
        ws = m.start('ws') >= 0  # is tok whitespace or a comment?
        if tok == "/*":
            # The regex matched only the comment opener; take
            # everything up to the first "*/".
            end = sql.find("*/", pos)
            if end < 0:
                raise Exception("unterminated c comment")
            pos = end + 2
            tok = sql[m.start():pos]
        elif len(tok) > 1 and tok[0] == "$" and tok[-1] == "$":
            # Dollar-quote opener: the string runs to the same tag.
            end = sql.find(tok, pos)
            if end < 0:
                raise Exception("unterminated dollar string")
            pos = end + len(tok)
            tok = sql[m.start():pos]
        yield (ws, tok)


def parse_statements(sql):
    """Parse multi-statement string into separate statements.

    This is a generator: it yields the statements one by one, split on
    ";" found outside of parentheses.  Whitespace and comments before a
    statement are dropped.

    Raises Exception for "COPY ... FROM STDIN" statements and, once the
    input is exhausted, for unbalanced parentheses.
    """

    tokens = []
    pcount = 0  # '(' level
    for ws, t in _sql_tokenizer(sql):
        # skip whitespace and comments before statement
        if len(tokens) == 0 and ws:
            continue
        # keep the rest
        tokens.append(t)
        if t == "(":
            pcount += 1
        elif t == ")":
            pcount -= 1
        elif t == ";" and pcount == 0:
            stmt = "".join(tokens)
            if _copy_from_stdin_rc.match(stmt):
                raise Exception("copy from stdin not supported")
            yield stmt
            tokens = []
    if len(tokens) > 0:
        yield "".join(tokens)
    if pcount != 0:
        raise Exception("syntax error - unbalanced parenthesis")