summaryrefslogtreecommitdiff
path: root/python/skytools/parsing.py
blob: bdcd1e386edefeb4d7948c5a8e72e1fedc3f916d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310

"""Various parsers for Postgres-specific data formats."""

import re

from skytools.quoting import unescape, unquote_literal, unquote_ident
from skytools.sqltools import dbdict

__all__ = [
    "parse_pgarray", "parse_logtriga_sql", "parse_tabbed_table",
    "parse_statements", 'sql_tokenizer', 'parse_sqltriga_sql']

_rc_listelem = re.compile(r'( [^,"}]+ | ["] ( [^"\\]+ | [\\]. )* ["] )', re.X)

def parse_pgarray(array):
    r"""Parse Postgres array and return list of items inside it.

    Examples:
    >>> parse_pgarray('{}')
    []
    >>> parse_pgarray('{a,b,null,"null"}')
    ['a', 'b', None, 'null']
    >>> parse_pgarray(r'{"a,a","b\"b","c\\c"}')
    ['a,a', 'b"b', 'c\\c']
    >>> parse_pgarray("[0,3]={1,2,3}")
    ['1', '2', '3']
    """
    if not array or array[0] not in ("{", "[") or array[-1] != '}':
        raise Exception("bad array format: must be surrounded with {}")
    res = []
    pos = 1
    # skip optional dimensions descriptor "[a,b]={...}"
    if array[0] == "[":
        pos = array.find('{') + 1
        if pos < 1:
            raise Exception("bad array format: must be surrounded with {}")
    while 1:
        m = _rc_listelem.search(array, pos)
        if not m:
            break
        pos2 = m.end()
        item = array[pos:pos2]
        if len(item) == 4 and item.upper() == "NULL":
            val = None
        else:
            if len(item) > 0 and item[0] == '"':
                item = item[1:-1]
            val = unescape(item)
        res.append(val)

        pos = pos2 + 1
        if array[pos2] == "}":
            break
        elif array[pos2] != ",":
            raise Exception("bad array format: expected ,} got " + repr(array[pos2]))
    return res

#
# parse logtriga partial sql
#

class _logtriga_parser:
    """Parses logtriga/sqltriga partial SQL to values."""
    def tokenizer(self, sql):
        """Token generator."""
        for typ, tok in sql_tokenizer(sql, ignore_whitespace = True):
            yield tok

    def parse_insert(self, tk, fields, values):
        """Handler for inserts."""
        # (col1, col2) values ('data', null)
        if tk.next() != "(":
            raise Exception("syntax error")
        while 1:
            fields.append(tk.next())
            t = tk.next()
            if t == ")":
                break
            elif t != ",":
                raise Exception("syntax error")
        if tk.next().lower() != "values":
            raise Exception("syntax error, expected VALUES")
        if tk.next() != "(":
            raise Exception("syntax error, expected (")
        while 1:
            values.append(tk.next())
            t = tk.next()
            if t == ")":
                break
            if t == ",":
                continue
            raise Exception("expected , or ) got "+t)
        t = tk.next()
        raise Exception("expected EOF, got " + repr(t))

    def parse_update(self, tk, fields, values):
        """Handler for updates."""
        # col1 = 'data1', col2 = null where pk1 = 'pk1' and pk2 = 'pk2'
        while 1:
            fields.append(tk.next())
            if tk.next() != "=":
                raise Exception("syntax error")
            values.append(tk.next())
            
            t = tk.next()
            if t == ",":
                continue
            elif t.lower() == "where":
                break
            else:
                raise Exception("syntax error, expected WHERE or , got "+repr(t))
        while 1:
            fld = tk.next()
            fields.append(fld)
            self.pklist.append(fld)
            if tk.next() != "=":
                raise Exception("syntax error")
            values.append(tk.next())
            t = tk.next()
            if t.lower() != "and":
                raise Exception("syntax error, expected AND got "+repr(t))

    def parse_delete(self, tk, fields, values):
        """Handler for deletes."""
        # pk1 = 'pk1' and pk2 = 'pk2'
        while 1:
            fld = tk.next()
            fields.append(fld)
            self.pklist.append(fld)
            if tk.next() != "=":
                raise Exception("syntax error")
            values.append(tk.next())
            t = tk.next()
            if t.lower() != "and":
                raise Exception("syntax error, expected AND, got "+repr(t))

    def parse_sql(self, op, sql, pklist=None):
        """Main entry point."""
        if pklist is None:
            self.pklist = []
        else:
            self.pklist = pklist
        tk = self.tokenizer(sql)
        fields = []
        values = []
        try:
            if op == "I":
                self.parse_insert(tk, fields, values)
            elif op == "U":
                self.parse_update(tk, fields, values)
            elif op == "D":
                self.parse_delete(tk, fields, values)
            raise Exception("syntax error")
        except StopIteration:
            # last sanity check
            if len(fields) == 0 or len(fields) != len(values):
                raise Exception("syntax error, fields do not match values")
        fields = [unquote_ident(f) for f in fields]
        values = [unquote_literal(f) for f in values]
        return dbdict(zip(fields, values))

def parse_logtriga_sql(op, sql):
    return parse_sqltriga_sql(op, sql)

def parse_sqltriga_sql(op, sql, pklist=None):
    """Parse partial SQL used by pgq.sqltriga() back to data values.

    Parser has following limitations:
     - Expects standard_quoted_strings = off
     - Does not support dollar quoting.
     - Does not support complex expressions anywhere. (hashtext(col1) = hashtext(val1))
     - WHERE expression must not contain IS (NOT) NULL
     - Does not support updating pk value.

    Returns dict of col->data pairs.

    Insert event:
    >>> parse_logtriga_sql('I', '(id, data) values (1, null)')
    {'data': None, 'id': '1'}

    Update event:
    >>> parse_logtriga_sql('U', "data='foo' where id = 1")
    {'data': 'foo', 'id': '1'}

    Delete event:
    >>> parse_logtriga_sql('D', "id = 1 and id2 = 'str''val'")
    {'id2': "str'val", 'id': '1'}
    """
    return _logtriga_parser().parse_sql(op, sql, pklist)


def parse_tabbed_table(txt):
    r"""Parse a tab-separated table into list of dicts.
    
    Expect first row to be column names.

    Very primitive.

    Example:
    >>> parse_tabbed_table('col1\tcol2\nval1\tval2\n')
    [{'col2': 'val2', 'col1': 'val1'}]
    """

    txt = txt.replace("\r\n", "\n")
    fields = None
    data = []
    for ln in txt.split("\n"):
        if not ln:
            continue
        if not fields:
            fields = ln.split("\t")
            continue
        cols = ln.split("\t")
        if len(cols) != len(fields):
            continue
        row = dict(zip(fields, cols))
        data.append(row)
    return data


_extstr = r""" ['] (?: [^'\\]+ | \\. | [']['] )* ['] """
_stdstr = r""" ['] (?: [^']+ | [']['] )* ['] """
_base_sql = r"""
      (?P<ident>  [a-z][a-z0-9_$]* | ["] (?: [^"]+ | ["]["] )* ["] )
    | (?P<dolq>   (?P<dname> [$] (?: [_a-z][_a-z0-9]*)? [$] )
                  .*?
                  (?P=dname) )
    | (?P<num>    [0-9][0-9.e]* )
    | (?P<numarg> [$] [0-9]+ )
    | (?P<pyold>  [%][(] [a-z0-9_]+ [)][s] | [%][%] )
    | (?P<pynew>  [{] [^}]+ [}] | [{][{] | [}] [}] )
    | (?P<ws>     (?: \s+ | [/][*] .*? [*][/] | [-][-][^\n]* )+ )
    | (?P<error>  ['"$\\] )
    | (?P<sym>    . )"""
_std_sql = r"""(?: (?P<str> [E] %s | %s ) | %s )""" % (_extstr, _stdstr, _base_sql)
_ext_sql = r"""(?: (?P<str> [E]? %s ) | %s )""" % (_extstr, _base_sql)
_std_sql_rc = _ext_sql_rc = None

def sql_tokenizer(sql, standard_quoting = False, ignore_whitespace = False):
    r"""Parser SQL to tokens.

    Iterator, returns (toktype, tokstr) tuples.

    Example
    >>> [x for x in sql_tokenizer("select * from a.b", ignore_whitespace=True)]
    [('ident', 'select'), ('sym', '*'), ('ident', 'from'), ('ident', 'a'), ('sym', '.'), ('ident', 'b')]
    >>> [x for x in sql_tokenizer("\"c olumn\",'str''val'")]
    [('ident', '"c olumn"'), ('sym', ','), ('str', "'str''val'")]
    """
    global _std_sql_rc, _ext_sql_rc
    if not _std_sql_rc:
        _std_sql_rc = re.compile(_std_sql, re.X | re.I | re.S)
        _ext_sql_rc = re.compile(_ext_sql, re.X | re.I | re.S)

    if standard_quoting:
        rc = _std_sql_rc
    else:
        rc = _ext_sql_rc

    pos = 0
    while 1:
        m = rc.match(sql, pos)
        if not m:
            break
        pos = m.end()
        typ = m.lastgroup
        if not ignore_whitespace or typ != "ws":
            yield (m.lastgroup, m.group())

_copy_from_stdin_re = "copy.*from\s+stdin"
_copy_from_stdin_rc = None
def parse_statements(sql, standard_quoting = False):
    """Parse multi-statement string into separate statements.

    Returns list of statements.

    >>> [sql for sql in parse_statements("begin; select 1; select 'foo'; end;")]
    ['begin;', 'select 1;', "select 'foo';", 'end;']
    """

    global _copy_from_stdin_rc
    if not _copy_from_stdin_rc:
        _copy_from_stdin_rc = re.compile(_copy_from_stdin_re, re.X | re.I)
    tokens = []
    pcount = 0 # '(' level
    for typ, t in sql_tokenizer(sql, standard_quoting = standard_quoting):
        # skip whitespace and comments before statement
        if len(tokens) == 0 and typ == "ws":
            continue
        # keep the rest
        tokens.append(t)
        if t == "(":
            pcount += 1
        elif t == ")":
            pcount -= 1
        elif t == ";" and pcount == 0:
            sql = "".join(tokens)
            if _copy_from_stdin_rc.match(sql):
                raise Exception("copy from stdin not supported")
            yield ("".join(tokens))
            tokens = []
    if len(tokens) > 0:
        yield ("".join(tokens))
    if pcount != 0:
        raise Exception("syntax error - unbalanced parenthesis")

if __name__ == '__main__':
    import doctest
    doctest.testmod()