| author | Marko Kreen | 2008-02-28 09:27:25 +0000 |
|---|---|---|
| committer | Marko Kreen | 2008-02-28 09:27:25 +0000 |
| commit | 012e25634c81aaa5ad32084d4d588ec43fd5c838 (patch) | |
| tree | 5f9f8cdb66b37cc079fab43f7232c4b59cad3e07 /python/skytools | |
| parent | 8408b4ae8498b96695e4c730054beb092a2c7967 (diff) | |
bring new quoting & parsing code to head
Diffstat (limited to 'python/skytools')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | python/skytools/__init__.py | 2 |
| -rw-r--r-- | python/skytools/_pyquoting.py | 153 |
| -rw-r--r-- | python/skytools/parsing.py | 272 |
| -rw-r--r-- | python/skytools/quoting.py | 247 |
4 files changed, 435 insertions, 239 deletions
```diff
diff --git a/python/skytools/__init__.py b/python/skytools/__init__.py
index 7b7dd126..89884095 100644
--- a/python/skytools/__init__.py
+++ b/python/skytools/__init__.py
@@ -9,6 +9,7 @@ from gzlog import *
 from scripting import *
 from sqltools import *
 from quoting import *
+from parsing import *
 
 __all__ = (psycopgwrapper.__all__
     + config.__all__
@@ -16,5 +17,6 @@ __all__ = (psycopgwrapper.__all__
     + gzlog.__all__
     + scripting.__all__
     + sqltools.__all__
+    + parsing.__all__
     + quoting.__all__
     )
diff --git a/python/skytools/_pyquoting.py b/python/skytools/_pyquoting.py
new file mode 100644
index 00000000..28a57577
--- /dev/null
+++ b/python/skytools/_pyquoting.py
@@ -0,0 +1,153 @@
+# _pyquoting.py
+
+"""Various helpers for string quoting/unquoting.
+
+Here is pure Python that should match C code in _cquoting.
+"""
+
+import urllib, re
+
+__all__ = [
+    "quote_literal", "quote_copy", "quote_bytea_raw",
+    "db_urlencode", "db_urldecode", "unescape",
+]
+
+#
+# SQL quoting
+#
+
+def quote_literal(s):
+    """Quote a literal value for SQL.
+
+    If string contains '\\', it is quoted and result is prefixed with E.
+    Input value of None results in string "null" without quotes.
+
+    Python implementation.
+    """
+
+    if s == None:
+        return "null"
+    s = str(s).replace("'", "''")
+    s2 = s.replace("\\", "\\\\")
+    if len(s) != len(s2):
+        return "E'" + s2 + "'"
+    return "'" + s2 + "'"
+
+def quote_copy(s):
+    """Quoting for copy command.  None is converted to \\N.
+
+    Python implementation.
+    """
+
+    if s == None:
+        return "\\N"
+    s = str(s)
+    s = s.replace("\\", "\\\\")
+    s = s.replace("\t", "\\t")
+    s = s.replace("\n", "\\n")
+    s = s.replace("\r", "\\r")
+    return s
+
+_bytea_map = None
+def quote_bytea_raw(s):
+    """Quoting for bytea parser.  Returns None as None.
+
+    Python implementation.
+    """
+    global _bytea_map
+    if s == None:
+        return None
+    if 1 and _bytea_map is None:
+        _bytea_map = {}
+        for i in xrange(256):
+            c = chr(i)
+            if i < 0x20 or i >= 0x7F:
+                _bytea_map[c] = "\\%03o" % i
+            elif c == "\\":
+                _bytea_map[c] = r"\\"
+            else:
+                _bytea_map[c] = c
+    return "".join([_bytea_map[c] for c in s])
+    # faster but does not match c code
+    #return s.replace("\\", "\\\\").replace("\0", "\\000")
+
+#
+# Database specific urlencode and urldecode.
+#
+
+def db_urlencode(dict):
+    """Database specific urlencode.
+
+    Encode None as key without '='.  That means that in "foo&bar=",
+    foo is NULL and bar is empty string.
+
+    Python implementation.
+    """
+
+    elem_list = []
+    for k, v in dict.items():
+        if v is None:
+            elem = urllib.quote_plus(str(k))
+        else:
+            elem = urllib.quote_plus(str(k)) + '=' + urllib.quote_plus(str(v))
+        elem_list.append(elem)
+    return '&'.join(elem_list)
+
+def db_urldecode(qs):
+    """Database specific urldecode.
+
+    Decode key without '=' as None.
+    This also does not support one key several times.
+
+    Python implementation.
+    """
+
+    res = {}
+    for elem in qs.split('&'):
+        if not elem:
+            continue
+        pair = elem.split('=', 1)
+        name = urllib.unquote_plus(pair[0])
+
+        # keep only one instance around
+        name = intern(str(name))
+
+        if len(pair) == 1:
+            res[name] = None
+        else:
+            res[name] = urllib.unquote_plus(pair[1])
+    return res
+
+#
+# Remove C-like backslash escapes
+#
+
+_esc_re = r"\\([0-7]{1,3}|.)"
+_esc_rc = re.compile(_esc_re)
+_esc_map = {
+    't': '\t',
+    'n': '\n',
+    'r': '\r',
+    'a': '\a',
+    'b': '\b',
+    "'": "'",
+    '"': '"',
+    '\\': '\\',
+}
+
+def _sub_unescape(m):
+    v = m.group(1)
+    if (len(v) == 1) and (v < '0' or v > '7'):
+        try:
+            return _esc_map[v]
+        except KeyError:
+            return v
+    else:
+        return chr(int(v, 8))
+
+def unescape(val):
+    """Removes C-style escapes from string.
+    Python implementation.
+    """
+    return _esc_rc.sub(_sub_unescape, val)
+
diff --git a/python/skytools/parsing.py b/python/skytools/parsing.py
new file mode 100644
index 00000000..1f4dd781
--- /dev/null
+++ b/python/skytools/parsing.py
@@ -0,0 +1,272 @@
+
+"""Various parsers for Postgres-specific data formats."""
+
+import re
+
+from skytools.quoting import unescape
+
+__all__ = ["parse_pgarray", "parse_logtriga_sql", "parse_tabbed_table", "parse_statements"]
+
+_rc_listelem = re.compile(r'( [^,"}]+ | ["] ( [^"\\]+ | [\\]. )* ["] )', re.X)
+
+# _parse_pgarray
+def parse_pgarray(array):
+    """ Parse Postgres array and return list of items inside it
+        Used to deserialize data recived from service layer parameters
+    """
+    if not array or array[0] != "{":
+        raise Exception("bad array format: must start with {")
+    res = []
+    pos = 1
+    while 1:
+        m = _rc_listelem.search(array, pos)
+        if not m:
+            break
+        pos2 = m.end()
+        item = array[pos:pos2]
+        if len(item) > 0 and item[0] == '"':
+            item = item[1:-1]
+        item = unescape(item)
+        res.append(item)
+
+        pos = pos2 + 1
+        if array[pos2] == "}":
+            break
+        elif array[pos2] != ",":
+            raise Exception("bad array format: expected ,} got " + array[pos2])
+    return res
+
+#
+# parse logtriga partial sql
+#
+
+class _logtriga_parser:
+    token_re = r"""
+        [ \t\r\n]*
+        ( [a-z][a-z0-9_]*
+        | ["] ( [^"\\]+ | \\. )* ["]
+        | ['] ( [^'\\]+ | \\. | [']['] )* [']
+        | [^ \t\r\n]
+        )"""
+    token_rc = None
+
+    def tokenizer(self, sql):
+        if not _logtriga_parser.token_rc:
+            _logtriga_parser.token_rc = re.compile(self.token_re, re.X | re.I)
+        rc = self.token_rc
+
+        pos = 0
+        while 1:
+            m = rc.match(sql, pos)
+            if not m:
+                break
+            pos = m.end()
+            yield m.group(1)
+
+    def unquote_data(self, fields, values):
+        # unquote data and column names
+        data = {}
+        for k, v in zip(fields, values):
+            if k[0] == '"':
+                k = unescape(k[1:-1])
+            if len(v) == 4 and v.lower() == "null":
+                v = None
+            elif v[0] == "'":
+                v = unescape(v[1:-1])
+            data[k] = v
+        return data
+
+    def parse_insert(self, tk, fields, values):
+        # (col1, col2) values ('data', null)
+        if tk.next() != "(":
+            raise Exception("syntax error")
+        while 1:
+            fields.append(tk.next())
+            t = tk.next()
+            if t == ")":
+                break
+            elif t != ",":
+                raise Exception("syntax error")
+        if tk.next().lower() != "values":
+            raise Exception("syntax error")
+        if tk.next() != "(":
+            raise Exception("syntax error")
+        while 1:
+            t = tk.next()
+            if t == ")":
+                break
+            if t == ",":
+                continue
+            values.append(t)
+        tk.next()
+
+    def parse_update(self, tk, fields, values):
+        # col1 = 'data1', col2 = null where pk1 = 'pk1' and pk2 = 'pk2'
+        while 1:
+            fields.append(tk.next())
+            if tk.next() != "=":
+                raise Exception("syntax error")
+            values.append(tk.next())
+
+            t = tk.next()
+            if t == ",":
+                continue
+            elif t.lower() == "where":
+                break
+            else:
+                raise Exception("syntax error")
+        while 1:
+            t = tk.next()
+            fields.append(t)
+            if tk.next() != "=":
+                raise Exception("syntax error")
+            values.append(tk.next())
+            t = tk.next()
+            if t.lower() != "and":
+                raise Exception("syntax error")
+
+    def parse_delete(self, tk, fields, values):
+        # pk1 = 'pk1' and pk2 = 'pk2'
+        while 1:
+            t = tk.next()
+            if t == "and":
+                continue
+            fields.append(t)
+            if tk.next() != "=":
+                raise Exception("syntax error")
+            values.append(tk.next())
+
+    def parse_sql(self, op, sql):
+        tk = self.tokenizer(sql)
+        fields = []
+        values = []
+        try:
+            if op == "I":
+                self.parse_insert(tk, fields, values)
+            elif op == "U":
+                self.parse_update(tk, fields, values)
+            elif op == "D":
+                self.parse_delete(tk, fields, values)
+            raise Exception("syntax error")
+        except StopIteration:
+            # last sanity check
+            if len(fields) == 0 or len(fields) != len(values):
+                raise Exception("syntax error")
+
+        return self.unquote_data(fields, values)
+
+def parse_logtriga_sql(op, sql):
+    """Parse partial SQL used by logtriga() back to data values.
+
+    Parser has following limitations:
+     - Expects standard_quoted_strings = off
+     - Does not support dollar quoting.
+     - Does not support complex expressions anywhere. (hashtext(col1) = hashtext(val1))
+     - WHERE expression must not contain IS (NOT) NULL
+     - Does not support updateing pk value.
+
+    Returns dict of col->data pairs.
+    """
+    return _logtriga_parser().parse_sql(op, sql)
+
+
+def parse_tabbed_table(txt):
+    """Parse a tab-separated table into list of dicts.
+
+    Expect first row to be column names.
+
+    Very primitive.
+    """
+
+    txt = txt.replace("\r\n", "\n")
+    fields = None
+    data = []
+    for ln in txt.split("\n"):
+        if not ln:
+            continue
+        if not fields:
+            fields = ln.split("\t")
+            continue
+        cols = ln.split("\t")
+        if len(cols) != len(fields):
+            continue
+        row = dict(zip(fields, cols))
+        data.append(row)
+    return data
+
+
+_sql_token_re = r"""
+    ( [a-z][a-z0-9_$]*
+    | ["] ( [^"\\]+ | \\. )* ["]
+    | ['] ( [^'\\]+ | \\. | [']['] )* [']
+    | [$] ([_a-z][_a-z0-9]*)? [$]
+    | (?P<ws> \s+ | [/][*] | [-][-][^\n]* )
+    | .
+    )"""
+_sql_token_rc = None
+_copy_from_stdin_re = "copy.*from\s+stdin"
+_copy_from_stdin_rc = None
+
+def _sql_tokenizer(sql):
+    global _sql_token_rc, _copy_from_stdin_rc
+    if not _sql_token_rc:
+        _sql_token_rc = re.compile(_sql_token_re, re.X | re.I)
+        _copy_from_stdin_rc = re.compile(_copy_from_stdin_re, re.X | re.I)
+    rc = _sql_token_rc
+
+    pos = 0
+    while 1:
+        m = rc.match(sql, pos)
+        if not m:
+            break
+        pos = m.end()
+        tok = m.group(1)
+        ws = m.start('ws') >= 0 # it tok empty?
+        if tok == "/*":
+            end = sql.find("*/", pos)
+            if end < 0:
+                raise Exception("unterminated c comment")
+            pos = end + 2
+            tok = sql[ m.start() : pos]
+        elif len(tok) > 1 and tok[0] == "$" and tok[-1] == "$":
+            end = sql.find(tok, pos)
+            if end < 0:
+                raise Exception("unterminated dollar string")
+            pos = end + len(tok)
+            tok = sql[ m.start() : pos]
+        yield (ws, tok)
+
+def parse_statements(sql):
+    """Parse multi-statement string into separate statements.
+
+    Returns list of statements.
+    """
+
+    tk = _sql_tokenizer(sql)
+    tokens = []
+    pcount = 0 # '(' level
+    while 1:
+        try:
+            ws, t = tk.next()
+        except StopIteration:
+            break
+        # skip whitespace and comments before statement
+        if len(tokens) == 0 and ws:
+            continue
+        # keep the rest
+        tokens.append(t)
+        if t == "(":
+            pcount += 1
+        elif t == ")":
+            pcount -= 1
+        elif t == ";" and pcount == 0:
+            sql = "".join(tokens)
+            if _copy_from_stdin_rc.match(sql):
+                raise Exception("copy from stdin not supported")
+            yield ("".join(tokens))
+            tokens = []
+    if len(tokens) > 0:
+        yield ("".join(tokens))
+    if pcount != 0:
+        raise Exception("syntax error - unbalanced parenthesis")
+
diff --git a/python/skytools/quoting.py b/python/skytools/quoting.py
index 594646a4..10d4626a 100644
--- a/python/skytools/quoting.py
+++ b/python/skytools/quoting.py
@@ -4,49 +4,23 @@
 
 import urllib, re
 
-from skytools.psycopgwrapper import QuotedString
-
 __all__ = [ "quote_literal", "quote_copy", "quote_bytea_raw",
+    "db_urlencode", "db_urldecode", "unescape",
+    "quote_bytea_literal", "quote_bytea_copy", "quote_statement",
-    "quote_ident", "quote_fqident", "quote_json",
-    "db_urlencode", "db_urldecode", "unescape", "unescape_copy"
+    "quote_ident", "quote_fqident", "quote_json", "unescape_copy"
 ]
 
+try:
+    from _cquoting import *
+except ImportError:
+    from _pyquoting import *
+
 #
 # SQL quoting
 #
 
-def quote_literal(s):
-    """Quote a literal value for SQL.
-
-    Surronds it with single-quotes.
-    """
-
-    if s == None:
-        return "null"
-    s = QuotedString(str(s))
-    return str(s)
-
-def quote_copy(s):
-    """Quoting for copy command."""
-
-    if s == None:
-        return "\\N"
-    s = str(s)
-    s = s.replace("\\", "\\\\")
-    s = s.replace("\t", "\\t")
-    s = s.replace("\n", "\\n")
-    s = s.replace("\r", "\\r")
-    return s
-
-def quote_bytea_raw(s):
-    """Quoting for bytea parser."""
-
-    if s == None:
-        return None
-    return s.replace("\\", "\\\\").replace("\0", "\\000")
-
 def quote_bytea_literal(s):
     """Quote bytea for regular SQL."""
@@ -125,214 +99,9 @@ def quote_json(s):
         return "null"
     return '"%s"' % _jsre.sub(_json_quote_char, s)
 
-#
-# Database specific urlencode and urldecode.
-#
-
-def db_urlencode(dict):
-    """Database specific urlencode.
-
-    Encode None as key without '='.  That means that in "foo&bar=",
-    foo is NULL and bar is empty string.
-    """
-
-    elem_list = []
-    for k, v in dict.items():
-        if v is None:
-            elem = urllib.quote_plus(str(k))
-        else:
-            elem = urllib.quote_plus(str(k)) + '=' + urllib.quote_plus(str(v))
-        elem_list.append(elem)
-    return '&'.join(elem_list)
-
-def db_urldecode(qs):
-    """Database specific urldecode.
-
-    Decode key without '=' as None.
-    This also does not support one key several times.
-    """
-
-    res = {}
-    for elem in qs.split('&'):
-        if not elem:
-            continue
-        pair = elem.split('=', 1)
-        name = urllib.unquote_plus(pair[0])
-
-        # keep only one instance around
-        name = intern(name)
-
-        if len(pair) == 1:
-            res[name] = None
-        else:
-            res[name] = urllib.unquote_plus(pair[1])
-    return res
-
-#
-# Remove C-like backslash escapes
-#
-
-_esc_re = r"\\([0-7][0-7][0-7]|.)"
-_esc_rc = re.compile(_esc_re)
-_esc_map = {
-    't': '\t',
-    'n': '\n',
-    'r': '\r',
-    'a': '\a',
-    'b': '\b',
-    "'": "'",
-    '"': '"',
-    '\\': '\\',
-}
-
-def _sub_unescape(m):
-    v = m.group(1)
-    if len(v) == 1:
-        return _esc_map[v]
-    else:
-        return chr(int(v, 8))
-
-def unescape(val):
-    """Removes C-style escapes from string."""
-    return _esc_rc.sub(_sub_unescape, val)
-
 def unescape_copy(val):
     """Removes C-style escapes, also converts "\N" to None."""
 
     if val == r"\N":
         return None
     return unescape(val)
-
-#
-# parse logtriga partial sql
-#
-
-class _logtriga_parser:
-    token_re = r"""
-        [ \t\r\n]*
-        ( [a-z][a-z0-9_]*
-        | ["] ( [^"\\]+ | \\. )* ["]
-        | ['] ( [^'\\]+ | \\. | [']['] )* [']
-        | [^ \t\r\n]
-        )"""
-    token_rc = None
-
-    def tokenizer(self, sql):
-        if not _logtriga_parser.token_rc:
-            _logtriga_parser.token_rc = re.compile(self.token_re, re.X | re.I)
-        rc = self.token_rc
-
-        pos = 0
-        while 1:
-            m = rc.match(sql, pos)
-            if not m:
-                break
-            pos = m.end()
-            yield m.group(1)
-
-    def unquote_data(self, fields, values):
-        # unquote data and column names
-        data = {}
-        for k, v in zip(fields, values):
-            if k[0] == '"':
-                k = unescape(k[1:-1])
-            if len(v) == 4 and v.lower() == "null":
-                v = None
-            elif v[0] == "'":
-                v = unescape(v[1:-1])
-            data[k] = v
-        return data
-
-    def parse_insert(self, tk, fields, values):
-        # (col1, col2) values ('data', null)
-        if tk.next() != "(":
-            raise Exception("syntax error")
-        while 1:
-            fields.append(tk.next())
-            t = tk.next()
-            if t == ")":
-                break
-            elif t != ",":
-                raise Exception("syntax error")
-        if tk.next().lower() != "values":
-            raise Exception("syntax error")
-        if tk.next() != "(":
-            raise Exception("syntax error")
-        while 1:
-            t = tk.next()
-            if t == ")":
-                break
-            if t == ",":
-                continue
-            values.append(t)
-        tk.next()
-
-    def parse_update(self, tk, fields, values):
-        # col1 = 'data1', col2 = null where pk1 = 'pk1' and pk2 = 'pk2'
-        while 1:
-            fields.append(tk.next())
-            if tk.next() != "=":
-                raise Exception("syntax error")
-            values.append(tk.next())
-
-            t = tk.next()
-            if t == ",":
-                continue
-            elif t.lower() == "where":
-                break
-            else:
-                raise Exception("syntax error")
-        while 1:
-            t = tk.next()
-            fields.append(t)
-            if tk.next() != "=":
-                raise Exception("syntax error")
-            values.append(tk.next())
-            t = tk.next()
-            if t.lower() != "and":
-                raise Exception("syntax error")
-
-    def parse_delete(self, tk, fields, values):
-        # pk1 = 'pk1' and pk2 = 'pk2'
-        while 1:
-            t = tk.next()
-            if t == "and":
-                continue
-            fields.append(t)
-            if tk.next() != "=":
-                raise Exception("syntax error")
-            values.append(tk.next())
-
-    def parse_sql(self, op, sql):
-        tk = self.tokenizer(sql)
-        fields = []
-        values = []
-        try:
-            if op == "I":
-                self.parse_insert(tk, fields, values)
-            elif op == "U":
-                self.parse_update(tk, fields, values)
-            elif op == "D":
-                self.parse_delete(tk, fields, values)
-            raise Exception("syntax error")
-        except StopIteration:
-            # last sanity check
-            if len(fields) == 0 or len(fields) != len(values):
-                raise Exception("syntax error")
-
-        return self.unquote_data(fields, values)
-
-def parse_logtriga_sql(op, sql):
-    """Parse partial SQL used by logtriga() back to data values.
-
-    Parser has following limitations:
-     - Expects standard_quoted_strings = off
-     - Does not support dollar quoting.
-     - Does not support complex expressions anywhere. (hashtext(col1) = hashtext(val1))
-     - WHERE expression must not contain IS (NOT) NULL
-     - Does not support updateing pk value.
-
-    Returns dict of col->data pairs.
-    """
-    return _logtriga_parser().parse_sql(op, sql)
-
```
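For reference, the behaviour described in the docstrings of the new quoting helpers can be exercised directly. A minimal sketch, assuming this skytools tree (and the psycopg dependency pulled in by the package `__init__`) is importable under Python 2; the sample values are illustrative only:

```python
# Sketch only (Python 2): exercising the new quoting helpers.
# Assumes this skytools tree and its psycopg dependency are importable.
import skytools

# quote_literal: None -> bare null, a backslash switches to the E'' form.
print skytools.quote_literal(None)       # null
print skytools.quote_literal("it's")     # 'it''s'
print skytools.quote_literal("a\\b")     # E'a\\b'

# quote_copy: escaping for COPY text format, None -> \N.
print skytools.quote_copy("a\tb")        # a\tb  (tab written as backslash-t)
print skytools.quote_copy(None)          # \N

# db_urlencode / db_urldecode: NULL round-trips as a key without '='.
row = {'id': '1', 'note': None, 'name': ''}
enc = skytools.db_urlencode(row)         # e.g. id=1&note&name= (key order may vary)
print skytools.db_urldecode(enc) == row  # True
```

With this commit quoting.py itself only re-exports these names: it tries the C implementation in `_cquoting` first and falls back to `_pyquoting`, so callers keep importing from `skytools.quoting` unchanged.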
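`parse_pgarray()` deserializes the text form of a Postgres array; double-quoted elements are stripped of their quotes and backslash-unescaped. A small sketch under the same assumptions:

```python
# Sketch only (Python 2): text-form Postgres arrays back to Python lists.
import skytools

print skytools.parse_pgarray('{1,2,3}')            # ['1', '2', '3']
print skytools.parse_pgarray('{a,"b c","q\\"q"}')  # ['a', 'b c', 'q"q']
```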
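`parse_logtriga_sql()` reverses the partial SQL produced by logtriga() into a column-to-value dict, within the limitations listed in its docstring (no dollar quoting, standard_quoted_strings = off). A sketch with made-up payloads:

```python
# Sketch only (Python 2): logtriga partial SQL back to column/value dicts.
# The I/U/D payloads below are made-up samples in the format logtriga emits.
import skytools

print skytools.parse_logtriga_sql("I", "(id, name) values ('1', 'foo')")
# {'id': '1', 'name': 'foo'}   (key order may vary)
print skytools.parse_logtriga_sql("U", "name = 'bar' where id = '1'")
# {'id': '1', 'name': 'bar'}
print skytools.parse_logtriga_sql("D", "id = '1'")
# {'id': '1'}
```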
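`parse_statements()` is a generator that splits a script on top-level semicolons, skipping leading whitespace and comments and refusing COPY ... FROM STDIN, while `parse_tabbed_table()` turns tab-separated output (first row = column names) into dicts. A sketch of both:

```python
# Sketch only (Python 2): statement splitting and tab-separated table parsing.
import skytools

script = """
-- schema setup
create table t (a int, b text);
insert into t values (1, 'x;y');
"""
for stmt in skytools.parse_statements(script):
    print repr(stmt)
# 'create table t (a int, b text);'
# "insert into t values (1, 'x;y');"

print skytools.parse_tabbed_table("id\tname\n1\tfoo\n2\tbar\n")
# [{'id': '1', 'name': 'foo'}, {'id': '2', 'name': 'bar'}]  (key order may vary)
```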
