path: root/python/skytools
author    Marko Kreen    2008-02-28 09:27:25 +0000
committer Marko Kreen    2008-02-28 09:27:25 +0000
commit    012e25634c81aaa5ad32084d4d588ec43fd5c838 (patch)
tree      5f9f8cdb66b37cc079fab43f7232c4b59cad3e07 /python/skytools
parent    8408b4ae8498b96695e4c730054beb092a2c7967 (diff)
bring new quoting & parsing code to head
Diffstat (limited to 'python/skytools')
-rw-r--r--  python/skytools/__init__.py      2
-rw-r--r--  python/skytools/_pyquoting.py  153
-rw-r--r--  python/skytools/parsing.py     272
-rw-r--r--  python/skytools/quoting.py     247
4 files changed, 435 insertions, 239 deletions
diff --git a/python/skytools/__init__.py b/python/skytools/__init__.py
index 7b7dd126..89884095 100644
--- a/python/skytools/__init__.py
+++ b/python/skytools/__init__.py
@@ -9,6 +9,7 @@ from gzlog import *
from scripting import *
from sqltools import *
from quoting import *
+from parsing import *
__all__ = (psycopgwrapper.__all__
+ config.__all__
@@ -16,5 +17,6 @@ __all__ = (psycopgwrapper.__all__
+ gzlog.__all__
+ scripting.__all__
+ sqltools.__all__
+ + parsing.__all__
+ quoting.__all__ )
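With parsing re-exported from the package __init__, the new parsers become reachable directly off the skytools namespace. A minimal usage sketch (assuming the package is installed and importable as skytools; results follow from the code later in this patch):

    import skytools

    # names re-exported via parsing.__all__ and quoting.__all__
    skytools.parse_pgarray('{a,b,"c d"}')   # -> ['a', 'b', 'c d']
    skytools.quote_literal("it's")          # -> 'it''s'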
diff --git a/python/skytools/_pyquoting.py b/python/skytools/_pyquoting.py
new file mode 100644
index 00000000..28a57577
--- /dev/null
+++ b/python/skytools/_pyquoting.py
@@ -0,0 +1,153 @@
+# _pyquoting.py
+
+"""Various helpers for string quoting/unquoting.
+
+These are pure-Python implementations that should match the C code in _cquoting.
+"""
+
+import urllib, re
+
+__all__ = [
+ "quote_literal", "quote_copy", "quote_bytea_raw",
+ "db_urlencode", "db_urldecode", "unescape",
+]
+
+#
+# SQL quoting
+#
+
+def quote_literal(s):
+ """Quote a literal value for SQL.
+
+ If the string contains backslashes, the result is prefixed with E.
+ An input value of None results in the string "null" without quotes.
+
+ Python implementation.
+ """
+
+ if s == None:
+ return "null"
+ s = str(s).replace("'", "''")
+ s2 = s.replace("\\", "\\\\")
+ if len(s) != len(s2):
+ return "E'" + s2 + "'"
+ return "'" + s2 + "'"
+
+def quote_copy(s):
+ """Quoting for copy command. None is converted to \\N.
+
+ Python implementation.
+ """
+
+ if s == None:
+ return "\\N"
+ s = str(s)
+ s = s.replace("\\", "\\\\")
+ s = s.replace("\t", "\\t")
+ s = s.replace("\n", "\\n")
+ s = s.replace("\r", "\\r")
+ return s
+
+_bytea_map = None
+def quote_bytea_raw(s):
+ """Quoting for bytea parser. Returns None as None.
+
+ Python implementation.
+ """
+ global _bytea_map
+ if s == None:
+ return None
+ if 1 and _bytea_map is None:
+ _bytea_map = {}
+ for i in xrange(256):
+ c = chr(i)
+ if i < 0x20 or i >= 0x7F:
+ _bytea_map[c] = "\\%03o" % i
+ elif c == "\\":
+ _bytea_map[c] = r"\\"
+ else:
+ _bytea_map[c] = c
+ return "".join([_bytea_map[c] for c in s])
+ # faster but does not match c code
+ #return s.replace("\\", "\\\\").replace("\0", "\\000")
+
+#
+# Database specific urlencode and urldecode.
+#
+
+def db_urlencode(dict):
+ """Database specific urlencode.
+
+ Encode None as key without '='. That means that in "foo&bar=",
+ foo is NULL and bar is empty string.
+
+ Python implementation.
+ """
+
+ elem_list = []
+ for k, v in dict.items():
+ if v is None:
+ elem = urllib.quote_plus(str(k))
+ else:
+ elem = urllib.quote_plus(str(k)) + '=' + urllib.quote_plus(str(v))
+ elem_list.append(elem)
+ return '&'.join(elem_list)
+
+def db_urldecode(qs):
+ """Database specific urldecode.
+
+ Decode a key without '=' as None.
+ Repeated keys are not supported; the last value wins.
+
+ Python implementation.
+ """
+
+ res = {}
+ for elem in qs.split('&'):
+ if not elem:
+ continue
+ pair = elem.split('=', 1)
+ name = urllib.unquote_plus(pair[0])
+
+ # keep only one instance around
+ name = intern(str(name))
+
+ if len(pair) == 1:
+ res[name] = None
+ else:
+ res[name] = urllib.unquote_plus(pair[1])
+ return res
+
+#
+# Remove C-like backslash escapes
+#
+
+_esc_re = r"\\([0-7]{1,3}|.)"
+_esc_rc = re.compile(_esc_re)
+_esc_map = {
+ 't': '\t',
+ 'n': '\n',
+ 'r': '\r',
+ 'a': '\a',
+ 'b': '\b',
+ "'": "'",
+ '"': '"',
+ '\\': '\\',
+}
+
+def _sub_unescape(m):
+ v = m.group(1)
+ if (len(v) == 1) and (v < '0' or v > '7'):
+ try:
+ return _esc_map[v]
+ except KeyError:
+ return v
+ else:
+ return chr(int(v, 8))
+
+def unescape(val):
+ """Removes C-style escapes from string.
+ Python implementation.
+ """
+ return _esc_rc.sub(_sub_unescape, val)
+
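A short sketch of what the pure-Python helpers above are expected to return (assuming the module is importable as skytools._pyquoting; the commented results follow from the code in this file, and dict key order in db_urlencode is not guaranteed):

    from skytools._pyquoting import *

    quote_literal(None)              # -> null  (no quotes)
    quote_literal("O'Reilly")        # -> 'O''Reilly'
    quote_literal("a\\b")            # backslash present -> E'a\\b' (E-prefixed, backslash doubled)
    quote_copy("a\tb")               # -> "a\\tb" (tab escaped as backslash-t; None gives \N)
    db_urlencode({'foo': None, 'bar': ''})   # -> "foo&bar=" (key order may vary)
    db_urldecode("foo&bar=")         # -> {'foo': None, 'bar': ''}
    unescape(r"a\tb\101")            # -> "a<TAB>bA" (octal 101 is 'A')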
diff --git a/python/skytools/parsing.py b/python/skytools/parsing.py
new file mode 100644
index 00000000..1f4dd781
--- /dev/null
+++ b/python/skytools/parsing.py
@@ -0,0 +1,272 @@
+
+"""Various parsers for Postgres-specific data formats."""
+
+import re
+
+from skytools.quoting import unescape
+
+__all__ = ["parse_pgarray", "parse_logtriga_sql", "parse_tabbed_table", "parse_statements"]
+
+_rc_listelem = re.compile(r'( [^,"}]+ | ["] ( [^"\\]+ | [\\]. )* ["] )', re.X)
+
+# _parse_pgarray
+def parse_pgarray(array):
+ """ Parse Postgres array and return list of items inside it
+ Used to deserialize data recived from service layer parameters
+ """
+ if not array or array[0] != "{":
+ raise Exception("bad array format: must start with {")
+ res = []
+ pos = 1
+ while 1:
+ m = _rc_listelem.search(array, pos)
+ if not m:
+ break
+ pos2 = m.end()
+ item = array[pos:pos2]
+ if len(item) > 0 and item[0] == '"':
+ item = item[1:-1]
+ item = unescape(item)
+ res.append(item)
+
+ pos = pos2 + 1
+ if array[pos2] == "}":
+ break
+ elif array[pos2] != ",":
+ raise Exception("bad array format: expected ,} got " + array[pos2])
+ return res
+
+#
+# parse logtriga partial sql
+#
+
+class _logtriga_parser:
+ token_re = r"""
+ [ \t\r\n]*
+ ( [a-z][a-z0-9_]*
+ | ["] ( [^"\\]+ | \\. )* ["]
+ | ['] ( [^'\\]+ | \\. | [']['] )* [']
+ | [^ \t\r\n]
+ )"""
+ token_rc = None
+
+ def tokenizer(self, sql):
+ if not _logtriga_parser.token_rc:
+ _logtriga_parser.token_rc = re.compile(self.token_re, re.X | re.I)
+ rc = self.token_rc
+
+ pos = 0
+ while 1:
+ m = rc.match(sql, pos)
+ if not m:
+ break
+ pos = m.end()
+ yield m.group(1)
+
+ def unquote_data(self, fields, values):
+ # unquote data and column names
+ data = {}
+ for k, v in zip(fields, values):
+ if k[0] == '"':
+ k = unescape(k[1:-1])
+ if len(v) == 4 and v.lower() == "null":
+ v = None
+ elif v[0] == "'":
+ v = unescape(v[1:-1])
+ data[k] = v
+ return data
+
+ def parse_insert(self, tk, fields, values):
+ # (col1, col2) values ('data', null)
+ if tk.next() != "(":
+ raise Exception("syntax error")
+ while 1:
+ fields.append(tk.next())
+ t = tk.next()
+ if t == ")":
+ break
+ elif t != ",":
+ raise Exception("syntax error")
+ if tk.next().lower() != "values":
+ raise Exception("syntax error")
+ if tk.next() != "(":
+ raise Exception("syntax error")
+ while 1:
+ t = tk.next()
+ if t == ")":
+ break
+ if t == ",":
+ continue
+ values.append(t)
+ tk.next()
+
+ def parse_update(self, tk, fields, values):
+ # col1 = 'data1', col2 = null where pk1 = 'pk1' and pk2 = 'pk2'
+ while 1:
+ fields.append(tk.next())
+ if tk.next() != "=":
+ raise Exception("syntax error")
+ values.append(tk.next())
+
+ t = tk.next()
+ if t == ",":
+ continue
+ elif t.lower() == "where":
+ break
+ else:
+ raise Exception("syntax error")
+ while 1:
+ t = tk.next()
+ fields.append(t)
+ if tk.next() != "=":
+ raise Exception("syntax error")
+ values.append(tk.next())
+ t = tk.next()
+ if t.lower() != "and":
+ raise Exception("syntax error")
+
+ def parse_delete(self, tk, fields, values):
+ # pk1 = 'pk1' and pk2 = 'pk2'
+ while 1:
+ t = tk.next()
+ if t == "and":
+ continue
+ fields.append(t)
+ if tk.next() != "=":
+ raise Exception("syntax error")
+ values.append(tk.next())
+
+ def parse_sql(self, op, sql):
+ tk = self.tokenizer(sql)
+ fields = []
+ values = []
+ try:
+ if op == "I":
+ self.parse_insert(tk, fields, values)
+ elif op == "U":
+ self.parse_update(tk, fields, values)
+ elif op == "D":
+ self.parse_delete(tk, fields, values)
+ raise Exception("syntax error")
+ except StopIteration:
+ # last sanity check
+ if len(fields) == 0 or len(fields) != len(values):
+ raise Exception("syntax error")
+
+ return self.unquote_data(fields, values)
+
+def parse_logtriga_sql(op, sql):
+ """Parse partial SQL used by logtriga() back to data values.
+
+ The parser has the following limitations:
+ - Expects standard_conforming_strings = off.
+ - Does not support dollar quoting.
+ - Does not support complex expressions anywhere (e.g. hashtext(col1) = hashtext(val1)).
+ - The WHERE expression must not contain IS (NOT) NULL.
+ - Does not support updating pk values.
+
+ Returns dict of col->data pairs.
+ """
+ return _logtriga_parser().parse_sql(op, sql)
+
+
+def parse_tabbed_table(txt):
+ """Parse a tab-separated table into list of dicts.
+
+ Expects the first row to be column names.
+
+ Very primitive.
+ """
+
+ txt = txt.replace("\r\n", "\n")
+ fields = None
+ data = []
+ for ln in txt.split("\n"):
+ if not ln:
+ continue
+ if not fields:
+ fields = ln.split("\t")
+ continue
+ cols = ln.split("\t")
+ if len(cols) != len(fields):
+ continue
+ row = dict(zip(fields, cols))
+ data.append(row)
+ return data
+
+
+_sql_token_re = r"""
+ ( [a-z][a-z0-9_$]*
+ | ["] ( [^"\\]+ | \\. )* ["]
+ | ['] ( [^'\\]+ | \\. | [']['] )* [']
+ | [$] ([_a-z][_a-z0-9]*)? [$]
+ | (?P<ws> \s+ | [/][*] | [-][-][^\n]* )
+ | .
+ )"""
+_sql_token_rc = None
+_copy_from_stdin_re = r"copy.*from\s+stdin"
+_copy_from_stdin_rc = None
+
+def _sql_tokenizer(sql):
+ global _sql_token_rc, _copy_from_stdin_rc
+ if not _sql_token_rc:
+ _sql_token_rc = re.compile(_sql_token_re, re.X | re.I)
+ _copy_from_stdin_rc = re.compile(_copy_from_stdin_re, re.X | re.I)
+ rc = _sql_token_rc
+
+ pos = 0
+ while 1:
+ m = rc.match(sql, pos)
+ if not m:
+ break
+ pos = m.end()
+ tok = m.group(1)
+ ws = m.start('ws') >= 0 # was the whitespace/comment alternative matched?
+ if tok == "/*":
+ end = sql.find("*/", pos)
+ if end < 0:
+ raise Exception("unterminated c comment")
+ pos = end + 2
+ tok = sql[ m.start() : pos]
+ elif len(tok) > 1 and tok[0] == "$" and tok[-1] == "$":
+ end = sql.find(tok, pos)
+ if end < 0:
+ raise Exception("unterminated dollar string")
+ pos = end + len(tok)
+ tok = sql[ m.start() : pos]
+ yield (ws, tok)
+
+def parse_statements(sql):
+ """Parse multi-statement string into separate statements.
+
+ Returns list of statements.
+ """
+
+ tk = _sql_tokenizer(sql)
+ tokens = []
+ pcount = 0 # '(' level
+ while 1:
+ try:
+ ws, t = tk.next()
+ except StopIteration:
+ break
+ # skip whitespace and comments before statement
+ if len(tokens) == 0 and ws:
+ continue
+ # keep the rest
+ tokens.append(t)
+ if t == "(":
+ pcount += 1
+ elif t == ")":
+ pcount -= 1
+ elif t == ";" and pcount == 0:
+ sql = "".join(tokens)
+ if _copy_from_stdin_rc.match(sql):
+ raise Exception("copy from stdin not supported")
+ yield ("".join(tokens))
+ tokens = []
+ if len(tokens) > 0:
+ yield ("".join(tokens))
+ if pcount != 0:
+ raise Exception("syntax error - unbalanced parenthesis")
+
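Taken together, the new parsers are meant to be used roughly like this; a sketch based on the code above (assuming the module is importable as skytools.parsing):

    from skytools.parsing import *

    parse_pgarray('{a,b,"c,d"}')
    # -> ['a', 'b', 'c,d']

    parse_logtriga_sql('I', "(id, txt) values ('1', null)")
    # -> {'id': '1', 'txt': None}

    parse_tabbed_table("id\tname\n1\tfoo\n2\tbar\n")
    # -> [{'id': '1', 'name': 'foo'}, {'id': '2', 'name': 'bar'}]

    list(parse_statements("select 1; select 'a;b';"))
    # -> ['select 1;', "select 'a;b';"]  (the ';' inside the string literal does not split)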
diff --git a/python/skytools/quoting.py b/python/skytools/quoting.py
index 594646a4..10d4626a 100644
--- a/python/skytools/quoting.py
+++ b/python/skytools/quoting.py
@@ -4,49 +4,23 @@
import urllib, re
-from skytools.psycopgwrapper import QuotedString
-
__all__ = [
"quote_literal", "quote_copy", "quote_bytea_raw",
+ "db_urlencode", "db_urldecode", "unescape",
+
"quote_bytea_literal", "quote_bytea_copy", "quote_statement",
- "quote_ident", "quote_fqident", "quote_json",
- "db_urlencode", "db_urldecode", "unescape", "unescape_copy"
+ "quote_ident", "quote_fqident", "quote_json", "unescape_copy"
]
+try:
+ from _cquoting import *
+except ImportError:
+ from _pyquoting import *
+
#
# SQL quoting
#
-def quote_literal(s):
- """Quote a literal value for SQL.
-
- Surronds it with single-quotes.
- """
-
- if s == None:
- return "null"
- s = QuotedString(str(s))
- return str(s)
-
-def quote_copy(s):
- """Quoting for copy command."""
-
- if s == None:
- return "\\N"
- s = str(s)
- s = s.replace("\\", "\\\\")
- s = s.replace("\t", "\\t")
- s = s.replace("\n", "\\n")
- s = s.replace("\r", "\\r")
- return s
-
-def quote_bytea_raw(s):
- """Quoting for bytea parser."""
-
- if s == None:
- return None
- return s.replace("\\", "\\\\").replace("\0", "\\000")
-
def quote_bytea_literal(s):
"""Quote bytea for regular SQL."""
@@ -125,214 +99,9 @@ def quote_json(s):
return "null"
return '"%s"' % _jsre.sub(_json_quote_char, s)
-#
-# Database specific urlencode and urldecode.
-#
-
-def db_urlencode(dict):
- """Database specific urlencode.
-
- Encode None as key without '='. That means that in "foo&bar=",
- foo is NULL and bar is empty string.
- """
-
- elem_list = []
- for k, v in dict.items():
- if v is None:
- elem = urllib.quote_plus(str(k))
- else:
- elem = urllib.quote_plus(str(k)) + '=' + urllib.quote_plus(str(v))
- elem_list.append(elem)
- return '&'.join(elem_list)
-
-def db_urldecode(qs):
- """Database specific urldecode.
-
- Decode key without '=' as None.
- This also does not support one key several times.
- """
-
- res = {}
- for elem in qs.split('&'):
- if not elem:
- continue
- pair = elem.split('=', 1)
- name = urllib.unquote_plus(pair[0])
-
- # keep only one instance around
- name = intern(name)
-
- if len(pair) == 1:
- res[name] = None
- else:
- res[name] = urllib.unquote_plus(pair[1])
- return res
-
-#
-# Remove C-like backslash escapes
-#
-
-_esc_re = r"\\([0-7][0-7][0-7]|.)"
-_esc_rc = re.compile(_esc_re)
-_esc_map = {
- 't': '\t',
- 'n': '\n',
- 'r': '\r',
- 'a': '\a',
- 'b': '\b',
- "'": "'",
- '"': '"',
- '\\': '\\',
-}
-
-def _sub_unescape(m):
- v = m.group(1)
- if len(v) == 1:
- return _esc_map[v]
- else:
- return chr(int(v, 8))
-
-def unescape(val):
- """Removes C-style escapes from string."""
- return _esc_rc.sub(_sub_unescape, val)
-
def unescape_copy(val):
"""Removes C-style escapes, also converts "\N" to None."""
if val == r"\N":
return None
return unescape(val)
-
-#
-# parse logtriga partial sql
-#
-
-class _logtriga_parser:
- token_re = r"""
- [ \t\r\n]*
- ( [a-z][a-z0-9_]*
- | ["] ( [^"\\]+ | \\. )* ["]
- | ['] ( [^'\\]+ | \\. | [']['] )* [']
- | [^ \t\r\n]
- )"""
- token_rc = None
-
- def tokenizer(self, sql):
- if not _logtriga_parser.token_rc:
- _logtriga_parser.token_rc = re.compile(self.token_re, re.X | re.I)
- rc = self.token_rc
-
- pos = 0
- while 1:
- m = rc.match(sql, pos)
- if not m:
- break
- pos = m.end()
- yield m.group(1)
-
- def unquote_data(self, fields, values):
- # unquote data and column names
- data = {}
- for k, v in zip(fields, values):
- if k[0] == '"':
- k = unescape(k[1:-1])
- if len(v) == 4 and v.lower() == "null":
- v = None
- elif v[0] == "'":
- v = unescape(v[1:-1])
- data[k] = v
- return data
-
- def parse_insert(self, tk, fields, values):
- # (col1, col2) values ('data', null)
- if tk.next() != "(":
- raise Exception("syntax error")
- while 1:
- fields.append(tk.next())
- t = tk.next()
- if t == ")":
- break
- elif t != ",":
- raise Exception("syntax error")
- if tk.next().lower() != "values":
- raise Exception("syntax error")
- if tk.next() != "(":
- raise Exception("syntax error")
- while 1:
- t = tk.next()
- if t == ")":
- break
- if t == ",":
- continue
- values.append(t)
- tk.next()
-
- def parse_update(self, tk, fields, values):
- # col1 = 'data1', col2 = null where pk1 = 'pk1' and pk2 = 'pk2'
- while 1:
- fields.append(tk.next())
- if tk.next() != "=":
- raise Exception("syntax error")
- values.append(tk.next())
-
- t = tk.next()
- if t == ",":
- continue
- elif t.lower() == "where":
- break
- else:
- raise Exception("syntax error")
- while 1:
- t = tk.next()
- fields.append(t)
- if tk.next() != "=":
- raise Exception("syntax error")
- values.append(tk.next())
- t = tk.next()
- if t.lower() != "and":
- raise Exception("syntax error")
-
- def parse_delete(self, tk, fields, values):
- # pk1 = 'pk1' and pk2 = 'pk2'
- while 1:
- t = tk.next()
- if t == "and":
- continue
- fields.append(t)
- if tk.next() != "=":
- raise Exception("syntax error")
- values.append(tk.next())
-
- def parse_sql(self, op, sql):
- tk = self.tokenizer(sql)
- fields = []
- values = []
- try:
- if op == "I":
- self.parse_insert(tk, fields, values)
- elif op == "U":
- self.parse_update(tk, fields, values)
- elif op == "D":
- self.parse_delete(tk, fields, values)
- raise Exception("syntax error")
- except StopIteration:
- # last sanity check
- if len(fields) == 0 or len(fields) != len(values):
- raise Exception("syntax error")
-
- return self.unquote_data(fields, values)
-
-def parse_logtriga_sql(op, sql):
- """Parse partial SQL used by logtriga() back to data values.
-
- Parser has following limitations:
- - Expects standard_quoted_strings = off
- - Does not support dollar quoting.
- - Does not support complex expressions anywhere. (hashtext(col1) = hashtext(val1))
- - WHERE expression must not contain IS (NOT) NULL
- - Does not support updateing pk value.
-
- Returns dict of col->data pairs.
- """
- return _logtriga_parser().parse_sql(op, sql)
-
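Note on the quoting.py change: the module now tries the C implementations first and falls back to the pure-Python ones, so the public API stays the same whether or not the extension is built. A rough way to see which variant is in use (a sketch, assuming the C module is built as skytools._cquoting):

    try:
        import skytools._cquoting          # the compiled fast path
        print "using C quoting functions"
    except ImportError:
        print "using pure-Python quoting functions"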