summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Noah Misch, 2017-11-11 19:10:53 +0000
committer: Noah Misch, 2017-11-11 19:11:24 +0000
commit: 2f4061aff504049767602927e69c85b5b4621273 (patch)
tree: 3eac87beb471b081274635258b8cf8a6c29f8296 /src
parent: 0bcdab58e83b2ac2caf4bd78e1583c11df97d4f1 (diff)
Ignore XML declaration in xpath_internal(), for UTF8 databases.

When a value contained an XML declaration naming some other encoding, this function interpreted UTF8 bytes as the named encoding, yielding mojibake. xml_parse() already has similar logic. This would be necessary but not sufficient for non-UTF8 databases, so preserve behavior there until the xpath facility can support such databases comprehensively. Back-patch to 9.3 (all supported versions).

Pavel Stehule and Noah Misch

Discussion: https://postgr.es/m/CAFj8pRC-dM=tT=QkGi+Achkm+gwPmjyOayGuUfXVumCxkDgYWg@mail.gmail.com

Diffstat (limited to 'src')
-rw-r--r--  src/backend/utils/adt/xml.c  14
-rw-r--r--  src/test/regress/expected/xml.out  31
-rw-r--r--  src/test/regress/expected/xml_1.out  35
-rw-r--r--  src/test/regress/expected/xml_2.out  31
-rw-r--r--  src/test/regress/sql/xml.sql  32
5 files changed, 142 insertions, 1 deletions
diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c
index 05bec034dd1..9d3b189c940 100644
--- a/src/backend/utils/adt/xml.c
+++ b/src/backend/utils/adt/xml.c
@@ -3781,6 +3781,7 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces,
int32 xpath_len;
xmlChar *string;
xmlChar *xpath_expr;
+ size_t xmldecl_len = 0;
int i;
int ndim;
Datum *ns_names_uris;
@@ -3841,6 +3842,16 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces,
memcpy(xpath_expr, VARDATA(xpath_expr_text), xpath_len);
xpath_expr[xpath_len] = '\0';
+ /*
+ * In a UTF8 database, skip any xml declaration, which might assert
+ * another encoding. Ignore parse_xml_decl() failure, letting
+ * xmlCtxtReadMemory() report parse errors. Documentation disclaims
+ * xpath() support for non-ASCII data in non-UTF8 databases, so leave
+ * those scenarios bug-compatible with historical behavior.
+ */
+ if (GetDatabaseEncoding() == PG_UTF8)
+ parse_xml_decl(string, &xmldecl_len, NULL, NULL, NULL);
+
xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL);
PG_TRY();
@@ -3855,7 +3866,8 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces,
if (ctxt == NULL || xmlerrcxt->err_occurred)
xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY,
"could not allocate parser context");
- doc = xmlCtxtReadMemory(ctxt, (char *) string, len, NULL, NULL, 0);
+ doc = xmlCtxtReadMemory(ctxt, (char *) string + xmldecl_len,
+ len - xmldecl_len, NULL, NULL, 0);
if (doc == NULL || xmlerrcxt->err_occurred)
xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT,
"could not parse XML document");
diff --git a/src/test/regress/expected/xml.out b/src/test/regress/expected/xml.out
index 9b7b393c85f..39d94eaf561 100644
--- a/src/test/regress/expected/xml.out
+++ b/src/test/regress/expected/xml.out
@@ -653,6 +653,37 @@ SELECT xpath('/nosuchtag', '<root/>');
{}
(1 row)
+-- Round-trip non-ASCII data through xpath().
+DO $$
+DECLARE
+ xml_declaration text := '<?xml version="1.0" encoding="ISO-8859-1"?>';
+ degree_symbol text;
+ res xml[];
+BEGIN
+ -- Per the documentation, xpath() doesn't work on non-ASCII data when
+ -- the server encoding is not UTF8. The EXCEPTION block below,
+ -- currently dead code, will be relevant if we remove this limitation.
+ IF current_setting('server_encoding') <> 'UTF8' THEN
+ RAISE LOG 'skip: encoding % unsupported for xml',
+ current_setting('server_encoding');
+ RETURN;
+ END IF;
+
+ degree_symbol := convert_from('\xc2b0', 'UTF8');
+ res := xpath('text()', (xml_declaration ||
+ '<x>' || degree_symbol || '</x>')::xml);
+ IF degree_symbol <> res[1]::text THEN
+ RAISE 'expected % (%), got % (%)',
+ degree_symbol, convert_to(degree_symbol, 'UTF8'),
+ res[1], convert_to(res[1]::text, 'UTF8');
+ END IF;
+EXCEPTION
+ -- character with byte sequence 0xc2 0xb0 in encoding "UTF8" has no equivalent in encoding "LATIN8"
+ WHEN untranslatable_character THEN RAISE LOG 'skip: %', SQLERRM;
+ -- default conversion function for encoding "UTF8" to "MULE_INTERNAL" does not exist
+ WHEN undefined_function THEN RAISE LOG 'skip: %', SQLERRM;
+END
+$$;
-- Test xmlexists and xpath_exists
SELECT xmlexists('//town[text() = ''Toronto'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>');
xmlexists
diff --git a/src/test/regress/expected/xml_1.out b/src/test/regress/expected/xml_1.out
index 97382499da6..ab136376c28 100644
--- a/src/test/regress/expected/xml_1.out
+++ b/src/test/regress/expected/xml_1.out
@@ -564,6 +564,41 @@ LINE 1: SELECT xpath('/nosuchtag', '<root/>');
^
DETAIL: This functionality requires the server to be built with libxml support.
HINT: You need to rebuild PostgreSQL using --with-libxml.
+-- Round-trip non-ASCII data through xpath().
+DO $$
+DECLARE
+ xml_declaration text := '<?xml version="1.0" encoding="ISO-8859-1"?>';
+ degree_symbol text;
+ res xml[];
+BEGIN
+ -- Per the documentation, xpath() doesn't work on non-ASCII data when
+ -- the server encoding is not UTF8. The EXCEPTION block below,
+ -- currently dead code, will be relevant if we remove this limitation.
+ IF current_setting('server_encoding') <> 'UTF8' THEN
+ RAISE LOG 'skip: encoding % unsupported for xml',
+ current_setting('server_encoding');
+ RETURN;
+ END IF;
+
+ degree_symbol := convert_from('\xc2b0', 'UTF8');
+ res := xpath('text()', (xml_declaration ||
+ '<x>' || degree_symbol || '</x>')::xml);
+ IF degree_symbol <> res[1]::text THEN
+ RAISE 'expected % (%), got % (%)',
+ degree_symbol, convert_to(degree_symbol, 'UTF8'),
+ res[1], convert_to(res[1]::text, 'UTF8');
+ END IF;
+EXCEPTION
+ -- character with byte sequence 0xc2 0xb0 in encoding "UTF8" has no equivalent in encoding "LATIN8"
+ WHEN untranslatable_character THEN RAISE LOG 'skip: %', SQLERRM;
+ -- default conversion function for encoding "UTF8" to "MULE_INTERNAL" does not exist
+ WHEN undefined_function THEN RAISE LOG 'skip: %', SQLERRM;
+END
+$$;
+ERROR: unsupported XML feature
+DETAIL: This functionality requires the server to be built with libxml support.
+HINT: You need to rebuild PostgreSQL using --with-libxml.
+CONTEXT: PL/pgSQL function inline_code_block line 17 at assignment
-- Test xmlexists and xpath_exists
SELECT xmlexists('//town[text() = ''Toronto'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>');
ERROR: unsupported XML feature
diff --git a/src/test/regress/expected/xml_2.out b/src/test/regress/expected/xml_2.out
index 72f0be3557d..eb1fd92b3e1 100644
--- a/src/test/regress/expected/xml_2.out
+++ b/src/test/regress/expected/xml_2.out
@@ -635,6 +635,37 @@ SELECT xpath('/nosuchtag', '<root/>');
{}
(1 row)
+-- Round-trip non-ASCII data through xpath().
+DO $$
+DECLARE
+ xml_declaration text := '<?xml version="1.0" encoding="ISO-8859-1"?>';
+ degree_symbol text;
+ res xml[];
+BEGIN
+ -- Per the documentation, xpath() doesn't work on non-ASCII data when
+ -- the server encoding is not UTF8. The EXCEPTION block below,
+ -- currently dead code, will be relevant if we remove this limitation.
+ IF current_setting('server_encoding') <> 'UTF8' THEN
+ RAISE LOG 'skip: encoding % unsupported for xml',
+ current_setting('server_encoding');
+ RETURN;
+ END IF;
+
+ degree_symbol := convert_from('\xc2b0', 'UTF8');
+ res := xpath('text()', (xml_declaration ||
+ '<x>' || degree_symbol || '</x>')::xml);
+ IF degree_symbol <> res[1]::text THEN
+ RAISE 'expected % (%), got % (%)',
+ degree_symbol, convert_to(degree_symbol, 'UTF8'),
+ res[1], convert_to(res[1]::text, 'UTF8');
+ END IF;
+EXCEPTION
+ -- character with byte sequence 0xc2 0xb0 in encoding "UTF8" has no equivalent in encoding "LATIN8"
+ WHEN untranslatable_character THEN RAISE LOG 'skip: %', SQLERRM;
+ -- default conversion function for encoding "UTF8" to "MULE_INTERNAL" does not exist
+ WHEN undefined_function THEN RAISE LOG 'skip: %', SQLERRM;
+END
+$$;
-- Test xmlexists and xpath_exists
SELECT xmlexists('//town[text() = ''Toronto'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>');
xmlexists
diff --git a/src/test/regress/sql/xml.sql b/src/test/regress/sql/xml.sql
index ce87d426842..62a4736275f 100644
--- a/src/test/regress/sql/xml.sql
+++ b/src/test/regress/sql/xml.sql
@@ -186,6 +186,38 @@ SELECT xpath('count(//*)=3', '<root><sub/><sub/></root>');
SELECT xpath('name(/*)', '<root><sub/><sub/></root>');
SELECT xpath('/nosuchtag', '<root/>');
+-- Round-trip non-ASCII data through xpath().
+DO $$
+DECLARE
+ xml_declaration text := '<?xml version="1.0" encoding="ISO-8859-1"?>';
+ degree_symbol text;
+ res xml[];
+BEGIN
+ -- Per the documentation, xpath() doesn't work on non-ASCII data when
+ -- the server encoding is not UTF8. The EXCEPTION block below,
+ -- currently dead code, will be relevant if we remove this limitation.
+ IF current_setting('server_encoding') <> 'UTF8' THEN
+ RAISE LOG 'skip: encoding % unsupported for xml',
+ current_setting('server_encoding');
+ RETURN;
+ END IF;
+
+ degree_symbol := convert_from('\xc2b0', 'UTF8');
+ res := xpath('text()', (xml_declaration ||
+ '<x>' || degree_symbol || '</x>')::xml);
+ IF degree_symbol <> res[1]::text THEN
+ RAISE 'expected % (%), got % (%)',
+ degree_symbol, convert_to(degree_symbol, 'UTF8'),
+ res[1], convert_to(res[1]::text, 'UTF8');
+ END IF;
+EXCEPTION
+ -- character with byte sequence 0xc2 0xb0 in encoding "UTF8" has no equivalent in encoding "LATIN8"
+ WHEN untranslatable_character THEN RAISE LOG 'skip: %', SQLERRM;
+ -- default conversion function for encoding "UTF8" to "MULE_INTERNAL" does not exist
+ WHEN undefined_function THEN RAISE LOG 'skip: %', SQLERRM;
+END
+$$;
+
-- Test xmlexists and xpath_exists
SELECT xmlexists('//town[text() = ''Toronto'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>');
SELECT xmlexists('//town[text() = ''Cwmbran'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>');