From 21db7b93aa05dddf967447cd069dbb5e6d88eff2 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Mon, 21 Sep 2009 22:22:07 +0000 Subject: [PATCH] Surrogate pair support for U& string and identifier syntax This is mainly to make the functionality consistent with the proposed \u escape syntax. --- doc/src/sgml/syntax.sgml | 8 +++++ src/backend/parser/scan.l | 75 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 79 insertions(+), 4 deletions(-) diff --git a/doc/src/sgml/syntax.sgml b/doc/src/sgml/syntax.sgml index 7637eab58f..2e20b735d9 100644 --- a/doc/src/sgml/syntax.sgml +++ b/doc/src/sgml/syntax.sgml @@ -238,6 +238,10 @@ U&"d!0061t!+000061" UESCAPE '!' The Unicode escape syntax works only when the server encoding is UTF8. When other server encodings are used, only code points in the ASCII range (up to \007F) can be specified. + Both the 4-digit and the 6-digit form can be used to specify + UTF-16 surrogate pairs to compose characters with code points + larger than \FFFF (although the availability of + the 6-digit form technically makes this unnecessary). @@ -497,6 +501,10 @@ U&'d!0061t!+000061' UESCAPE '!' UTF8. When other server encodings are used, only code points in the ASCII range (up to \007F) can be specified. + Both the 4-digit and the 6-digit form can be used to specify + UTF-16 surrogate pairs to compose characters with code points + larger than \FFFF (although the availability + of the 6-digit form technically makes this unnecessary). diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l index f404f9dc8b..4dcebe8f8d 100644 --- a/src/backend/parser/scan.l +++ b/src/backend/parser/scan.l @@ -1097,11 +1097,30 @@ check_unicode_value(pg_wchar c, char *loc, base_yyscan_t yyscanner) } } +static bool +is_utf16_surrogate_first(pg_wchar c) +{ + return (c >= 0xD800 && c <= 0xDBFF); +} + +static bool +is_utf16_surrogate_second(pg_wchar c) +{ + return (c >= 0xDC00 && c <= 0xDFFF); +} + +static pg_wchar +surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second) +{ + return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF); +} + static char * litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner) { char *new; char *litbuf, *in, *out; + pg_wchar pair_first = 0; if (isxdigit(escape) || escape == '+' @@ -1131,6 +1150,11 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner) { if (in[1] == escape) { + if (pair_first) + { + ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ + yyerror("invalid Unicode surrogate pair"); + } *out++ = escape; in += 2; } @@ -1138,9 +1162,27 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner) { pg_wchar unicode = hexval(in[1]) * 16*16*16 + hexval(in[2]) * 16*16 + hexval(in[3]) * 16 + hexval(in[4]); check_unicode_value(unicode, in, yyscanner); - unicode_to_utf8(unicode, (unsigned char *) out); + if (pair_first) + { + if (is_utf16_surrogate_second(unicode)) + { + unicode = surrogate_pair_to_codepoint(pair_first, unicode); + pair_first = 0; + } + else + { + ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ + yyerror("invalid Unicode surrogate pair"); + } + } + if (is_utf16_surrogate_first(unicode)) + pair_first = unicode; + else + { + unicode_to_utf8(unicode, (unsigned char *) out); + out += pg_mblen(out); + } in += 5; - out += pg_mblen(out); } else if (in[1] == '+' && isxdigit(in[2]) && isxdigit(in[3]) @@ -1150,9 +1192,27 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner) pg_wchar unicode = hexval(in[2]) * 16*16*16*16*16 + hexval(in[3]) * 16*16*16*16 + hexval(in[4]) * 16*16*16 + hexval(in[5]) * 16*16 + hexval(in[6]) * 16 + hexval(in[7]); check_unicode_value(unicode, in, yyscanner); - unicode_to_utf8(unicode, (unsigned char *) out); + if (pair_first) + { + if (is_utf16_surrogate_second(unicode)) + { + unicode = surrogate_pair_to_codepoint(pair_first, unicode); + pair_first = 0; + } + else + { + ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ + yyerror("invalid Unicode surrogate pair"); + } + } + if (is_utf16_surrogate_first(unicode)) + pair_first = unicode; + else + { + unicode_to_utf8(unicode, (unsigned char *) out); + out += pg_mblen(out); + } in += 8; - out += pg_mblen(out); } else { @@ -1161,7 +1221,14 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner) } } else + { + if (pair_first) + { + ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ + yyerror("invalid Unicode surrogate pair"); + } *out++ = *in++; + } } *out = '\0'; -- 2.39.5