summary | refs | log | tree | commit | diff
path: root/src/backend/parser
diff options
context:
space:
mode:
author: Tom Lane, 2004-02-21 00:34:53 +0000
committer: Tom Lane, 2004-02-21 00:34:53 +0000
commit: 59f9a0b9df0d224bb62ff8ec5b65e0b187655742 (patch)
tree: 17fc75064e4925afc08824727e41dfcc9c29f3a3 /src/backend/parser
parent: 1d567aee070b7a51fbdc74821237d5a5ae2caf8f (diff)
Implement a solution to the 'Turkish locale downcases I incorrectly'
problem, per previous discussion. Make some additional changes to centralize the knowledge of just how identifier downcasing is done, in hopes of simplifying any future tweaking in this area.
Diffstat (limited to 'src/backend/parser')
-rw-r--r-- src/backend/parser/keywords.c | 10
-rw-r--r-- src/backend/parser/scan.l | 44
-rw-r--r-- src/backend/parser/scansup.c | 78
3 files changed, 87 insertions, 45 deletions
diff --git a/src/backend/parser/keywords.c b/src/backend/parser/keywords.c
index 57e020c1080..a94786690ed 100644
--- a/src/backend/parser/keywords.c
+++ b/src/backend/parser/keywords.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/parser/keywords.c,v 1.144 2003/11/29 19:51:51 pgsql Exp $
+ * $PostgreSQL: pgsql/src/backend/parser/keywords.c,v 1.145 2004/02/21 00:34:52 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -369,17 +369,13 @@ ScanKeywordLookup(const char *text)
/*
* Apply an ASCII-only downcasing. We must not use tolower() since it
- * may produce the wrong translation in some locales (eg, Turkish),
- * and we don't trust isupper() very much either. In an ASCII-based
- * encoding the tests against A and Z are sufficient, but we also
- * check isupper() so that we will work correctly under EBCDIC. The
- * actual case conversion step should work for either ASCII or EBCDIC.
+ * may produce the wrong translation in some locales (eg, Turkish).
*/
for (i = 0; i < len; i++)
{
char ch = text[i];
- if (ch >= 'A' && ch <= 'Z' && isupper((unsigned char) ch))
+ if (ch >= 'A' && ch <= 'Z')
ch += 'a' - 'A';
word[i] = ch;
}
diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l
index 13cbfb9895e..caab9a002cf 100644
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@@ -10,7 +10,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.113 2004/02/19 19:11:30 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.114 2004/02/21 00:34:52 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -27,6 +27,7 @@
#include "parser/keywords.h"
/* Not needed now that this file is compiled as part of gram.y */
/* #include "parser/parse.h" */
+#include "parser/scansup.h"
#include "utils/builtins.h"
#include "mb/pg_wchar.h"
@@ -395,23 +396,15 @@ other .
startlit();
}
<xd>{xdstop} {
+ char *ident;
+
BEGIN(INITIAL);
if (literallen == 0)
yyerror("zero-length delimited identifier");
+ ident = litbufdup();
if (literallen >= NAMEDATALEN)
- {
- int len;
-
- len = pg_mbcliplen(literalbuf, literallen,
- NAMEDATALEN-1);
- ereport(NOTICE,
- (errcode(ERRCODE_NAME_TOO_LONG),
- errmsg("identifier \"%s\" will be truncated to \"%.*s\"",
- literalbuf, len, literalbuf)));
- literalbuf[len] = '\0';
- literallen = len;
- }
- yylval.str = litbufdup();
+ truncate_identifier(ident, literallen, true);
+ yylval.str = ident;
return IDENT;
}
<xd>{xddouble} {
@@ -537,7 +530,6 @@ other .
{identifier} {
const ScanKeyword *keyword;
char *ident;
- int i;
/* Is it a keyword? */
keyword = ScanKeywordLookup(yytext);
@@ -550,28 +542,8 @@ other .
/*
* No. Convert the identifier to lower case, and truncate
* if necessary.
- *
- * Note: here we use a locale-dependent case conversion,
- * which seems appropriate under standard SQL rules, whereas
- * the keyword comparison was NOT locale-dependent.
*/
- ident = pstrdup(yytext);
- for (i = 0; ident[i]; i++)
- {
- if (isupper((unsigned char) ident[i]))
- ident[i] = tolower((unsigned char) ident[i]);
- }
- if (i >= NAMEDATALEN)
- {
- int len;
-
- len = pg_mbcliplen(ident, i, NAMEDATALEN-1);
- ereport(NOTICE,
- (errcode(ERRCODE_NAME_TOO_LONG),
- errmsg("identifier \"%s\" will be truncated to \"%.*s\"",
- ident, len, ident)));
- ident[len] = '\0';
- }
+ ident = downcase_truncate_identifier(yytext, yyleng, true);
yylval.str = ident;
return IDENT;
}
diff --git a/src/backend/parser/scansup.c b/src/backend/parser/scansup.c
index 9177b858a79..76c620b394e 100644
--- a/src/backend/parser/scansup.c
+++ b/src/backend/parser/scansup.c
@@ -9,7 +9,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/parser/scansup.c,v 1.25 2003/11/29 19:51:52 pgsql Exp $
+ * $PostgreSQL: pgsql/src/backend/parser/scansup.c,v 1.26 2004/02/21 00:34:53 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -19,6 +19,8 @@
#include "miscadmin.h"
#include "parser/scansup.h"
+#include "mb/pg_wchar.h"
+
/* ----------------
* scanstr
@@ -32,7 +34,7 @@
*/
char *
-scanstr(char *s)
+scanstr(const char *s)
{
char *newStr;
int len,
@@ -109,3 +111,75 @@ scanstr(char *s)
newStr[j] = '\0';
return newStr;
}
+
+
+/*
+ * downcase_truncate_identifier() --- do appropriate downcasing and
+ * truncation of an unquoted identifier. Optionally warn of truncation.
+ *
+ * Returns a palloc'd string containing the adjusted identifier.
+ *
+ * Note: in some usages the passed string is not null-terminated.
+ *
+ * Note: the API of this function is designed to allow for downcasing
+ * transformations that increase the string length, but we don't yet
+ * support that. If you want to implement it, you'll need to fix
+ * SplitIdentifierString() in utils/adt/varlena.c.
+ */
+char *
+downcase_truncate_identifier(const char *ident, int len, bool warn)
+{
+ char *result;
+ int i;
+
+ result = palloc(len + 1);
+ /*
+ * SQL99 specifies Unicode-aware case normalization, which we don't yet
+ * have the infrastructure for. Instead we use tolower() to provide a
+ * locale-aware translation. However, there are some locales where this
+ * is not right either (eg, Turkish may do strange things with 'i' and
+ * 'I'). Our current compromise is to use tolower() for characters with
+ * the high bit set, and use an ASCII-only downcasing for 7-bit
+ * characters.
+ */
+ for (i = 0; i < len; i++)
+ {
+ unsigned char ch = (unsigned char) ident[i];
+
+ if (ch >= 'A' && ch <= 'Z')
+ ch += 'a' - 'A';
+ else if (ch >= 0x80 && isupper(ch))
+ ch = tolower(ch);
+ result[i] = (char) ch;
+ }
+ result[i] = '\0';
+
+ if (i >= NAMEDATALEN)
+ truncate_identifier(result, i, warn);
+
+ return result;
+}
+
+/*
+ * truncate_identifier() --- truncate an identifier to NAMEDATALEN-1 bytes.
+ *
+ * The given string is modified in-place, if necessary. A warning is
+ * issued if requested.
+ *
+ * We require the caller to pass in the string length since this saves a
+ * strlen() call in some common usages.
+ */
+void
+truncate_identifier(char *ident, int len, bool warn)
+{
+ if (len >= NAMEDATALEN)
+ {
+ len = pg_mbcliplen(ident, len, NAMEDATALEN-1);
+ if (warn)
+ ereport(NOTICE,
+ (errcode(ERRCODE_NAME_TOO_LONG),
+ errmsg("identifier \"%s\" will be truncated to \"%.*s\"",
+ ident, len, ident)));
+ ident[len] = '\0';
+ }
+}