/*------------------------------------------------------------------------- * * parser.c * Main entry point/driver for PostgreSQL grammar * * Note that the grammar is not allowed to perform any table access * (since we need to be able to do basic parsing even while inside an * aborted transaction). Therefore, the data structures returned by * the grammar are "raw" parsetrees that still need to be analyzed by * analyze.c and related files. * * * Portions Copyright (c) 2003-2024, PgPool Global Development Group * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION * src/backend/parser/parser.c * *------------------------------------------------------------------------- */ #include #include #include "pool_parser.h" #include "utils/palloc.h" #include "gramparse.h" /* required before parser/gram.h! */ #include "gram.h" #include "gram_minimal.h" #include "parser.h" #include "pg_list.h" #include "kwlist_d.h" #include "pg_wchar.h" #include "makefuncs.h" #include "utils/elog.h" #include "scansup.h" int server_version_num = 0; static pg_enc server_encoding = PG_SQL_ASCII; static int parse_version(const char *versionString); static bool check_uescapechar(unsigned char escape); static char *str_udeescape(const char *str, char escape, int position, core_yyscan_t yyscanner); /* * raw_parser * Given a query in string form, do lexical and grammatical analysis. * * Returns a list of raw (un-analyzed) parse trees. The contents of the * list have the form required by the specified RawParseMode. * Set *error to true if there's any parse error. 
*/
List *
raw_parser(const char *str, RawParseMode mode, int len, bool *error, bool use_minimal)
{
	/*
	 * Token injected ahead of the input to select the grammar entry point for
	 * each non-default mode.  Indexed by the RawParseMode enum.
	 */
	static const int mode_token[] = {
		[RAW_PARSE_DEFAULT] = 0,
		[RAW_PARSE_TYPE_NAME] = MODE_TYPE_NAME,
		[RAW_PARSE_PLPGSQL_EXPR] = MODE_PLPGSQL_EXPR,
		[RAW_PARSE_PLPGSQL_ASSIGN1] = MODE_PLPGSQL_ASSIGN1,
		[RAW_PARSE_PLPGSQL_ASSIGN2] = MODE_PLPGSQL_ASSIGN2,
		[RAW_PARSE_PLPGSQL_ASSIGN3] = MODE_PLPGSQL_ASSIGN3,
	};

	/* Remember the caller's context so the error path can switch back to it. */
	MemoryContext oldContext = CurrentMemoryContext;
	base_yy_extra_type yyextra;
	core_yyscan_t yyscanner;
	int			yyresult;

	/* No error seen yet. */
	*error = false;

	/* Set up the flex scanner over the query text. */
	yyscanner = scanner_init(str, len, &yyextra.core_yy_extra,
							 &ScanKeywords, ScanKeywordTokens);

	/*
	 * base_yylex() only looks at the lookahead fields.  For a non-default
	 * mode, pre-load the mode token as a pending lookahead token.
	 */
	if (mode != RAW_PARSE_DEFAULT)
	{
		yyextra.have_lookahead = true;
		yyextra.lookahead_token = mode_token[mode];
		yyextra.lookahead_yylloc = 0;
		yyextra.lookahead_end = NULL;
	}
	else
		yyextra.have_lookahead = false;

	/* Set up whichever bison parser the caller asked for. */
	if (use_minimal)
	{
		ereport(DEBUG2,
				(errmsg("invoking the minimal parser")));
		minimal_parser_init(&yyextra);
	}
	else
	{
		ereport(DEBUG2,
				(errmsg("invoking the standard parser")));
		parser_init(&yyextra);
	}

	PG_TRY();
	{
		/* Run the selected grammar, then release the scanner's memory. */
		yyresult = use_minimal
			? minimal_base_yyparse(yyscanner)
			: base_yyparse(yyscanner);

		scanner_finish(yyscanner);
	}
	PG_CATCH();
	{
		/*
		 * An error was thrown while parsing: go back to the caller's memory
		 * context, release the scanner, and turn the error into an ordinary
		 * failure result instead of propagating it.
		 */
		MemoryContextSwitchTo(oldContext);
		scanner_finish(yyscanner);
		yyresult = -1;
		FlushErrorState();
	}
	PG_END_TRY();

	/* Any nonzero result means the parse failed. */
	if (yyresult != 0)
	{
		*error = true;
		return NIL;
	}

	return yyextra.parsetree;
}

/*
 * XXX: Currently we only process the first element of the parse tree.
 * rest of multiple statements are silently discarded.
*/ Node * raw_parser2(List *parse_tree_list) { Node *node = NULL; RawStmt *rstmt; rstmt = (RawStmt *) lfirst(list_head(parse_tree_list)); node = (Node *) rstmt->stmt; return node; } //"INSERT INTO foo VALUES(1)" Node * get_dummy_insert_query_node(void) { InsertStmt *insert = makeNode(InsertStmt); SelectStmt *select = makeNode(SelectStmt); select->valuesLists = list_make1(makeInteger(1)); insert->relation = makeRangeVar("pgpool", "foo", 0); insert->selectStmt = (Node*)select; return (Node *)insert; } List * get_dummy_read_query_tree(void) { RawStmt *rs; SelectStmt *n = makeNode(SelectStmt); n->targetList = list_make1(makeString("pgpool: unable to parse the query")); rs = makeNode(RawStmt); rs->stmt = (Node *)n; rs->stmt_location = 0; rs->stmt_len = 0; /* might get changed later */ return list_make1((Node *)rs); } List * get_dummy_write_query_tree(void) { ColumnRef *c1,*c2; RawStmt *rs; DeleteStmt *n = makeNode(DeleteStmt); n->relation = makeRangeVar("pgpool", "foo", 0); c1 = makeNode(ColumnRef); c1->fields = list_make1(makeString("col")); c2 = makeNode(ColumnRef); c2->fields = list_make1(makeString("pgpool: unable to parse the query")); n->whereClause = (Node*)makeSimpleA_Expr(AEXPR_OP, "=", (Node*)c1, (Node*)c2, 0); /* * Assign the node directly to the parsetree and exit the scanner * we don't want to keep parsing for information we don't need */ rs = makeNode(RawStmt); rs->stmt = (Node *)n; rs->stmt_location = 0; rs->stmt_len = 0; /* might get changed later */ return list_make1((Node *)rs); } /* * from src/backend/commands/define.c * Extract an int32 value from a DefElem. 
*/
int32
defGetInt32(DefElem *def)
{
	/* The only acceptable argument is a present Integer node. */
	if (def->arg != NULL && nodeTag(def->arg) == T_Integer)
		return (int32) intVal(def->arg);

	/* Missing argument and wrong node type are reported identically. */
	ereport(ERROR,
			(errcode(ERRCODE_SYNTAX_ERROR),
			 errmsg("%s requires an integer value",
					def->defname)));

	return 0;					/* keep compiler quiet */
}

/*
 * Intermediate filter between parser and core lexer (core_yylex in scan.l).
 *
 * This filter is needed because in some cases the standard SQL grammar
 * requires more than one token lookahead.  We reduce these cases to one-token
 * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
 *
 * Using a filter is simpler than trying to recognize multiword tokens
 * directly in scan.l, because we'd have to allow for comments between the
 * words.  Furthermore it's not clear how to do that without re-introducing
 * scanner backtrack, which would cost more performance than this filter
 * layer does.
 *
 * We also use this filter to convert UIDENT and USCONST sequences into
 * plain IDENT and SCONST tokens.  While that could be handled by additional
 * productions in the main grammar, it's more efficient to do it like this.
 *
 * The filter also provides a convenient place to translate between
 * the core_YYSTYPE and YYSTYPE representations (which are really the
 * same thing anyway, but notationally they're different).
*/
int
base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
{
	/*
	 * lvalp:     receives the semantic value of the returned token
	 * llocp:     receives the location of the returned token
	 * yyscanner: scanner handle; the per-parse lookahead state is fetched
	 *            from it
	 *
	 * Returns the token code.  This is either what core_yylex() produced or
	 * a replacement: a *_LA variant, or IDENT/SCONST for UIDENT/USCONST.
	 */
	base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
	int			cur_token;
	int			next_token;
	int			cur_token_length;
	YYLTYPE		cur_yylloc;

	/* Get next token --- we might already have it */
	if (yyextra->have_lookahead)
	{
		/* Hand back the token that an earlier call read ahead. */
		cur_token = yyextra->lookahead_token;
		lvalp->core_yystype = yyextra->lookahead_yylval;
		*llocp = yyextra->lookahead_yylloc;
		/* Put back the character that was overwritten with '\0', if any. */
		if (yyextra->lookahead_end)
			*(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
		yyextra->have_lookahead = false;
	}
	else
		cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);

	/*
	 * If this token isn't one that requires lookahead, just return it.  If it
	 * does, determine the token length.  (We could get that via strlen(), but
	 * since we have such a small set of possibilities, hardwiring seems
	 * feasible and more efficient --- at least for the fixed-length cases.)
	 */
	switch (cur_token)
	{
		case FORMAT:
			cur_token_length = 6;
			break;
		case NOT:
			cur_token_length = 3;
			break;
		case NULLS_P:
			cur_token_length = 5;
			break;
		case WITH:
			cur_token_length = 4;
			break;
		case UIDENT:
		case USCONST:
			/* Variable-length tokens: measure them in the scan buffer. */
			cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);
			break;
		case WITHOUT:
			cur_token_length = 7;
			break;
		default:
			return cur_token;
	}

	/*
	 * Identify end+1 of current token.  core_yylex() has temporarily stored a
	 * '\0' here, and will undo that when we call it again.  We need to redo
	 * it to fully revert the lookahead call for error reporting purposes.
	 */
	yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
		*llocp + cur_token_length;
	Assert(*(yyextra->lookahead_end) == '\0');

	/*
	 * Save and restore *llocp around the call.  It might look like we could
	 * avoid this by just passing &lookahead_yylloc to core_yylex(), but that
	 * does not work because flex actually holds onto the last-passed pointer
	 * internally, and will use that for error reporting.  We need any error
	 * reports to point to the current token, not the next one.
	 */
	cur_yylloc = *llocp;

	/* Get next token, saving outputs into lookahead variables */
	next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
	yyextra->lookahead_token = next_token;
	yyextra->lookahead_yylloc = *llocp;

	*llocp = cur_yylloc;

	/* Now revert the un-truncation of the current token */
	yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
	*(yyextra->lookahead_end) = '\0';

	yyextra->have_lookahead = true;

	/* Replace cur_token if needed, based on lookahead */
	switch (cur_token)
	{
		case FORMAT:
			/* Replace FORMAT by FORMAT_LA if it's followed by JSON */
			switch (next_token)
			{
				case JSON:
					cur_token = FORMAT_LA;
					break;
			}
			break;

		case NOT:
			/* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
			switch (next_token)
			{
				case BETWEEN:
				case IN_P:
				case LIKE:
				case ILIKE:
				case SIMILAR:
					cur_token = NOT_LA;
					break;
			}
			break;

		case NULLS_P:
			/* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
			switch (next_token)
			{
				case FIRST_P:
				case LAST_P:
					cur_token = NULLS_LA;
					break;
			}
			break;

		case WITH:
			/* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
			switch (next_token)
			{
				case TIME:
				case ORDINALITY:
					cur_token = WITH_LA;
					break;
			}
			break;

		case WITHOUT:
			/* Replace WITHOUT by WITHOUT_LA if it's followed by TIME */
			switch (next_token)
			{
				case TIME:
					cur_token = WITHOUT_LA;
					break;
			}
			break;

		case UIDENT:
		case USCONST:
			/* Look ahead for UESCAPE */
			if (next_token == UESCAPE)
			{
				/* Yup, so get third token, which had better be SCONST */
				const char *escstr;

				/* Again save and restore *llocp */
				cur_yylloc = *llocp;

				/* Un-truncate current token so errors point to third token */
				*(yyextra->lookahead_end) = yyextra->lookahead_hold_char;

				/* Get third token */
				next_token = core_yylex(&(yyextra->lookahead_yylval),
										llocp, yyscanner);

				/* If we throw error here, it will point to third token */
				if (next_token != SCONST)
					scanner_yyerror("UESCAPE must be followed by a simple string literal",
									yyscanner);

				/* The escape must be one character that check_uescapechar() accepts. */
				escstr = yyextra->lookahead_yylval.str;
				if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))
					scanner_yyerror("invalid Unicode escape character",
									yyscanner);

				/* Now restore *llocp; errors will point to first token */
				*llocp = cur_yylloc;

				/* Apply Unicode conversion */
				lvalp->core_yystype.str =
					str_udeescape(lvalp->core_yystype.str,
								  escstr[0],
								  *llocp,
								  yyscanner);

				/*
				 * We don't need to revert the un-truncation of UESCAPE.  What
				 * we do want to do is clear have_lookahead, thereby consuming
				 * all three tokens.
				 */
				yyextra->have_lookahead = false;
			}
			else
			{
				/* No UESCAPE, so convert using default escape character */
				lvalp->core_yystype.str =
					str_udeescape(lvalp->core_yystype.str,
								  '\\',
								  *llocp,
								  yyscanner);
			}

			if (cur_token == UIDENT)
			{
				/* It's an identifier, so truncate as appropriate */
				truncate_identifier(lvalp->core_yystype.str,
									strlen(lvalp->core_yystype.str),
									true);
				cur_token = IDENT;
			}
			else if (cur_token == USCONST)
			{
				cur_token = SCONST;
			}
	}

	return cur_token;
}

/*
 * convert hex digit (caller should have verified that) to value
 *
 * Accepts '0'-'9', 'a'-'f' and 'A'-'F'; anything else raises an ERROR.
 */
static unsigned int
hexval(unsigned char c)
{
	if (c >= '0' && c <= '9')
		return c - '0';
	if (c >= 'a' && c <= 'f')
		return c - 'a' + 0xA;
	if (c >= 'A' && c <= 'F')
		return c - 'A' + 0xA;
	elog(ERROR, "invalid hexadecimal digit");
	return 0;					/* not reached */
}

/*
 * is Unicode code point acceptable?
 *
 * Raises a syntax ERROR when is_valid_unicode_codepoint() rejects the value;
 * otherwise returns normally.
 */
static void
check_unicode_value(pg_wchar c)
{
	if (!is_valid_unicode_codepoint(c))
		ereport(ERROR,
				(errcode(ERRCODE_SYNTAX_ERROR),
				 errmsg("invalid Unicode escape value")));
}

/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ?
*/ static bool check_uescapechar(unsigned char escape) { if (isxdigit(escape) || escape == '+' || escape == '\'' || escape == '"' || scanner_isspace(escape)) return false; else return true; } /* * Process Unicode escapes in "str", producing a palloc'd plain string * * escape: the escape character to use * position: start position of U&'' or U&"" string token * yyscanner: context information needed for error reports */ static char * str_udeescape(const char *str, char escape, int position, core_yyscan_t yyscanner) { const char *in; char *new, *out; size_t new_len; pg_wchar pair_first = 0; ScannerCallbackState scbstate; /* * Guesstimate that result will be no longer than input, but allow enough * padding for Unicode conversion. */ new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1; new = palloc(new_len); in = str; out = new; while (*in) { /* Enlarge string if needed */ size_t out_dist = out - new; if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1)) { new_len *= 2; new = repalloc(new, new_len); out = new + out_dist; } if (in[0] == escape) { /* * Any errors reported while processing this escape sequence will * have an error cursor pointing at the escape. 
*/ setup_scanner_errposition_callback(&scbstate, yyscanner, in - str + position + 3); /* 3 for U&" */ if (in[1] == escape) { if (pair_first) goto invalid_pair; *out++ = escape; in += 2; } else if (isxdigit((unsigned char) in[1]) && isxdigit((unsigned char) in[2]) && isxdigit((unsigned char) in[3]) && isxdigit((unsigned char) in[4])) { pg_wchar unicode; unicode = (hexval(in[1]) << 12) + (hexval(in[2]) << 8) + (hexval(in[3]) << 4) + hexval(in[4]); check_unicode_value(unicode); if (pair_first) { if (is_utf16_surrogate_second(unicode)) { unicode = surrogate_pair_to_codepoint(pair_first, unicode); pair_first = 0; } else goto invalid_pair; } else if (is_utf16_surrogate_second(unicode)) goto invalid_pair; if (is_utf16_surrogate_first(unicode)) pair_first = unicode; else { pg_unicode_to_server(unicode, (unsigned char *) out); out += strlen(out); } in += 5; } else if (in[1] == '+' && isxdigit((unsigned char) in[2]) && isxdigit((unsigned char) in[3]) && isxdigit((unsigned char) in[4]) && isxdigit((unsigned char) in[5]) && isxdigit((unsigned char) in[6]) && isxdigit((unsigned char) in[7])) { pg_wchar unicode; unicode = (hexval(in[2]) << 20) + (hexval(in[3]) << 16) + (hexval(in[4]) << 12) + (hexval(in[5]) << 8) + (hexval(in[6]) << 4) + hexval(in[7]); check_unicode_value(unicode); if (pair_first) { if (is_utf16_surrogate_second(unicode)) { unicode = surrogate_pair_to_codepoint(pair_first, unicode); pair_first = 0; } else goto invalid_pair; } else if (is_utf16_surrogate_second(unicode)) goto invalid_pair; if (is_utf16_surrogate_first(unicode)) pair_first = unicode; else { pg_unicode_to_server(unicode, (unsigned char *) out); out += strlen(out); } in += 8; } else ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("invalid Unicode escape"), errhint("Unicode escapes must be \\XXXX or \\+XXXXXX."))); cancel_scanner_errposition_callback(&scbstate); } else { if (pair_first) goto invalid_pair; *out++ = *in++; } } /* unfinished surrogate pair? 
*/ if (pair_first) goto invalid_pair; *out = '\0'; return new; /* * We might get here with the error callback active, or not. Call * scanner_errposition to make sure an error cursor appears; if the * callback is active, this is duplicative but harmless. */ invalid_pair: ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("invalid Unicode surrogate pair"), scanner_errposition(in - str + position + 3, /* 3 for U&" */ yyscanner))); return NULL; /* keep compiler quiet */ } int minimal_base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner) { return base_yylex(lvalp, llocp, yyscanner); } static int parse_version(const char *versionString) { int cnt; int vmaj, vmin, vrev; cnt = sscanf(versionString, "%d.%d.%d", &vmaj, &vmin, &vrev); if (cnt == 2) { vrev = 0; } else if (cnt == 1) { vmin = 0; vrev = 0; } return (100 * vmaj + vmin) * 100 + vrev; } void parser_set_param(const char *name, const char *value) { if (strcmp(name, "server_version") == 0) { server_version_num = parse_version(value); } else if (strcmp(name, "server_encoding") == 0) { if (strcmp(value, "UTF8") == 0) server_encoding = PG_UTF8; else server_encoding = PG_SQL_ASCII; } else if (strcmp(name, "standard_conforming_strings") == 0) { if (strcmp(value, "on") == 0) standard_conforming_strings = true; else standard_conforming_strings = false; } } int GetDatabaseEncoding(void) { return server_encoding; } int pg_mblen(const char *mbstr) { return pg_utf_mblen((const unsigned char *) mbstr); }