diff --git a/CLAUDE.md b/CLAUDE.md index 7ecdeb20e1..d30d2f45b0 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -39,6 +39,10 @@ cargo clippy - Use `cargo run --release --quiet --features json_example --example cli FILE --DIALECT 2>&1 1>/dev/null | grep "Error during parsing"` — DEBUG logs flood stderr by default and obscure the actual parse error. - The release CLI (`target/release/examples/cli`) is rebuilt independently from `corpus-runner`. After parser edits, run `cargo build --release --example cli` before re-running single-file repros — otherwise you're testing the previous build and may report false positives. +### Backgrounding gotcha + +- **`cmd > log 2>&1 &` in a Bash tool call with `run_in_background: true` returns "completed" immediately** because the shell exits while the `&`-detached child keeps running. Don't trust the completion notification for detached processes — verify with `pgrep -f <pattern>` or arm a Monitor that polls `pgrep`. + ### Performance and Profiling **Critical:** Always profile BEFORE optimizing. Assumptions about bottlenecks are often wrong. @@ -115,6 +119,10 @@ node scripts/compare-corpus-reports.js target/corpus-report.json target/corpus-r - Analyze failures: parse `target/corpus-report.json` with Python to filter/group `test_results` by dialect or error pattern - **Always rebuild before corpus run**: `cargo build --release --bin corpus-runner` — stale binary produces stale reports - **Refresh baseline after each accepted commit**: `cp target/corpus-report.json target/corpus-report-baseline.json`. Otherwise `compare-corpus-reports.js` credits old deltas and can hide fresh regressions. +- `compare-corpus-reports.js` only lists *added* tests under "New Tests" — deleted/pruned files don't appear there. After a kernel-cll-corpus pipeline run, also check `git status -s` in that repo to see what was removed. +- **Pipeline reprocess (`make process` in kernel-cll-corpus) takes ~10 minutes** for the full corpus. 
Don't poll — arm a Monitor on `while pgrep -f pipeline.process; do sleep 15; done` and let it wake you. +- **Anonymizer-corruption signature**: `'s'<ident>` (the `'s'` placeholder string directly abutting an identifier/keyword, e.g. `'s'HOUR`, `'s'id_5`) is unique to anonymizer misalignment. Filter on exactly `'s'<ident>` — a broader `'<anything>'<ident>` regex misaligns on multi-string SQL (`'foo','bar'`) and silently deletes hand-written sqlglot fixtures. +- **Query-log truncation heuristics** (`pipeline/process.py::_looks_truncated`) that worked without false positives: trailing punctuation (`,`/`(`/`=`/operator), trailing clause keyword (SELECT/FROM/BY/AS/…), and `CASE` count > `END` count. Removed ~4k Redshift query-log fragments. - Adding a real dispatch in `parse_create` for a previously-unsupported `CREATE <object>` shape can flag *new* corpus failures: files that slipped through the generic skip-until-semicolon fallback are now actually parsed. Either extend support, accept on a case-by-case basis, or fall back gracefully — but expect the delta. - **This repo is PUBLIC** (`getsynq/sqlparser-rs`, a fork of `apache/datafusion-sqlparser-rs`). Never put customer names, workspace IDs, or internal codenames into commit messages, branch names, PR titles, file names, or function names — even anonymized SQL content must be attributed generically. Every push is mirrored into GH Archive's permanent public dataset, which force-push cannot undo. - **Pulling real SQL from production Clickhouse for regression coverage**: `SELECT sql FROM schema.latest_sql_definitions FINAL WHERE workspace='<workspace>' AND asset_type IN (...)`. 
`asset_type` codes from `proto/core/types/v1/asset_type.proto` that carry SQL bodies parseable by this library: diff --git a/src/bin/corpus_runner.rs b/src/bin/corpus_runner.rs index 84dc300d82..8b6424c1d8 100644 --- a/src/bin/corpus_runner.rs +++ b/src/bin/corpus_runner.rs @@ -26,7 +26,8 @@ fn normalize_dialect_name(name: &str) -> &str { /// Dialects without a dedicated parser fall back to a related dialect or to /// `GenericDialect` rather than being silently skipped, so corpus stats reflect /// every file under `tests/corpus/`. Aliases are best-effort: -/// - `presto` / `athena` use Trino-style SQL → Generic (same as our `trino`) +/// - `presto` uses Trino-style SQL → Trino +/// - `athena` uses Hive-style DDL on top of Trino-style DML → Hive /// - `tsql` / `fabric` use T-SQL → MsSql /// - `spark` uses Spark SQL → Databricks /// - `materialize` is Postgres-compatible → Postgres @@ -38,7 +39,8 @@ fn dialect_for_name(name: &str) -> Box { return d; } let alias: &str = match base_name.as_str() { - "presto" | "athena" => "trino", + "presto" => "trino", + "athena" => "hive", "tsql" | "fabric" => "mssql", "spark" => "databricks", "materialize" => "postgres", diff --git a/src/dialect/snowflake.rs b/src/dialect/snowflake.rs index 4be29cf426..e8b1b88122 100644 --- a/src/dialect/snowflake.rs +++ b/src/dialect/snowflake.rs @@ -104,33 +104,86 @@ pub fn parse_create_stage( let mut comment = None; // [ internalStageParams | externalStageParams ] - let stage_params = parse_stage_params(parser)?; + let mut stage_params = parse_stage_params(parser)?; - // [ directoryTableParams ] - if parser.parse_keyword(Keyword::DIRECTORY) { - parser.expect_token(&Token::Eq)?; - directory_table_params = parse_parentheses_options(parser)?; - } - - // [ file_format] - if parser.parse_keyword(Keyword::FILE_FORMAT) { - parser.expect_token(&Token::Eq)?; - file_format = parse_parentheses_options(parser)?; - } - - // [ copy_options ] - if parser.parse_keyword(Keyword::COPY_OPTIONS) { - 
parser.expect_token(&Token::Eq)?; - copy_options = parse_parentheses_options(parser)?; - } - - // [ comment ] - if parser.parse_keyword(Keyword::COMMENT) { - parser.expect_token(&Token::Eq)?; - comment = Some(match parser.next_token().token { - Token::SingleQuotedString(word) => Ok(word), - _ => parser.expected("a comment statement", parser.peek_token()), - }?) + // CREATE STAGE option clauses (DIRECTORY, FILE_FORMAT, COPY_OPTIONS, + // COMMENT, plus URL/CREDENTIALS/etc that may also appear after the + // initial stage-params block) can come in any order. Loop until none + // of the recognised keywords appear. + // https://docs.snowflake.com/en/sql-reference/sql/create-stage + loop { + if parser.parse_keyword(Keyword::DIRECTORY) { + parser.expect_token(&Token::Eq)?; + directory_table_params = parse_parentheses_options(parser)?; + } else if parser.parse_keyword(Keyword::FILE_FORMAT) { + parser.expect_token(&Token::Eq)?; + if parser.peek_token_is(&Token::LParen) { + file_format = parse_parentheses_options(parser)?; + } else { + // Snowflake accepts FILE_FORMAT shorthand: + // FILE_FORMAT = '' (string) + // FILE_FORMAT = [.] 
(ident) + let next_token = parser.next_token(); + let value = match next_token.token { + Token::SingleQuotedString(s) => s, + Token::Word(w) => { + let mut name = w.value; + while parser.consume_token(&Token::Period) { + let part = parser.next_token(); + match part.token { + Token::Word(w) => { + name.push('.'); + name.push_str(&w.value); + } + _ => parser.expected("identifier after .", part)?, + } + } + name + } + _ => parser.expected("file format name", next_token)?, + }; + file_format.push(DataLoadingOption { + option_name: "FORMAT_NAME".to_string(), + option_type: DataLoadingOptionType::STRING, + value, + }); + } + } else if parser.parse_keyword(Keyword::COPY_OPTIONS) { + parser.expect_token(&Token::Eq)?; + copy_options = parse_parentheses_options(parser)?; + } else if parser.parse_keyword(Keyword::COMMENT) { + parser.expect_token(&Token::Eq)?; + comment = Some(match parser.next_token().token { + Token::SingleQuotedString(word) => Ok(word), + _ => parser.expected("a comment statement", parser.peek_token()), + }?); + } else if matches!( + parser.peek_token_kind(), + Token::Word(w) if matches!(w.keyword, + Keyword::URL | Keyword::CREDENTIALS | Keyword::STORAGE_INTEGRATION + | Keyword::ENDPOINT | Keyword::ENCRYPTION) + ) { + // Stage-params clauses can also appear after FILE_FORMAT etc.; + // re-enter the parser and merge the result. 
+ let extra = parse_stage_params(parser)?; + if extra.url.is_some() { + stage_params.url = extra.url; + } + if extra.storage_integration.is_some() { + stage_params.storage_integration = extra.storage_integration; + } + if extra.endpoint.is_some() { + stage_params.endpoint = extra.endpoint; + } + if !extra.credentials.options.is_empty() { + stage_params.credentials = extra.credentials; + } + if !extra.encryption.options.is_empty() { + stage_params.encryption = extra.encryption; + } + } else { + break; + } } Ok(Statement::CreateStage { @@ -588,10 +641,23 @@ fn parse_parentheses_options(parser: &mut Parser) -> Result { + // Allow dotted object names (e.g. + // `FORMAT_NAME=schema.format`). + let mut value = word.value; + while parser.consume_token(&Token::Period) { + let part = parser.next_token(); + match part.token { + Token::Word(w) => { + value.push('.'); + value.push_str(&w.value); + } + _ => parser.expected("identifier after .", part)?, + } + } options.push(DataLoadingOption { option_name: key.value, option_type: DataLoadingOptionType::ENUM, - value: word.value, + value, }); Ok(()) } diff --git a/src/parser/mod.rs b/src/parser/mod.rs index a6965399fd..a77c94e8c8 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -706,6 +706,24 @@ impl<'a> Parser<'a> { self.expect_keyword(Keyword::TABLE)?; Ok(self.parse_create_table(true, false, None, false)?) } + // MySQL: `REPLACE [INTO] table ...` is INSERT-with-replace + // semantics (delete existing + insert new on PK conflict). + // https://dev.mysql.com/doc/refman/8.4/en/replace.html + // Dispatch to `parse_insert` which already handles a leading + // REPLACE for SQLite's `INSERT OR REPLACE` form. + Keyword::REPLACE + if dialect_of!(self is MySqlDialect | GenericDialect) + && matches!( + self.peek_token_kind(), + Token::Word(w) if w.keyword == Keyword::INTO + ) => + { + // REPLACE already consumed by the outer match; parse_insert + // will pick up at INTO. 
The replace-vs-insert distinction + // is lost in the AST, which is acceptable for grammar + // coverage — table/column refs are preserved. + Ok(self.parse_insert()?) + } Keyword::CACHE => Ok(self.parse_cache_table()?), Keyword::DROP => Ok(self.parse_drop()?), Keyword::DISCARD => Ok(self.parse_discard()?), @@ -1121,7 +1139,7 @@ impl<'a> Parser<'a> { // WHERE clauses of the old-style comma-join syntax, but we recognize // the suffix anywhere to keep parsing simple; the AST node preserves // the inner expression so column-level lineage is unaffected. - if dialect_of!(self is SnowflakeDialect | GenericDialect) + if dialect_of!(self is SnowflakeDialect | RedshiftSqlDialect | GenericDialect) && matches!(self.peek_token().token, Token::LParen) && matches!(self.peek_nth_token(1).token, Token::Plus) && matches!(self.peek_nth_token(2).token, Token::RParen) @@ -1247,6 +1265,44 @@ impl<'a> Parser<'a> { let start_idx = self.index; let next_token = self.next_token(); let expr = match next_token.token { + Token::Word(ref w_ref) + if w_ref.keyword == Keyword::NoKeyword + && w_ref.quote_style.is_none() + && w_ref.value.eq_ignore_ascii_case("DATE_PART") + && self.peek_token_is(&Token::LParen) + && dialect_of!(self is SnowflakeDialect | GenericDialect) => + { + // Snowflake DATE_PART supports ANSI EXTRACT-style syntax: + // DATE_PART( FROM ) + // alongside the function-call form `DATE_PART(, )`. + // https://docs.snowflake.com/en/sql-reference/functions/date_part + let name = ObjectName(vec![Ident::new("DATE_PART")]); + self.expect_token(&Token::LParen)?; + let part = self.parse_function_args()?; + let value = if self.consume_token(&Token::Comma) { + self.parse_function_args()? 
+ } else { + self.expect_keyword(Keyword::FROM)?; + let expr = self.parse_expr()?; + FunctionArg::Unnamed(FunctionArgExpr::Expr(expr)) + }; + self.expect_token(&Token::RParen)?; + Ok(Expr::Function(Function { + name, + args: vec![part, value], + parameters: None, + over: None, + distinct: false, + approximate: false, + special: false, + order_by: vec![], + limit: None, + on_overflow: None, + null_treatment: None, + within_group: None, + having_bound: None, + })) + } Token::Word(w) => match w.keyword { Keyword::TRUE | Keyword::FALSE | Keyword::NULL => { self.prev_token(); @@ -1425,9 +1481,9 @@ impl<'a> Parser<'a> { } } - // Oracle/Snowflake legacy outer-join marker on a - // qualified column reference: `tbl.col (+)`. - if dialect_of!(self is SnowflakeDialect | GenericDialect) + // Oracle/Snowflake/Redshift legacy outer-join marker on + // a qualified column reference: `tbl.col (+)`. + if dialect_of!(self is SnowflakeDialect | RedshiftSqlDialect | GenericDialect) && matches!(self.peek_token().token, Token::LParen) && matches!(self.peek_nth_token(1).token, Token::Plus) && matches!(self.peek_nth_token(2).token, Token::RParen) @@ -2700,8 +2756,46 @@ impl<'a> Parser<'a> { /// First, we look for keywords that might be misread as interval expressions, /// Then, we check if an interval can be parsed. fn parse_interval_guard(&mut self) -> bool { + // When `INTERVAL` is followed by a binary-operator or clause-starter + // keyword, it can't be the start of an interval literal — it's a + // column name in dialects that accept it as an identifier (Snowflake, + // ClickHouse, etc.). The greedy `parse_interval` would otherwise + // consume the keyword as the literal's "value" (`parse_prefix` is + // permissive about treating bare keywords as identifiers) and break + // the surrounding clause: `WHERE INTERVAL BETWEEN x AND y`, + // `PARTITION BY INTERVAL ORDER BY ...`, `MAX(INTERVAL)`, etc. 
match self.peek_keywords() { - [Keyword::LIKE] | [Keyword::IS] => return false, + // binary operators + [Keyword::LIKE] + | [Keyword::ILIKE] + | [Keyword::IS] + | [Keyword::BETWEEN] + | [Keyword::AND] + | [Keyword::OR] + | [Keyword::XOR] + | [Keyword::IN] + | [Keyword::NOT] + // clause starters + | [Keyword::ORDER] + | [Keyword::GROUP] + | [Keyword::HAVING] + | [Keyword::WHERE] + | [Keyword::LIMIT] + | [Keyword::OFFSET] + | [Keyword::QUALIFY] + | [Keyword::WINDOW] + | [Keyword::UNION] + | [Keyword::INTERSECT] + | [Keyword::EXCEPT] + // window-frame & sort + | [Keyword::ROWS] + | [Keyword::RANGE] + | [Keyword::GROUPS] + | [Keyword::ASC] + | [Keyword::DESC] + // join conditions + | [Keyword::ON] + | [Keyword::USING] => return false, _ => {} } @@ -3326,6 +3420,17 @@ impl<'a> Parser<'a> { time_zone, }) } + // BigQuery accepts both single- and double-quoted + // strings as string literals; the time-zone arg + // is just a string. e.g. `AT TIME ZONE "Asia/Tokyo"`. + Token::DoubleQuotedString(time_zone) + if dialect_of!(self is BigQueryDialect | GenericDialect) => + { + Ok(Expr::AtTimeZone { + timestamp: Box::new(expr), + time_zone, + }) + } _ => self.expected( "Expected Token::SingleQuotedString after AT TIME ZONE", time_zone, @@ -4378,6 +4483,10 @@ impl<'a> Parser<'a> { | Token::LtEq | Token::Gt | Token::GtEq => false, + // `(col1, CLUSTER)` — the reserved keyword sits + // inside a parenthesised list with `)` after it. + // The `,` is a real separator, not trailing. + Token::RParen => false, _ => true, } } @@ -4829,6 +4938,33 @@ impl<'a> Parser<'a> { let schema_name = self.parse_schema_name()?; + // Snowflake zero-copy clone: + // CREATE SCHEMA new CLONE source + // [AT|BEFORE (TIMESTAMP|OFFSET|STATEMENT => )] + // https://docs.snowflake.com/en/sql-reference/sql/create-clone + // Lineage: the source schema's tables become the new schema's tables. 
+ // Currently CreateSchema has no `clone` slot — consume the clause + // verbatim so the statement parses; revisit when the AST gains a + // schema-level clone field. + if dialect_of!(self is SnowflakeDialect | GenericDialect) + && self.parse_keyword(Keyword::CLONE) + { + let _source = self.parse_object_name(false)?; + // Optional time-travel suffix: AT|BEFORE (kind => expr). + if self.parse_one_of_keywords(&[Keyword::AT, Keyword::BEFORE]).is_some() { + self.expect_token(&Token::LParen)?; + let mut depth = 1i32; + while depth > 0 { + match self.next_token().token { + Token::LParen => depth += 1, + Token::RParen => depth -= 1, + Token::EOF => break, + _ => {} + } + } + } + } + // Parse optional COMMENT clause (Snowflake, ClickHouse) let comment = if self.parse_keyword(Keyword::COMMENT) { let _ = self.consume_token(&Token::Eq); @@ -5290,6 +5426,13 @@ impl<'a> Parser<'a> { body.behavior = Some(FunctionBehavior::Volatile); } else if self.parse_keyword(Keyword::STRICT) { body.strict = true; + } else if self.parse_keywords(&[Keyword::NOT, Keyword::DETERMINISTIC]) + || self.parse_keyword(Keyword::DETERMINISTIC) + { + // BigQuery: `[NOT] DETERMINISTIC` marker. Doesn't change + // lineage; consume and discard. (No `body.deterministic` + // field — keeping the AST minimal.) + // https://cloud.google.com/bigquery/docs/reference/standard-sql/data-definition-language#create_a_function } else if self.parse_keywords(&[ Keyword::RETURNS, Keyword::NULL, @@ -5469,6 +5612,27 @@ impl<'a> Parser<'a> { } } + // Snowflake `CREATE EXTERNAL TABLE ... PARTITION BY (col, col, ...)` + // — the partition column list references columns already declared + // in the table's column-def list, so consuming opaquely preserves + // all lineage info. This precedes the options-swallowing block + // because the option-name-followed-by-`=` check below requires the + // first option to look like `name=value`, not `PARTITION BY (...)`. 
+ // https://docs.snowflake.com/en/sql-reference/sql/create-external-table + if dialect_of!(self is SnowflakeDialect | GenericDialect) + && self.parse_keywords(&[Keyword::PARTITION, Keyword::BY]) + { + self.expect_token(&Token::LParen)?; + let mut depth = 1i32; + while depth > 0 { + match self.next_token().token { + Token::LParen => depth += 1, + Token::RParen => depth -= 1, + Token::EOF => break, + _ => {} + } + } + } // Snowflake-style external table options: `LOCATION = @stage`, `PATTERN = '...'`, // `FILE_FORMAT = (...)`, `AUTO_REFRESH = ...`, etc. These are syntactically very // different from Hive's `LOCATION 'path'`, so when we see the Snowflake shape @@ -5517,7 +5681,15 @@ impl<'a> Parser<'a> { .build()); } + // Hive `CREATE [EXTERNAL] TABLE` accepts COMMENT, PARTITIONED BY, + // CLUSTERED BY ... INTO N BUCKETS, SKEWED BY, ROW FORMAT, STORED + // AS, LOCATION, TBLPROPERTIES in roughly this order. + // https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL + // Consume the optional table-level COMMENT and CLUSTERED BY before + // the existing partition / formats parsers. + let _ = self.parse_optional_hive_comment_and_clustered_by()?; let hive_distribution = self.parse_hive_distribution()?; + let _ = self.parse_optional_hive_comment_and_clustered_by()?; let hive_formats = self.parse_hive_formats()?; let file_format = if let Some(ff) = &hive_formats.storage { @@ -5546,6 +5718,63 @@ impl<'a> Parser<'a> { .build()) } + /// Consume optional Hive table-level `COMMENT ''` and `CLUSTERED BY + /// (cols) [SORTED BY (cols)] INTO BUCKETS` clauses. Used by both + /// CREATE TABLE and CREATE EXTERNAL TABLE; lineage is preserved by the + /// surrounding column list. Returns nothing — the clauses are + /// consumed and discarded for parser-coverage. Both clauses can appear + /// in either order. 
+ /// https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL + fn parse_optional_hive_comment_and_clustered_by(&mut self) -> Result<(), ParserError> { + loop { + if self.parse_keyword(Keyword::COMMENT) { + let _ = self.parse_literal_string()?; + continue; + } + if matches!( + self.peek_token_kind(), + Token::Word(w) if w.value.eq_ignore_ascii_case("CLUSTERED") + ) && matches!( + self.peek_nth_token(1).token, + Token::Word(w) if w.keyword == Keyword::BY + ) { + self.next_token(); // CLUSTERED + self.next_token(); // BY + self.expect_token(&Token::LParen)?; + let _ = self.parse_comma_separated(|p| p.parse_identifier(false))?; + self.expect_token(&Token::RParen)?; + if matches!(self.peek_token_kind(), Token::Word(w) if w.value.eq_ignore_ascii_case("SORTED")) + && matches!(self.peek_nth_token(1).token, Token::Word(w) if w.keyword == Keyword::BY) + { + self.next_token(); // SORTED + self.next_token(); // BY + self.expect_token(&Token::LParen)?; + let mut depth = 1i32; + while depth > 0 { + match self.next_token().token { + Token::LParen => depth += 1, + Token::RParen => depth -= 1, + Token::EOF => break, + _ => {} + } + } + } + if self.parse_keyword(Keyword::INTO) { + let _ = self.parse_literal_uint()?; + if matches!( + self.peek_token_kind(), + Token::Word(w) if w.value.eq_ignore_ascii_case("BUCKETS") + ) { + self.next_token(); + } + } + continue; + } + break; + } + Ok(()) + } + pub fn parse_file_format(&mut self) -> Result { let next_token = self.next_token(); match &next_token.token { @@ -6712,8 +6941,23 @@ impl<'a> Parser<'a> { let columns = self.parse_comma_separated(|p| p.parse_identifier(false))?; HiveDistributionStyle::PARTITIONED_NAMES { columns } } else { - let columns = self.parse_comma_separated(Parser::parse_column_def)?; - HiveDistributionStyle::PARTITIONED { columns } + // Athena Iceberg / Trino / BigQuery use expression-style + // partition specs: `PARTITIONED BY (col, BUCKET(16, id), + // TRUNCATE(8, name))`. 
Hive uses column-def specs: + // `PARTITIONED BY (year INT)`. Distinguish by peeking past + // the first identifier: if a known data-type keyword + // follows, it's the column-def form; otherwise expression. + // https://docs.aws.amazon.com/athena/latest/ug/querying-iceberg-creating-tables.html + let is_expr_form = matches!(self.peek_token_kind(), Token::Word(w) + if !matches!(self.peek_nth_token(1).token, Token::Word(t) + if Self::is_data_type_keyword(t.keyword))); + if is_expr_form { + let _ = self.parse_comma_separated(|p| p.parse_expr())?; + HiveDistributionStyle::NONE + } else { + let columns = self.parse_comma_separated(Parser::parse_column_def)?; + HiveDistributionStyle::PARTITIONED { columns } + } }; self.expect_token(&Token::RParen)?; Ok(distribution) @@ -6722,6 +6966,62 @@ impl<'a> Parser<'a> { } } + fn is_data_type_keyword(kw: Keyword) -> bool { + matches!( + kw, + Keyword::INT + | Keyword::INT2 + | Keyword::INT4 + | Keyword::INT8 + | Keyword::INT16 + | Keyword::INT32 + | Keyword::INT64 + | Keyword::INT128 + | Keyword::INT256 + | Keyword::INTEGER + | Keyword::BIGINT + | Keyword::SMALLINT + | Keyword::TINYINT + | Keyword::MEDIUMINT + | Keyword::FLOAT + | Keyword::FLOAT4 + | Keyword::FLOAT8 + | Keyword::FLOAT32 + | Keyword::FLOAT64 + | Keyword::DOUBLE + | Keyword::REAL + | Keyword::DECIMAL + | Keyword::NUMERIC + | Keyword::DEC + | Keyword::BOOLEAN + | Keyword::BOOL + | Keyword::CHAR + | Keyword::CHARACTER + | Keyword::VARCHAR + | Keyword::NVARCHAR + | Keyword::NCHAR + | Keyword::STRING + | Keyword::TEXT + | Keyword::BLOB + | Keyword::BYTEA + | Keyword::BINARY + | Keyword::VARBINARY + | Keyword::DATE + | Keyword::TIME + | Keyword::TIMESTAMP + | Keyword::TIMESTAMPTZ + | Keyword::TIMETZ + | Keyword::INTERVAL + | Keyword::JSON + | Keyword::UUID + | Keyword::ARRAY + | Keyword::STRUCT + | Keyword::MAP + | Keyword::TUPLE + | Keyword::OBJECT + ) + } + pub fn parse_hive_formats(&mut self) -> Result { let mut hive_format = HiveFormat::default(); loop { @@ -6764,13 
+7064,70 @@ impl<'a> Parser<'a> { pub fn parse_row_format(&mut self) -> Result { self.expect_keyword(Keyword::FORMAT)?; - match self.parse_one_of_keywords(&[Keyword::SERDE, Keyword::DELIMITED]) { + let format = match self.parse_one_of_keywords(&[Keyword::SERDE, Keyword::DELIMITED]) { Some(Keyword::SERDE) => { let class = self.parse_literal_string()?; - Ok(HiveRowFormat::SERDE { class }) + HiveRowFormat::SERDE { class } + } + _ => { + // Hive `ROW FORMAT DELIMITED [FIELDS|COLLECTION|MAP KEYS|LINES|NULL] …`. + // The DELIMITED suboptions (FIELDS TERMINATED BY '…', etc.) are + // dialect-specific punctuation with no lineage content; consume + // them opaquely until the next ROW / STORED / LOCATION / WITH + // / COMMENT / TBLPROPERTIES / PARTITIONED clause boundary. + // https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-RowFormats&SerDe + loop { + let stop = match self.peek_token_kind() { + Token::Word(w) => match w.keyword { + Keyword::ROW + | Keyword::STORED + | Keyword::LOCATION + | Keyword::WITH + | Keyword::COMMENT + | Keyword::TBLPROPERTIES + | Keyword::PARTITIONED + | Keyword::AS => true, + _ => w.value.eq_ignore_ascii_case("CLUSTERED"), + }, + Token::EOF | Token::SemiColon => true, + _ => false, + }; + if stop { + break; + } + self.next_token(); + } + HiveRowFormat::DELIMITED + } + }; + // Hive `ROW FORMAT SERDE 'class' WITH SERDEPROPERTIES ('k'='v', ...)`. + // SERDEPROPERTIES isn't in our keyword list (matched case-insensitively). + // The properties are key/value strings with no lineage content; consume + // the balanced paren block and discard. 
+ let saved = self.index; + let with_serde = self.parse_keyword(Keyword::WITH) + && matches!( + self.peek_token_kind(), + Token::Word(w) if w.value.eq_ignore_ascii_case("SERDEPROPERTIES") + ); + if with_serde { + self.next_token(); // SERDEPROPERTIES + self.expect_token(&Token::LParen)?; + let mut depth = 1i32; + while depth > 0 { + match self.next_token().token { + Token::LParen => depth += 1, + Token::RParen => depth -= 1, + Token::EOF => break, + _ => {} + } } - _ => Ok(HiveRowFormat::DELIMITED), + } else { + // WITH wasn't followed by SERDEPROPERTIES — restore so other + // parsers (table options, CTEs, etc.) can take over. + self.index = saved; } + Ok(format) } fn parse_optional_on_cluster(&mut self) -> Result>, ParserError> { @@ -6919,49 +7276,47 @@ impl<'a> Parser<'a> { (None, None) }; - // Redshift allows specifying DISTSTYLE after column definitions - let dist_style = if self.parse_keywords(&[Keyword::DISTSTYLE]) { - match self.parse_one_of_keywords(&[ - Keyword::EVEN, - Keyword::ALL, - Keyword::AUTO, - Keyword::KEY, - ]) { - Some(Keyword::EVEN) => Some(DistributionStyle::Even), - Some(Keyword::ALL) => Some(DistributionStyle::All), - Some(Keyword::AUTO) => Some(DistributionStyle::Auto), - Some(Keyword::KEY) => Some(DistributionStyle::Key), - _ => self.expected("KEY, EVEN, ALL or AUTO", self.peek_token())?, + // Redshift CREATE TABLE accepts DISTSTYLE / DISTKEY / SORTKEY in any + // order after the column definitions. Loop to consume each at most + // once. 
+ // https://docs.aws.amazon.com/redshift/latest/dg/r_CREATE_TABLE_NEW.html + let mut dist_style = None; + let mut dist_key = None; + let mut sort_key = None; + loop { + if dist_style.is_none() && self.parse_keyword(Keyword::DISTSTYLE) { + dist_style = match self.parse_one_of_keywords(&[ + Keyword::EVEN, + Keyword::ALL, + Keyword::AUTO, + Keyword::KEY, + ]) { + Some(Keyword::EVEN) => Some(DistributionStyle::Even), + Some(Keyword::ALL) => Some(DistributionStyle::All), + Some(Keyword::AUTO) => Some(DistributionStyle::Auto), + Some(Keyword::KEY) => Some(DistributionStyle::Key), + _ => self.expected("KEY, EVEN, ALL or AUTO", self.peek_token())?, + }; + } else if dist_key.is_none() && self.parse_keyword(Keyword::DISTKEY) { + self.expect_token(&Token::LParen)?; + let key = self.parse_identifier_or_number()?; + self.expect_token(&Token::RParen)?; + dist_key = Some(key); + } else if sort_key.is_none() + && (matches!(self.peek_token_kind(), Token::Word(w) if w.keyword == Keyword::SORTKEY) + || (matches!(self.peek_token_kind(), Token::Word(w) if w.keyword == Keyword::COMPOUND) + && matches!(self.peek_nth_token(1).token, Token::Word(w) if w.keyword == Keyword::SORTKEY))) + { + let compound = self.parse_keyword(Keyword::COMPOUND); + self.expect_keyword(Keyword::SORTKEY)?; + self.expect_token(&Token::LParen)?; + let columns = self.parse_comma_separated(|p| p.parse_identifier_or_number())?; + self.expect_token(&Token::RParen)?; + sort_key = Some(SortKey { compound, columns }); + } else { + break; } - } else { - None - }; - - // Redshift allows specifying DISTKEY after column definitions - // Column reference can be a name or a number (1-based column index) - let dist_key = if self.parse_keywords(&[Keyword::DISTKEY]) { - self.expect_token(&Token::LParen)?; - let key = self.parse_identifier_or_number()?; - self.expect_token(&Token::RParen)?; - Some(key) - } else { - None - }; - - // Redshift allows specifying SORTKEY after column definitions - // Column references can be names or 
numbers (1-based column index) - let compound_sort_key = self.parse_keywords(&[Keyword::COMPOUND]); - let sort_key = if self.parse_keywords(&[Keyword::SORTKEY]) { - self.expect_token(&Token::LParen)?; - let columns = self.parse_comma_separated(|p| p.parse_identifier_or_number())?; - self.expect_token(&Token::RParen)?; - Some(SortKey { - compound: compound_sort_key, - columns, - }) - } else { - None - }; + } // SQLite supports `WITHOUT ROWID` at the end of `CREATE TABLE` let without_rowid = self.parse_keywords(&[Keyword::WITHOUT, Keyword::ROWID]); @@ -7466,6 +7821,26 @@ impl<'a> Parser<'a> { None }; + // ANSI / Postgres / Teradata: `CREATE TABLE … AS WITH [NO] + // DATA [AND [NO] STATISTICS]`. The clause controls whether the new + // table is populated with the query results and whether statistics + // are collected. Doesn't change lineage; consume and discard. + // - https://www.postgresql.org/docs/current/sql-createtableas.html + // - https://docs.teradata.com/r/Enterprise_IntelliFlex_VMware/SQL-Data-Definition-Language-Syntax-and-Examples/Table-Statements/CREATE-TABLE-AS/AS-Subquery-Clause + let saved_idx = self.index; + if self.parse_keyword(Keyword::WITH) { + let _ = self.parse_keyword(Keyword::NO); + if self.parse_keyword(Keyword::DATA) { + if self.parse_keyword(Keyword::AND) { + let _ = self.parse_keyword(Keyword::NO); + let _ = self.parse_keyword(Keyword::STATISTICS); + } + } else { + // Not the WITH [NO] DATA shape — restore so other parsers + // (e.g. T-SQL `WITH (option=…)`) can take over. + self.index = saved_idx; + } + } // Snowflake: trailing `COPY GRANTS` after the `AS ` body. if !copy_grants && self.parse_keywords(&[Keyword::COPY, Keyword::GRANTS]) { copy_grants = true; @@ -7567,7 +7942,25 @@ impl<'a> Parser<'a> { } loop { - if let Some(projection) = if dialect_of!(self is ClickHouseDialect) { + // T-SQL system-versioned temporal tables: a table-level + // PERIOD FOR SYSTEM_TIME (start_col, end_col) + // pairs the row-start / row-end columns. 
The columns inside are + // already part of this table's column list, so the clause adds + // no new lineage; consume and discard. + // https://learn.microsoft.com/en-us/sql/relational-databases/tables/creating-a-system-versioned-temporal-table + if dialect_of!(self is MsSqlDialect | GenericDialect) + && matches!(self.peek_token_kind(), Token::Word(w) if w.keyword == Keyword::PERIOD) + && matches!(self.peek_nth_token(1).token, Token::Word(ref w) if w.keyword == Keyword::FOR) + { + self.next_token(); // PERIOD + self.next_token(); // FOR + self.expect_keyword(Keyword::SYSTEM_TIME)?; + self.expect_token(&Token::LParen)?; + let _ = self.parse_identifier(false)?; + self.expect_token(&Token::Comma)?; + let _ = self.parse_identifier(false)?; + self.expect_token(&Token::RParen)?; + } else if let Some(projection) = if dialect_of!(self is ClickHouseDialect) { self.parse_optional_table_projection()? } else { None @@ -7812,6 +8205,9 @@ impl<'a> Parser<'a> { let next_token = self.next_token(); match next_token.token { Token::SingleQuotedString(value, ..) => Ok(Some(ColumnOption::Comment(value))), + // Snowflake / Postgres dollar-quoted string for column COMMENT + // (`COMMENT $$some comment$$`). + Token::DollarQuotedString(s) => Ok(Some(ColumnOption::Comment(s.value))), _ => self.expected("string", next_token), } } else if self.parse_keyword(Keyword::NULL) { @@ -7848,7 +8244,13 @@ impl<'a> Parser<'a> { is_primary: false, characteristics, })) - } else if self.parse_keyword(Keyword::REFERENCES) { + } else if self.parse_keywords(&[Keyword::FOREIGN, Keyword::KEY, Keyword::REFERENCES]) + || self.parse_keyword(Keyword::REFERENCES) + { + // Snowflake column-level: ` [NOT NULL] FOREIGN KEY + // REFERENCES [()]`. Postgres / ANSI allow the + // shorter ` REFERENCES [()]`. 
+ // https://docs.snowflake.com/en/sql-reference/sql/create-table let foreign_table = self.parse_object_name(false)?; // PostgreSQL allows omitting the column list and // uses the primary key column of the foreign table by default @@ -8009,6 +8411,69 @@ impl<'a> Parser<'a> { Ok(Some(ColumnOption::DialectSpecific(vec![ Token::make_keyword("SORTKEY"), ]))) + } else if dialect_of!(self is GenericDialect | AnsiDialect) + && self.parse_keyword(Keyword::FORMAT) + { + // Teradata column attribute: FORMAT ''. + // https://docs.teradata.com/r/Enterprise_IntelliFlex_VMware/SQL-Data-Definition-Language-Detailed-Topics/CREATE-TABLE/Column-Level-Attributes-for-Database-Object-Creation + let _ = self.parse_literal_string()?; + Ok(Some(ColumnOption::DialectSpecific(vec![ + Token::make_keyword("FORMAT"), + ]))) + } else if dialect_of!(self is GenericDialect | AnsiDialect) + && matches!( + self.peek_token_kind(), + Token::Word(w) if w.value.eq_ignore_ascii_case("TITLE") + ) + { + // Teradata column attribute: TITLE ''. + self.next_token(); + let _ = self.parse_literal_string()?; + Ok(Some(ColumnOption::DialectSpecific(vec![ + Token::make_keyword("TITLE"), + ]))) + } else if dialect_of!(self is GenericDialect | AnsiDialect) + && matches!( + self.peek_token_kind(), + Token::Word(w) if w.value.eq_ignore_ascii_case("COMPRESS") + ) + { + // Teradata column attribute: COMPRESS [(value [, value]*)] — + // value-list compression. Consume the optional paren list + // opaquely; the values are constants, no lineage content. 
+ self.next_token(); + if self.consume_token(&Token::LParen) { + let mut depth = 1i32; + while depth > 0 { + match self.next_token().token { + Token::LParen => depth += 1, + Token::RParen => depth -= 1, + Token::EOF => break, + _ => {} + } + } + } + Ok(Some(ColumnOption::DialectSpecific(vec![ + Token::make_keyword("COMPRESS"), + ]))) + } else if dialect_of!(self is GenericDialect | AnsiDialect) + && matches!( + self.peek_token_kind(), + Token::Word(w) if w.value.eq_ignore_ascii_case("INLINE") + ) + && matches!( + self.peek_nth_token(1).token, + Token::Word(w) if w.value.eq_ignore_ascii_case("LENGTH") + ) + { + // Teradata column attribute: INLINE LENGTH <n>. + self.next_token(); // INLINE + self.next_token(); // LENGTH + let _ = self.parse_literal_uint()?; + Ok(Some(ColumnOption::DialectSpecific(vec![ + Token::make_keyword("INLINE"), + Token::make_keyword("LENGTH"), + ]))) } else { Ok(None) } @@ -8017,11 +8482,7 @@ impl<'a> Parser<'a> { &mut self, ) -> Result<Option<ColumnOption>, ParserError> { if self.parse_keywords(&[Keyword::ALWAYS, Keyword::AS, Keyword::IDENTITY]) { - let mut sequence_options = vec![]; - if self.expect_token(&Token::LParen).is_ok() { - sequence_options = self.parse_create_sequence_options()?; - self.expect_token(&Token::RParen)?; - } + let sequence_options = self.parse_identity_paren_options()?; Ok(Some(ColumnOption::Generated { generated_as: GeneratedAs::Always, sequence_options: Some(sequence_options), @@ -8033,17 +8494,46 @@ impl<'a> Parser<'a> { Keyword::AS, Keyword::IDENTITY, ]) { - let mut sequence_options = vec![]; - if self.expect_token(&Token::LParen).is_ok() { - sequence_options = self.parse_create_sequence_options()?; - self.expect_token(&Token::RParen)?; - } + let sequence_options = self.parse_identity_paren_options()?; Ok(Some(ColumnOption::Generated { generated_as: GeneratedAs::ByDefault, sequence_options: Some(sequence_options), generation_expr: None, })) } else if self.parse_keywords(&[Keyword::ALWAYS, Keyword::AS]) { + // 
T-SQL system-versioned temporal table column markers: + // <col> DATETIME2 GENERATED ALWAYS AS ROW START [HIDDEN] + // <col> DATETIME2 GENERATED ALWAYS AS ROW END [HIDDEN] + // <col> GENERATED ALWAYS AS TRANSACTION_ID START [HIDDEN] + // <col> GENERATED ALWAYS AS TRANSACTION_ID END [HIDDEN] + // https://learn.microsoft.com/en-us/sql/relational-databases/tables/creating-a-system-versioned-temporal-table + // The marker doesn't carry an expression — the value is filled + // in by SQL Server itself. Surface as a `DialectSpecific` option + // so the column ref / type are preserved for lineage. + if matches!( + self.peek_token_kind(), + Token::Word(w) if w.value.eq_ignore_ascii_case("ROW") + || w.value.eq_ignore_ascii_case("TRANSACTION_ID") + ) { + let kind = self.next_token().token; + let kind_word = match &kind { + Token::Word(w) => Token::make_word(&w.value.to_ascii_uppercase(), None), + _ => unreachable!(), + }; + let _ = self.parse_one_of_keywords(&[Keyword::START, Keyword::END]); + if matches!( + self.peek_token_kind(), + Token::Word(w) if w.value.eq_ignore_ascii_case("HIDDEN") + ) { + self.next_token(); + } + return Ok(Some(ColumnOption::DialectSpecific(vec![ + Token::make_keyword("GENERATED"), + Token::make_keyword("ALWAYS"), + Token::make_keyword("AS"), + kind_word, + ]))); + } if self.expect_token(&Token::LParen).is_ok() { let expr = self.parse_expr()?; self.expect_token(&Token::RParen)?; @@ -9383,7 +9873,18 @@ impl<'a> Parser<'a> { next_token.span.start ), }, - Token::SingleQuotedString(ref s) => Ok(Value::SingleQuotedString(s.to_string())), + Token::SingleQuotedString(ref s) => { + // ANSI SQL / BigQuery / Postgres / Snowflake: adjacent string + // literals separated by whitespace are concatenated into a + // single literal. e.g. `'foo' 'bar'` parses as `'foobar'`. 
+ // https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#string_and_bytes_literals + let mut value = s.to_string(); + while let Token::SingleQuotedString(ref next) = self.peek_token().token { + value.push_str(next); + self.next_token(); + } + Ok(Value::SingleQuotedString(value)) + } Token::DoubleQuotedString(ref s) => Ok(Value::DoubleQuotedString(s.to_string())), Token::DollarQuotedString(ref s) => Ok(Value::DollarQuotedString(s.clone())), Token::SingleQuotedByteStringLiteral(ref s) => { @@ -10348,6 +10849,46 @@ impl<'a> Parser<'a> { let mut idents = vec![]; loop { idents.push(self.parse_identifier(in_table_clause)?.unwrap()); + // BigQuery path expressions allow the last segment to start with a + // digit: `foo.bar.25ab`, `foo.bar.25_`, `foo.bar.25`. The + // tokenizer greedily folds the leading `.` into the number, so + // `bar.25ab` becomes `Word("bar")` then `Number(".25")` then + // `Word("ab")`. When we see that shape after a path segment, + // peel the leading `.` off the Number and treat the digit prefix + // (concatenated with any adjacent Word) as the next segment. 
+ // https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#path_expressions + if dialect_of!(self is BigQueryDialect | GenericDialect) { + let leading_dot_digits = match self.tokens.get(self.index) { + Some(t) => match &t.token { + Token::Number(s, false) + if s.starts_with('.') + && s.len() > 1 + && s[1..].chars().all(|c| c.is_ascii_digit()) => + { + Some(s[1..].to_string()) + } + _ => None, + }, + None => None, + }; + if let Some(digits) = leading_dot_digits { + self.index += 1; // consume the Number token + let mut value = digits; + if let Some(next) = self.tokens.get(self.index) { + if let Token::Word(w) = &next.token { + if w.quote_style.is_none() { + value.push_str(&w.value); + self.index += 1; + } + } + } + idents.push(Ident::new(value)); + if !self.consume_token(&Token::Period) { + break; + } + continue; + } + } if !self.consume_token(&Token::Period) { break; } @@ -10564,6 +11105,45 @@ impl<'a> Parser<'a> { in_table_clause: bool, ) -> Result<WithSpan<Ident>, ParserError> { let start_span = self.index; + + // Snowflake `IDENTIFIER(<value>)` lets any of the following stand in + // for an identifier: + // IDENTIFIER('<name>') -- string literal (most common) + // IDENTIFIER($foo) -- session variable + // IDENTIFIER(?) -- bind parameter + // https://docs.snowflake.com/en/sql-reference/identifier-literal + // The placeholder forms carry no compile-time name, so we surface a + // synthetic one (the placeholder text) just to keep parsing going — + // execution would resolve it at run time anyway. 
+ if dialect_of!(self is SnowflakeDialect | GenericDialect) { + if let Token::Word(w) = &self.peek_token().token { + if w.quote_style.is_none() && w.value.eq_ignore_ascii_case("IDENTIFIER") { + let saved = self.index; + self.next_token(); // consume IDENTIFIER + if self.consume_token(&Token::LParen) { + let inner = self.next_token(); + let value = match inner.token { + Token::SingleQuotedString(s) => Some((s, Some('\''))), + Token::DoubleQuotedString(s) => Some((s, Some('"'))), + Token::Placeholder(p) => Some((p, None)), + _ => None, + }; + if let Some((s, q)) = value { + if self.consume_token(&Token::RParen) { + let ident = match q { + Some(quote) => Ident::with_quote(quote, s), + None => Ident::new(s), + }; + return Ok(ident.spanning(self.span_from_index(start_span))); + } + } + } + // Not the IDENTIFIER(<value>) shape — back up and parse normally. + self.index = saved; + } + } + } + let next_token = self.next_token(); match next_token.token { Token::Word(w) => { @@ -11157,6 +11737,12 @@ impl<'a> Parser<'a> { }; let from = self.parse_comma_separated(Parser::parse_table_and_joins)?; + // ClickHouse distributed-DDL clause: `DELETE FROM tbl ON CLUSTER <name> + // WHERE …`. Doesn't change the lineage shape; consume and discard. + // https://clickhouse.com/docs/sql-reference/distributed-ddl + if dialect_of!(self is ClickHouseDialect | GenericDialect) { + let _ = self.parse_optional_on_cluster()?; + } let using = if self.parse_keyword(Keyword::USING) { Some(self.parse_comma_separated(Parser::parse_table_and_joins)?) 
} else { @@ -11938,7 +12524,7 @@ impl<'a> Parser<'a> { } pub fn parse_set_quantifier(&mut self, op: &Option<SetOperator>) -> SetQuantifier { - match op { + let q = match op { Some(SetOperator::Union) => { if self.parse_keywords(&[Keyword::BY, Keyword::NAME]) { SetQuantifier::ByName @@ -11970,7 +12556,62 @@ impl<'a> Parser<'a> { } } _ => SetQuantifier::None, + }; + // ANSI SQL / BigQuery set-operator suffixes — accept in any order: + // CORRESPONDING [BY (col, col, …)] -- match legs by column name + // STRICT -- type-strict union + // ON (col, col, …) -- BigQuery `BY NAME ON (cols)` + // All are opaque to lineage (the column names inside already + // appear in the SELECT legs); consume balanced paren blocks and + // discard. + // https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#set_operators + loop { + let advanced = if matches!( + self.peek_token_kind(), + Token::Word(w) if w.value.eq_ignore_ascii_case("CORRESPONDING") + ) { + self.next_token(); + if self.parse_keyword(Keyword::BY) && self.consume_token(&Token::LParen) { + let mut depth = 1i32; + while depth > 0 { + match self.next_token().token { + Token::LParen => depth += 1, + Token::RParen => depth -= 1, + Token::EOF => break, + _ => {} + } + } + } + true + } else if matches!( + self.peek_token_kind(), + Token::Word(w) if w.value.eq_ignore_ascii_case("STRICT") + ) { + self.next_token(); + true + } else if matches!(self.peek_token_kind(), Token::Word(w) if w.keyword == Keyword::ON) + && matches!(self.peek_nth_token(1).token, Token::LParen) + { + self.next_token(); // ON + self.next_token(); // ( + let mut depth = 1i32; + while depth > 0 { + match self.next_token().token { + Token::LParen => depth += 1, + Token::RParen => depth -= 1, + Token::EOF => break, + _ => {} + } + } + true + } else { + false + }; + if !advanced { + break; + } } + q } /// Parse a restricted `SELECT` statement (no CTEs / `UNION` / `ORDER BY`), @@ -12803,11 +13444,80 @@ impl<'a> Parser<'a> { } let _ = kw; // kw 
discarded; preserved name not needed in AST. } + // DuckDB `USING SAMPLE` clause: + // tbl USING SAMPLE 10% + // tbl USING SAMPLE 10 ROWS + // tbl USING SAMPLE SYSTEM (10 PERCENT) + // tbl USING SAMPLE RESERVOIR (50 ROWS) REPEATABLE (100) + // https://duckdb.org/docs/sql/samples + // The clause is opaque to lineage (no table/column refs inside); + // count parens / consume balanced tokens and discard. We only enter + // this branch when `USING` is *not* followed by `(` (which would be + // a JOIN's USING (col, ...) constraint). + if dialect_of!(self is DuckDbDialect | GenericDialect) + && matches!(self.peek_token_kind(), Token::Word(w) if w.keyword == Keyword::USING) + && matches!( + self.peek_nth_token(1).token, + Token::Word(w) if w.value.eq_ignore_ascii_case("SAMPLE") + ) + { + self.next_token(); // USING + self.next_token(); // SAMPLE + // Skip an optional method keyword (SYSTEM | RESERVOIR | BERNOULLI). + if let Token::Word(w) = self.peek_token_kind() { + if matches!( + w.value.to_ascii_uppercase().as_str(), + "SYSTEM" | "RESERVOIR" | "BERNOULLI" + ) { + self.next_token(); + } + } + // The sample size is either a bare `<n>[%]` / `<n> ROWS` or a + // parenthesised group `(<n> [PERCENT|ROWS])`. Then optional + // `REPEATABLE (<seed>)`. Consume by paren-balance / linear + // skip up to a JOIN/clause keyword or end of FROM list. 
+ loop { + match self.peek_token_kind() { + Token::LParen => { + self.next_token(); + let mut depth = 1i32; + while depth > 0 { + match self.next_token().token { + Token::LParen => depth += 1, + Token::RParen => depth -= 1, + Token::EOF => break, + _ => {} + } + } + } + Token::Number(_, _) | Token::Mod => { + self.next_token(); + } + Token::Word(w) + if matches!( + w.value.to_ascii_uppercase().as_str(), + "ROWS" | "PERCENT" | "REPEATABLE" + ) => + { + self.next_token(); + } + _ => break, + } + } + } + // Note that for keywords to be properly handled here, they need to be // added to `RESERVED_FOR_TABLE_ALIAS`, otherwise they may be parsed as // a table alias. let mut joins = vec![]; loop { + // ClickHouse `GLOBAL` is a distributed-query modifier prefixing + // any JOIN type. It doesn't change the join shape; consume and + // ignore. + // https://clickhouse.com/docs/sql-reference/statements/select/join + if dialect_of!(self is ClickHouseDialect | GenericDialect) { + let _ = self.parse_keyword(Keyword::GLOBAL); + } let join = if self.parse_keyword(Keyword::CROSS) { let join_operator = if self.parse_keyword(Keyword::JOIN) { JoinOperator::CrossJoin @@ -12892,6 +13602,14 @@ impl<'a> Parser<'a> { } } let _ = self.parse_keyword(Keyword::INNER); // [ INNER ] + // ClickHouse: `INNER [ANY|ASOF|ALL] JOIN` + if dialect_of!(self is ClickHouseDialect | GenericDialect) { + let _ = self.parse_one_of_keywords(&[ + Keyword::ANY, + Keyword::ALL, + Keyword::ASOF, + ]); + } self.expect_keyword(Keyword::JOIN)?; JoinOperator::Inner } @@ -12915,6 +13633,19 @@ impl<'a> Parser<'a> { } let _ = self.next_token(); // consume LEFT/RIGHT let is_left = kw == Keyword::LEFT; + // ClickHouse modifiers: `LEFT [ANY|ASOF|ALL] JOIN` and + // similarly for RIGHT/INNER. Consume and treat as the + // base outer join — the modifier doesn't change + // lineage shape (table refs and join condition are the + // same), so the AST loss is acceptable. 
+ // https://clickhouse.com/docs/sql-reference/statements/select/join + if dialect_of!(self is ClickHouseDialect | GenericDialect) { + let _ = self.parse_one_of_keywords(&[ + Keyword::ANY, + Keyword::ALL, + Keyword::ASOF, + ]); + } let join_type = self.parse_one_of_keywords(&[ Keyword::OUTER, Keyword::SEMI, @@ -13064,6 +13795,61 @@ impl<'a> Parser<'a> { /// A table name or a parenthesized subquery, followed by optional `[AS] alias` pub fn parse_table_factor(&mut self) -> Result<TableFactor, ParserError> { + // BigQuery legacy SQL bracket-quoted table identifier: + // FROM [project-id:dataset.table] + // The Standard SQL equivalent uses backticks: + // FROM `project-id.dataset.table` + // https://cloud.google.com/bigquery/docs/reference/legacy-sql + // Customers still submit legacy-SQL queries through warehouses that + // accept them. Capture the inner identifier as a single backticked + // ObjectName segment so lineage tracks the table reference. + if dialect_of!(self is BigQueryDialect | GenericDialect) + && self.peek_token_is(&Token::LBracket) + { + self.expect_token(&Token::LBracket)?; + // Concatenate every token inside the brackets into a single + // string. Tokens that aren't simple words / colons / dots / + // hyphens / digits would indicate something other than a legacy + // table identifier; bail and leave the position unchanged so + // other callers (e.g. array-literal contexts) can take over. 
+ let saved = self.index - 1; // before LBracket + let mut name = String::new(); + let mut ok = true; + loop { + let token = self.next_token().token; + match token { + Token::RBracket => break, + Token::Word(w) => name.push_str(&w.value), + Token::Number(n, _) => name.push_str(&n), + Token::Period => name.push('.'), + Token::Colon => name.push(':'), + Token::Minus => name.push('-'), + Token::Mul => name.push('*'), + Token::EOF => { + ok = false; + break; + } + _ => { + ok = false; + break; + } + } + } + if ok && !name.is_empty() { + let alias = + self.parse_optional_table_alias(keywords::RESERVED_FOR_TABLE_ALIAS)?; + return Ok(TableFactor::Table { + name: ObjectName(vec![Ident::with_quote('`', name)]), + alias, + args: None, + with_hints: vec![], + version: None, + partitions: vec![], + with_ordinality: false, + }); + } + self.index = saved; + } // Databricks `FROM STREAM table_name` streaming read modifier. // The STREAM keyword marks the source as a streaming read; it is not // preserved in the AST since the underlying table reference carries @@ -13168,7 +13954,27 @@ impl<'a> Parser<'a> { let expr = self.parse_expr()?; self.expect_token(&Token::RParen)?; let alias = self.parse_optional_table_alias(keywords::RESERVED_FOR_TABLE_ALIAS)?; - Ok(TableFactor::TableFunction { expr, alias }) + let mut table = TableFactor::TableFunction { expr, alias }; + // Snowflake allows TABLESAMPLE after a TABLE(<expr>) reference: + // FROM TABLE('t1') TABLESAMPLE BERNOULLI (20.3) + // Loop through the same suffix keywords as plain table refs. 
+ while let Some(kw) = self.parse_one_of_keywords(&[ + Keyword::PIVOT, + Keyword::UNPIVOT, + Keyword::TABLESAMPLE, + Keyword::SAMPLE, + Keyword::MATCH_RECOGNIZE, + ]) { + table = match kw { + Keyword::PIVOT => self.parse_pivot_table_factor(table)?, + Keyword::UNPIVOT => self.parse_unpivot_table_factor(table)?, + Keyword::TABLESAMPLE => self.parse_tablesample_table_factor(table, false)?, + Keyword::SAMPLE => self.parse_tablesample_table_factor(table, true)?, + Keyword::MATCH_RECOGNIZE => self.parse_match_recognize_table_factor(table)?, + _ => unreachable!(), + }; + } + Ok(table) } else if self.consume_token(&Token::LParen) { // A left paren introduces either a derived table (i.e., a subquery) // or a nested join. It's nearly impossible to determine ahead of @@ -13441,8 +14247,10 @@ impl<'a> Parser<'a> { vec![] }; - // Parse potential version qualifier - let version = self.parse_table_version()?; + // Parse potential version qualifier (Snowflake CHANGES, etc.) + // BigQuery/MSSQL `FOR SYSTEM_TIME AS OF` is also legal here, but + // the more common position is *after* the alias — handle both. + let mut version = self.parse_table_version()?; // Postgres, MSSQL: table-valued functions: let args = if self.consume_token(&Token::LParen) { @@ -13457,6 +14265,13 @@ impl<'a> Parser<'a> { let alias = self.parse_optional_table_alias(keywords::RESERVED_FOR_TABLE_ALIAS)?; + // BigQuery / MSSQL: `FROM tbl [alias] FOR SYSTEM_TIME AS OF expr` + // (the FOR clause typically follows the alias). + // https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#for_system_time_as_of + if version.is_none() { + version = self.parse_table_version()?; + } + // ClickHouse: SELECT ... 
FROM table [AS alias] FINAL // Skip FINAL keyword (doesn't affect lineage) if dialect_of!(self is ClickHouseDialect) { @@ -15044,6 +15859,31 @@ impl<'a> Parser<'a> { { let _alias = self.parse_identifier(false)?; } + // MySQL / Oracle / Postgres `JSON_TABLE` and `XMLTABLE` + // attach a `COLUMNS(<col_defs>)` clause to a path + // argument, defining the output row shape: + // JSON_TABLE(json, '$.path' COLUMNS(id INT PATH '$.id')) + // The clause defines output columns (no input table / + // column refs), so consume the balanced paren block + // opaquely. The path expression in `expr` is preserved. + // https://dev.mysql.com/doc/refman/8.4/en/json-table-functions.html + if matches!( + self.peek_token_kind(), + Token::Word(w) if w.value.eq_ignore_ascii_case("COLUMNS") + ) && matches!(self.peek_nth_token(1).token, Token::LParen) + { + self.next_token(); // COLUMNS + self.next_token(); // ( + let mut depth = 1i32; + while depth > 0 { + match self.next_token().token { + Token::LParen => depth += 1, + Token::RParen => depth -= 1, + Token::EOF => break, + _ => {} + } + } + } FunctionArgExpr::Expr(expr) } }; @@ -16744,6 +17584,30 @@ impl<'a> Parser<'a> { self.parse_number_value() } + /// Parse the optional paren block after `GENERATED [ALWAYS|BY DEFAULT] AS + /// IDENTITY`. Two shapes are accepted: + /// - `(START WITH n INCREMENT BY n …)` — Postgres-style keyword options. + /// - `(<seed>, <step>)` — Redshift two-arg shorthand. + /// https://docs.aws.amazon.com/redshift/latest/dg/r_CREATE_TABLE_NEW.html + fn parse_identity_paren_options(&mut self) -> Result<Vec<SequenceOptions>, ParserError> { + let mut sequence_options = vec![]; + if self.expect_token(&Token::LParen).is_ok() { + // Detect Redshift `(seed, step)` shorthand by peeking for a + // bare number before any keyword. 
+ if matches!(self.peek_token_kind(), Token::Number(_, _) | Token::Minus | Token::Plus) { + let seed = Expr::Value(self.parse_sequence_number_value()?); + self.expect_token(&Token::Comma)?; + let step = Expr::Value(self.parse_sequence_number_value()?); + sequence_options.push(SequenceOptions::StartWith(seed, false)); + sequence_options.push(SequenceOptions::IncrementBy(step, false)); + } else { + sequence_options = self.parse_create_sequence_options()?; + } + self.expect_token(&Token::RParen)?; + } + Ok(sequence_options) + } + fn parse_create_sequence_options(&mut self) -> Result<Vec<SequenceOptions>, ParserError> { let mut sequence_options = vec![]; loop { @@ -16808,6 +17672,13 @@ impl<'a> Parser<'a> { sequence_options.push(SequenceOptions::OrderBy(true)); } else if self.parse_keyword(Keyword::NOORDER) { sequence_options.push(SequenceOptions::OrderBy(false)); + } + // COMMENT = '<string>' (Snowflake): user-provided sequence comment. + // https://docs.snowflake.com/en/sql-reference/sql/create-sequence + // No lineage content; consume the value and discard. + else if self.parse_keyword(Keyword::COMMENT) { + let _ = self.consume_token(&Token::Eq); + let _ = self.parse_literal_string()?; } else { break; } diff --git a/tests/sqlparser_bigquery.rs b/tests/sqlparser_bigquery.rs index 7ddca119a9..d49df3f5b9 100644 --- a/tests/sqlparser_bigquery.rs +++ b/tests/sqlparser_bigquery.rs @@ -896,6 +896,26 @@ fn parse_table_identifiers() { test_table_ident_err("bar-"); } +#[test] +fn parse_bigquery_path_segment_starts_with_digit() { + // BigQuery path expressions allow the last segment to start with a digit + // (e.g. `foo.bar.25ab`, `foo.bar.25`, `foo.bar.25_`). The tokenizer + // greedily folds the leading `.` into the number; `parse_object_name` + // peels it back off. 
+ // https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#path_expressions + let cases = [ + "SELECT * FROM foo.bar.25ab c", + "SELECT * FROM foo.bar.25", + "SELECT * FROM foo.bar.25x a", + "SELECT * FROM foo.bar.25_", + ]; + for sql in cases { + bigquery() + .parse_sql_statements(sql) + .unwrap_or_else(|e| panic!("failed to parse `{sql}`: {e}")); + } +} + #[test] fn parse_hyphenated_table_identifiers() { bigquery().one_statement_parses_to( @@ -2454,3 +2474,134 @@ fn test_bigquery_raw_string_escaped_quote() { .parse_sql_statements(r#"SELECT r"escaped \" quote stays in string""#) .unwrap(); } + +#[test] +fn parse_bigquery_union_corresponding() { + // BigQuery / ANSI SQL set-operator suffixes: CORRESPONDING [BY (cols)] + // matches legs by column name; STRICT enforces type-strict union. + // https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#set_operators + let cases = [ + "SELECT 1 AS x UNION ALL CORRESPONDING SELECT 2 AS x", + "SELECT 1 AS x UNION ALL CORRESPONDING BY (foo, bar) SELECT 2 AS x", + "SELECT 1 AS x LEFT UNION ALL CORRESPONDING SELECT 2 AS x", + "SELECT 1 UNION ALL STRICT SELECT 2", + ]; + for sql in cases { + bigquery() + .parse_sql_statements(sql) + .unwrap_or_else(|e| panic!("failed to parse `{sql}`: {e}")); + } +} + +#[test] +fn parse_bigquery_union_strict_corresponding_on() { + // BigQuery extends UNION/INTERSECT/EXCEPT with three optional suffixes + // that can appear in any order: + // STRICT + // CORRESPONDING [BY (cols)] + // ON (cols) -- after BY NAME + // https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#set_operators + let cases = [ + "SELECT 1 AS x UNION ALL STRICT CORRESPONDING SELECT 2 AS x", + "SELECT 1 AS x UNION ALL STRICT CORRESPONDING BY (foo, bar) SELECT 2 AS x", + "SELECT 1 AS x INNER UNION ALL BY NAME ON (foo, bar) SELECT 2 AS x", + "SELECT 1 AS x UNION ALL BY NAME ON (foo, bar) SELECT 2 AS x", + ]; + for sql in cases { + bigquery() + .parse_sql_statements(sql) + 
.unwrap_or_else(|e| panic!("failed to parse `{sql}`: {e}")); + } +} + +#[test] +fn parse_bigquery_for_system_time_after_alias() { + // BigQuery / MSSQL `FOR SYSTEM_TIME AS OF <expr>` time-travel reads + // can appear after an optional alias. The previous parser only + // accepted the clause before the alias, so an aliased shape + // ("FROM tbl t FOR SYSTEM_TIME AS OF …") fell through to the + // FOR-UPDATE locks loop and errored. + // https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#for_system_time_as_of + let cases = [ + "SELECT * FROM tbl AS t FOR SYSTEM_TIME AS OF CURRENT_TIMESTAMP()", + "SELECT * FROM tbl t FOR SYSTEM_TIME AS OF CURRENT_TIMESTAMP() LEFT JOIN other o ON t.id = o.id", + "SELECT * FROM tbl FOR SYSTEM_TIME AS OF '2025-01-01'", + ]; + for sql in cases { + bigquery() + .parse_sql_statements(sql) + .unwrap_or_else(|e| panic!("failed to parse `{sql}`: {e}")); + } +} + +#[test] +fn parse_bigquery_reserved_keyword_in_parenthesised_list() { + // BigQuery wildcards with EXCEPT can name reserved-as-alias keywords: + // SELECT * EXCEPT(id, CLUSTER, MONTH) FROM t + // Previously the trailing-comma terminator misread `, CLUSTER )` as a + // trailing comma + clause keyword, leaving CLUSTER unconsumed. + let cases = [ + "SELECT * EXCEPT(id_2, CLUSTER) FROM tbl", + "SELECT * EXCEPT(MONTH, CLUSTER) FROM tbl", + "SELECT t.* EXCEPT(id, CLUSTER, MONTH) FROM tbl t", + ]; + for sql in cases { + bigquery() + .parse_sql_statements(sql) + .unwrap_or_else(|e| panic!("failed to parse `{sql}`: {e}")); + } +} + +#[test] +fn parse_bigquery_create_function_deterministic_marker() { + // BigQuery's CREATE FUNCTION accepts an optional `[NOT] DETERMINISTIC` + // marker between RETURNS and LANGUAGE. 
+ // https://cloud.google.com/bigquery/docs/reference/standard-sql/data-definition-language#create_a_function + let cases = [ + "CREATE TEMPORARY FUNCTION a(x FLOAT64, y FLOAT64) RETURNS FLOAT64 NOT DETERMINISTIC LANGUAGE js AS 'return x*y;'", + "CREATE TEMPORARY FUNCTION a(x FLOAT64) RETURNS FLOAT64 DETERMINISTIC LANGUAGE js AS 'return x;'", + ]; + for sql in cases { + bigquery() + .parse_sql_statements(sql) + .unwrap_or_else(|e| panic!("failed to parse `{sql}`: {e}")); + } +} + +#[test] +fn parse_bigquery_legacy_bracket_table_ref() { + // BigQuery legacy SQL bracket-quoted table identifier: + // FROM [project-id:dataset.table] + // Standard SQL replaces these with backticks, but customers still send + // legacy-SQL queries through the wire. Inside the brackets we capture + // any project/dataset/table chars as a single backticked ObjectName. + // https://cloud.google.com/bigquery/docs/reference/legacy-sql + let cases = [ + "SELECT * FROM [my-proj-123:dataset.table]", + "SELECT * FROM [proj:ds.tbl] AS t", + "SELECT * FROM [a-b-c:d.e] WHERE x = 1", + ]; + for sql in cases { + bigquery() + .parse_sql_statements(sql) + .unwrap_or_else(|e| panic!("failed to parse `{sql}`: {e}")); + } +} + +#[test] +fn parse_bigquery_at_time_zone_double_quoted_string() { + // BigQuery accepts both single- and double-quoted string literals, + // and `AT TIME ZONE` takes a string. The previous parser only + // recognized single-quoted form, so `EXTRACT(HOUR FROM ts AT TIME + // ZONE "Asia/Tokyo")` failed. + bigquery() + .parse_sql_statements("SELECT EXTRACT(HOUR FROM ts AT TIME ZONE \"Asia/Tokyo\")") + .unwrap(); + bigquery() + .parse_sql_statements("SELECT ts AT TIME ZONE \"UTC\"") + .unwrap(); + // Single-quoted form still parses everywhere. 
+ bigquery() + .parse_sql_statements("SELECT ts AT TIME ZONE 'UTC'") + .unwrap(); +} diff --git a/tests/sqlparser_clickhouse.rs b/tests/sqlparser_clickhouse.rs index 13eb789667..370d5f6186 100644 --- a/tests/sqlparser_clickhouse.rs +++ b/tests/sqlparser_clickhouse.rs @@ -1838,3 +1838,41 @@ fn parse_grant_on_wildcard() { clickhouse_and_generic().verified_stmt("GRANT SELECT ON mydb.* TO john"); clickhouse_and_generic().verified_stmt("GRANT IMPERSONATE ON * TO user3"); } + +#[test] +fn parse_clickhouse_any_asof_global_joins() { + // ClickHouse JOIN modifiers: ANY, ASOF, ALL after [INNER|LEFT|RIGHT], + // and the GLOBAL distributed-query prefix on any JOIN type. + // https://clickhouse.com/docs/sql-reference/statements/select/join + let cases = [ + "SELECT * FROM foo LEFT ANY JOIN bla", + "SELECT * FROM foo LEFT ASOF JOIN bla", + "SELECT * FROM foo LEFT ALL JOIN bla", + "SELECT * FROM foo GLOBAL LEFT ANY JOIN bla ON foo.c1 = bla.c2", + "SELECT * FROM foo GLOBAL JOIN bla ON foo.c1 = bla.c2", + "SELECT * FROM foo INNER ANY JOIN bla USING (c)", + "SELECT * FROM foo RIGHT ASOF JOIN bla ON foo.c1 = bla.c2", + ]; + for sql in cases { + clickhouse() + .parse_sql_statements(sql) + .unwrap_or_else(|e| panic!("failed to parse `{sql}`: {e}")); + } +} + +#[test] +fn parse_clickhouse_delete_on_cluster() { + // ClickHouse distributed-DDL: `DELETE FROM tbl ON CLUSTER <name> WHERE …`. + // The ON CLUSTER clause routes the statement to all shards. + // https://clickhouse.com/docs/sql-reference/distributed-ddl + clickhouse() + .parse_sql_statements("DELETE FROM tbl ON CLUSTER test_cluster WHERE date = '2019-01-01'") + .unwrap(); + clickhouse() + .parse_sql_statements("DELETE FROM tbl ON CLUSTER '{cluster}' WHERE date = '2019-01-01'") + .unwrap(); + // Plain DELETE still parses. 
+ clickhouse() + .parse_sql_statements("DELETE FROM tbl WHERE date = '2019-01-01'") + .unwrap(); +} diff --git a/tests/sqlparser_common.rs b/tests/sqlparser_common.rs index 7580d1c8f8..e6feb75527 100644 --- a/tests/sqlparser_common.rs +++ b/tests/sqlparser_common.rs @@ -9189,3 +9189,24 @@ fn interval_identifier() { ) .unwrap(); } + +#[test] +fn parse_teradata_column_attributes() { + // Teradata column-level attributes: FORMAT, TITLE, COMPRESS, INLINE LENGTH. + // https://docs.teradata.com/r/Enterprise_IntelliFlex_VMware/SQL-Data-Definition-Language-Detailed-Topics/CREATE-TABLE/Column-Level-Attributes-for-Database-Object-Creation + // Routed through GenericDialect / AnsiDialect (no dedicated Teradata dialect). + let cases = [ + "CREATE TABLE db.foo (id INT NOT NULL, valid_date DATE FORMAT 'YYYY-MM-DD', measurement INT COMPRESS)", + "CREATE TABLE foo (baz DATE FORMAT 'YYYY/MM/DD' TITLE 'title' INLINE LENGTH 1 COMPRESS ('a', 'b'))", + "CREATE TABLE db.foo (id INT NOT NULL, valid_date DATE FORMAT 'YYYY-MM-DD' COMPRESS (CAST('9999-09-09' AS DATE)), measurement INT)", + ]; + let dialects = TestedDialects { + dialects: vec![Box::new(GenericDialect {}), Box::new(AnsiDialect {})], + options: None, + }; + for sql in cases { + dialects + .parse_sql_statements(sql) + .unwrap_or_else(|e| panic!("failed to parse `{sql}`: {e}")); + } +} diff --git a/tests/sqlparser_duckdb.rs b/tests/sqlparser_duckdb.rs index fafe6b58de..1b238e4eda 100644 --- a/tests/sqlparser_duckdb.rs +++ b/tests/sqlparser_duckdb.rs @@ -508,3 +508,31 @@ fn test_escaped_string_literal() { duckdb().verified_only_select(r"SELECT E'\n'"); duckdb().one_statement_parses_to(r"SELECT e'\n'", r"SELECT E'\n'"); } + +#[test] +fn parse_duckdb_using_sample() { + // DuckDB's `USING SAMPLE` clause attaches a sample method to a FROM + // table reference. The clause is opaque to lineage — no table/column + // refs inside — but we must accept it without consuming the rest of + // the query. 
+ // https://duckdb.org/docs/sql/samples + let cases = [ + "SELECT * FROM tbl USING SAMPLE 10%", + "SELECT * FROM tbl USING SAMPLE 10 ROWS", + "SELECT * FROM tbl USING SAMPLE SYSTEM (10 PERCENT)", + "SELECT * FROM tbl USING SAMPLE SYSTEM (10 PERCENT) REPEATABLE (377)", + "SELECT * FROM tbl USING SAMPLE RESERVOIR (50 ROWS) REPEATABLE (100)", + "SELECT * FROM tbl USING SAMPLE BERNOULLI (5 PERCENT)", + ]; + for sql in cases { + duckdb() + .parse_sql_statements(sql) + .unwrap_or_else(|e| panic!("failed to parse `{sql}`: {e}")); + } + + // JOIN's USING (col) constraint must still parse normally — the new + // path only triggers when USING is followed by SAMPLE, not `(`. + duckdb() + .parse_sql_statements("SELECT * FROM a JOIN b USING (id)") + .unwrap(); +} diff --git a/tests/sqlparser_hive.rs b/tests/sqlparser_hive.rs index 86fba2bf38..7ce998a952 100644 --- a/tests/sqlparser_hive.rs +++ b/tests/sqlparser_hive.rs @@ -500,9 +500,71 @@ fn parse_similar_to() { chk(true); } +#[test] +fn parse_hive_row_format_serde_with_serdeproperties() { + // Hive `ROW FORMAT SERDE 'class' WITH SERDEPROPERTIES ('k'='v', …)`. + // SERDEPROPERTIES is not a Keyword; matched case-insensitively. 
+ // https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-RowFormats&SerDe + let cases = [ + "CREATE EXTERNAL TABLE foo (a INT, b STRING) \ + ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe' \ + WITH SERDEPROPERTIES ('case.insensitive'='FALSE') \ + LOCATION 's3://table/path'", + "CREATE EXTERNAL TABLE x (y INT) \ + ROW FORMAT SERDE 'serde' \ + ROW FORMAT DELIMITED FIELDS TERMINATED BY '1' \ + WITH SERDEPROPERTIES ('input.regex'='')", + ]; + for sql in cases { + hive() + .parse_sql_statements(sql) + .unwrap_or_else(|e| panic!("failed to parse `{sql}`: {e}")); + } +} + fn hive() -> TestedDialects { TestedDialects { dialects: vec![Box::new(HiveDialect {})], options: None, } } + +#[test] +fn parse_iceberg_partitioned_by_with_transforms() { + // Athena Iceberg / Trino-style PARTITIONED BY clauses use expressions + // (column refs + transform functions) instead of Hive-style column + // definitions. The parser auto-detects which form by peeking the + // second token after the first identifier. + // https://docs.aws.amazon.com/athena/latest/ug/querying-iceberg-creating-tables.html + let cases = [ + "CREATE TABLE t (`id` BIGINT, category STRING) PARTITIONED BY (category, BUCKET(16, id))", + "CREATE TABLE t (id BIGINT, ts TIMESTAMP) PARTITIONED BY (TRUNCATE(8, id), DAY(ts))", + // Hive column-def form still parses + "CREATE TABLE t (a INT) PARTITIONED BY (year INT)", + ]; + for sql in cases { + hive() + .parse_sql_statements(sql) + .unwrap_or_else(|e| panic!("failed to parse `{sql}`: {e}")); + } +} + +#[test] +fn parse_hive_create_table_comment_and_clustered_by() { + // Hive `CREATE [EXTERNAL] TABLE … COMMENT '<str>' [CLUSTERED BY (cols) + // [SORTED BY (cols)] INTO <n> BUCKETS]`. Both clauses are optional and + // can appear in either order before / between PARTITIONED BY and the + // ROW FORMAT / STORED AS / LOCATION block. 
+ // https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL + let cases = [ + "CREATE EXTERNAL TABLE foo (id INT) COMMENT 'test comment'", + "CREATE EXTERNAL TABLE foo (id INT, val STRING) CLUSTERED BY (id, val) INTO 10 BUCKETS", + "CREATE EXTERNAL TABLE foo (id INT) COMMENT 'c' PARTITIONED BY (a INT) CLUSTERED BY (id) INTO 5 BUCKETS", + "CREATE EXTERNAL TABLE foo (id INT) CLUSTERED BY (id) SORTED BY (id ASC) INTO 8 BUCKETS", + ]; + for sql in cases { + hive() + .parse_sql_statements(sql) + .unwrap_or_else(|e| panic!("failed to parse `{sql}`: {e}")); + } +} diff --git a/tests/sqlparser_mssql.rs b/tests/sqlparser_mssql.rs index 272933cecc..cd3e271830 100644 --- a/tests/sqlparser_mssql.rs +++ b/tests/sqlparser_mssql.rs @@ -707,3 +707,28 @@ fn parse_mssql_option_query_hint() { ms().parse_sql_statements("SELECT * FROM t OPTION (FORCE ORDER, MAXDOP 4)") .unwrap(); } + +#[test] +fn parse_mssql_temporal_table() { + // T-SQL system-versioned temporal table syntax: + // CREATE TABLE t ( + // <cols>, + // <start_col> DATETIME2 GENERATED ALWAYS AS ROW START [HIDDEN] [NOT NULL], + // <end_col> DATETIME2 GENERATED ALWAYS AS ROW END [HIDDEN] [NOT NULL], + // PERIOD FOR SYSTEM_TIME (<start_col>, <end_col>) + // ) [WITH (SYSTEM_VERSIONING = ON [(...)])] + // https://learn.microsoft.com/en-us/sql/relational-databases/tables/creating-a-system-versioned-temporal-table + let cases = [ + "CREATE TABLE test (a INT, b DATETIME2(2) GENERATED ALWAYS AS ROW START NOT NULL, \ + c DATETIME2(2) GENERATED ALWAYS AS ROW END NOT NULL, \ + PERIOD FOR SYSTEM_TIME (b, c))", + "CREATE TABLE test (a INT, b DATETIME2(2) GENERATED ALWAYS AS ROW START HIDDEN NOT NULL, \ + c DATETIME2(2) GENERATED ALWAYS AS ROW END HIDDEN NOT NULL, \ + PERIOD FOR SYSTEM_TIME (b, c)) WITH(SYSTEM_VERSIONING=ON)", + ]; + for sql in cases { + ms() + .parse_sql_statements(sql) + .unwrap_or_else(|e| panic!("failed to parse `{sql}`: {e}")); + } +} diff --git a/tests/sqlparser_mysql.rs b/tests/sqlparser_mysql.rs 
index 1de948edcb..f570646eb0 100644 --- a/tests/sqlparser_mysql.rs +++ b/tests/sqlparser_mysql.rs @@ -2077,3 +2077,38 @@ fn parse_mysql_index_hints() { .parse_sql_statements("SELECT * FROM t FORCE INDEX (idx)") .unwrap(); } + +#[test] +fn parse_replace_into_statement() { + // MySQL `REPLACE [INTO]` is INSERT-with-replace semantics — delete on + // primary-key conflict and insert the new row. Same shape as INSERT + // INTO, just different leading verb. + // https://dev.mysql.com/doc/refman/8.4/en/replace.html + mysql() + .parse_sql_statements("REPLACE INTO mytable SELECT id FROM other WHERE cnt > 100") + .unwrap(); + mysql() + .parse_sql_statements("REPLACE INTO t (a, b) VALUES (1, 2)") + .unwrap(); +} + +#[test] +fn parse_json_table_with_columns_clause() { + // MySQL JSON_TABLE function attaches a `COLUMNS(<col_defs>)` clause + // to a path argument, defining the output row shape. The columns + // clause is opaque to lineage (no input refs); the path expression + // is preserved for grammar coverage. 
+ // https://dev.mysql.com/doc/refman/8.4/en/json-table-functions.html + mysql() + .parse_sql_statements( + "SELECT * FROM t, JSON_TABLE(t.j, '$[*]' \ + COLUMNS(id INT PATH '$.id', name VARCHAR(255) PATH '$.name')) AS q", + ) + .unwrap(); + mysql() + .parse_sql_statements( + "SELECT * FROM JSON_TABLE(j, '$.org[*]' \ + COLUMNS(row_id FOR ORDINALITY, link VARCHAR(255) PATH '$.link')) AS links", + ) + .unwrap(); +} diff --git a/tests/sqlparser_postgres.rs b/tests/sqlparser_postgres.rs index 838e168b85..0d94702ee8 100644 --- a/tests/sqlparser_postgres.rs +++ b/tests/sqlparser_postgres.rs @@ -4583,3 +4583,23 @@ fn test_postgres_for_update_not_an_alias() { pg().parse_sql_statements("SELECT * FROM (SELECT * FROM mytable FOR UPDATE) ss WHERE col1 = 5") .unwrap(); } + +#[test] +fn parse_create_table_as_with_data_clause() { + // Postgres / ANSI / Teradata: `CREATE TABLE … AS <query> WITH [NO] DATA + // [AND [NO] STATISTICS]` controls whether the new table is populated + // and whether stats are collected. Doesn't change lineage; clause is + // consumed and discarded. + // https://www.postgresql.org/docs/current/sql-createtableas.html + let cases = [ + "CREATE TABLE asd AS SELECT asd FROM asd WITH DATA", + "CREATE TABLE asd AS SELECT asd FROM asd WITH NO DATA", + "CREATE TABLE a.b AS SELECT 1 WITH DATA AND STATISTICS", + "CREATE TABLE a.b AS SELECT 1 WITH NO DATA AND NO STATISTICS", + ]; + for sql in cases { + pg() + .parse_sql_statements(sql) + .unwrap_or_else(|e| panic!("failed to parse `{sql}`: {e}")); + } +} diff --git a/tests/sqlparser_redshift.rs b/tests/sqlparser_redshift.rs index 8ff5cb24ac..7c37b0b515 100644 --- a/tests/sqlparser_redshift.rs +++ b/tests/sqlparser_redshift.rs @@ -822,3 +822,52 @@ fn parse_create_external_function_lambda() { ) .unwrap(); } + +#[test] +fn parse_redshift_create_table_diststyle_distkey_sortkey_any_order() { + // Redshift CREATE TABLE accepts DISTSTYLE / DISTKEY / SORTKEY in any + // order after the column definitions. 
+ // https://docs.aws.amazon.com/redshift/latest/dg/r_CREATE_TABLE_NEW.html + let cases = [ + "CREATE TABLE sales (salesid INTEGER NOT NULL) DISTKEY(listid) COMPOUND SORTKEY(listid, sellerid) DISTSTYLE AUTO", + "CREATE TABLE soup (LIKE other_table) DISTKEY(soup1) SORTKEY(soup2) DISTSTYLE ALL", + "CREATE TABLE t (a INT) DISTSTYLE KEY DISTKEY(a) SORTKEY(a)", + "CREATE TABLE t (a INT) SORTKEY(a) DISTKEY(a)", + ]; + for sql in cases { + redshift() + .parse_sql_statements(sql) + .unwrap_or_else(|e| panic!("failed to parse `{sql}`: {e}")); + } +} + +#[test] +fn parse_redshift_oracle_outer_join_marker() { + // Redshift supports the Oracle/Snowflake legacy `expr (+)` outer-join + // marker in the WHERE clause of comma-join queries. + redshift() + .parse_sql_statements("select a.foo from a, b where a.baz = b.baz (+)") + .unwrap(); + redshift() + .parse_sql_statements("select * from a, b where a.id (+) = b.id") + .unwrap(); +} + +#[test] +fn parse_redshift_identity_seed_step() { + // Redshift's two-argument IDENTITY shorthand: `(seed, step)` instead + // of `(START WITH seed INCREMENT BY step)`. + // https://docs.aws.amazon.com/redshift/latest/dg/r_CREATE_TABLE_NEW.html + redshift() + .parse_sql_statements("CREATE TABLE t (c BIGINT GENERATED BY DEFAULT AS IDENTITY (0, 1))") + .unwrap(); + redshift() + .parse_sql_statements("CREATE TABLE t (c BIGINT GENERATED ALWAYS AS IDENTITY (100, 5))") + .unwrap(); + // Existing keyword form still parses. 
+ redshift() + .parse_sql_statements( + "CREATE TABLE t (c BIGINT GENERATED BY DEFAULT AS IDENTITY (START WITH 1 INCREMENT BY 1))", + ) + .unwrap(); +} diff --git a/tests/sqlparser_snowflake.rs b/tests/sqlparser_snowflake.rs index 02f48a7e29..051eb32830 100644 --- a/tests/sqlparser_snowflake.rs +++ b/tests/sqlparser_snowflake.rs @@ -1429,15 +1429,14 @@ fn test_snowflake_trim() { expr_from_projection(only(&select.projection)) ); - // missing comma separation - let error_sql = "SELECT TRIM('xyz' 'a')"; - assert_eq!( - ParserError::ParserError( - "Expected ), found: 'a'\nNear `SELECT TRIM('xyz'`" - .to_owned() - .into() - ), - snowflake().parse_sql_statements(error_sql).unwrap_err() + // Adjacent string literals concatenate per ANSI SQL / Snowflake docs, so + // `TRIM('xyz' 'a')` is a valid 1-arg TRIM on the literal `'xyza'`. + // https://docs.snowflake.com/en/sql-reference/data-types-text#string-constants + // (No roundtrip — parser produces `'xyza'`, which doesn't re-render to + // the two-literal source form.) + snowflake().one_statement_parses_to( + "SELECT TRIM('xyz' 'a')", + "SELECT TRIM('xyza')", ); } @@ -1602,10 +1601,7 @@ fn parse_view_as_table_alias() { // alias position it should be treated as an ordinary identifier — e.g. // `FROM tbl VIEW` or `JOIN tbl AS VIEW`. 
snowflake().verified_stmt("SELECT VIEW.x FROM t AS VIEW"); - snowflake().one_statement_parses_to( - "SELECT * FROM t VIEW", - "SELECT * FROM t AS VIEW", - ); + snowflake().one_statement_parses_to("SELECT * FROM t VIEW", "SELECT * FROM t AS VIEW"); snowflake().one_statement_parses_to( "SELECT VIEW.x FROM tbl FULL OUTER JOIN tbl2 AS VIEW ON tbl.id = VIEW.id", "SELECT VIEW.x FROM tbl FULL JOIN tbl2 AS VIEW ON tbl.id = VIEW.id", @@ -3450,6 +3446,76 @@ fn parse_create_semantic_view_table_synonyms_and_comment() { ); } +#[test] +fn parse_adjacent_string_literal_concatenation() { + // ANSI SQL / BigQuery / Postgres / Snowflake all concatenate adjacent + // string literals separated by whitespace into a single literal. Real + // customer SQL relies on this — typically as a forgotten comma in an IN + // list, but the SQL still runs correctly because of concatenation. + // https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#string_and_bytes_literals + snowflake().one_statement_parses_to( + "SELECT 'a' 'b' 'c' FROM t", + "SELECT 'abc' FROM t", + ); + snowflake().one_statement_parses_to( + "SELECT * FROM t WHERE x IN ('a', 'b' 'c', 'd')", + "SELECT * FROM t WHERE x IN ('a', 'bc', 'd')", + ); +} + +#[test] +fn parse_snowflake_interval_as_column_with_binary_operators() { + // Snowflake allows INTERVAL as a column name. The interval-literal guard + // must reject the literal path when the token after INTERVAL is a binary + // operator keyword (BETWEEN, AND, OR, XOR, IN, NOT) — otherwise the + // greedy expression parser inside `parse_interval` swallows the keyword + // as the literal's "value" and breaks the surrounding clause. 
+ let cases = [ + "SELECT * FROM t WHERE INTERVAL BETWEEN 1 AND 2", + "SELECT a, INTERVAL FROM t", + "SELECT MAX(INTERVAL) FROM t", + "SELECT * FROM t WHERE INTERVAL IN (1, 2)", + "SELECT * FROM t WHERE INTERVAL NOT IN (1, 2)", + // INTERVAL inside a window's PARTITION BY list (caught a regression + // where ORDER was misconsumed as the interval's "value"). + "SELECT ROW_NUMBER() OVER (PARTITION BY a, INTERVAL ORDER BY c) FROM t", + // The literal form must still parse. + "SELECT INTERVAL '1' DAY", + "SELECT INTERVAL 1 DAY", + "SELECT INTERVAL -1 DAY", + "SELECT * FROM t WHERE x = INTERVAL '1' DAY", + ]; + for sql in cases { + snowflake() + .parse_sql_statements(sql) + .unwrap_or_else(|e| panic!("failed to parse `{sql}`: {e}")); + } +} + +#[test] +fn parse_snowflake_identifier_literal_in_table_position() { + // Snowflake `IDENTIFIER('<name>')` accepts a string literal anywhere a + // name is expected: CREATE TABLE target, FROM source, INSERT INTO, + // multi-part dotted names embedded in the string. + // https://docs.snowflake.com/en/sql-reference/identifier-literal + let cases = [ + "SELECT * FROM IDENTIFIER('mytable')", + "SELECT * FROM IDENTIFIER('db.schema.tbl') AS x", + "INSERT INTO IDENTIFIER('foo.bar') SELECT 1", + "CREATE OR REPLACE TEMP TABLE IDENTIFIER('proj.ds.t') AS SELECT 1", + // Snowflake also accepts session variables and bind parameters as + // the IDENTIFIER value. 
+ "CREATE TABLE IDENTIFIER($foo) (col1 VARCHAR)", + "SELECT * FROM IDENTIFIER($tbl_name)", + "SELECT * FROM IDENTIFIER(?)", + ]; + for sql in cases { + snowflake() + .parse_sql_statements(sql) + .unwrap_or_else(|e| panic!("failed to parse `{sql}`: {e}")); + } +} + #[test] fn parse_snowflake_create_external_table_with_options() { // Snowflake CREATE EXTERNAL TABLE has option-name = value tail @@ -3468,3 +3534,129 @@ fn parse_snowflake_create_external_table_with_options() { assert!(rendered.contains("\"ID\"")); assert!(rendered.contains("\"NAME\"")); } + +#[test] +fn parse_snowflake_create_schema_clone() { + // Snowflake zero-copy clone of a schema: + // CREATE SCHEMA new CLONE source [AT|BEFORE (TIMESTAMP|OFFSET|STATEMENT => …)] + // https://docs.snowflake.com/en/sql-reference/sql/create-clone + let cases = [ + "CREATE SCHEMA mytestschema_clone CLONE testschema", + "CREATE SCHEMA restored_schema CLONE my_schema AT (OFFSET => -3600)", + "CREATE SCHEMA mytestschema_clone_restore CLONE testschema \ + BEFORE (TIMESTAMP => TO_TIMESTAMP(40 * 365 * 86400))", + ]; + for sql in cases { + snowflake() + .parse_sql_statements(sql) + .unwrap_or_else(|e| panic!("failed to parse `{sql}`: {e}")); + } +} + +#[test] +fn parse_snowflake_date_part_from() { + // Snowflake's DATE_PART accepts both function-call and ANSI EXTRACT + // forms: + // DATE_PART(<part>, <expr>) -- function-call + // DATE_PART(<part> FROM <expr>) -- EXTRACT-style + // https://docs.snowflake.com/en/sql-reference/functions/date_part + let cases = [ + "SELECT DATE_PART('month' FROM CAST('2024-04-08' AS DATE))", + "SELECT DATE_PART(day FROM a)", + "SELECT DATE_PART(year FROM CAST('2024-04-08' AS DATE))", + "SELECT DATE_PART('month', CAST('2024-04-08' AS DATE))", + ]; + for sql in cases { + snowflake() + .parse_sql_statements(sql) + .unwrap_or_else(|e| panic!("failed to parse `{sql}`: {e}")); + } +} + +#[test] +fn parse_snowflake_create_external_table_partition_by() { + // Snowflake CREATE EXTERNAL TABLE accepts a 
`PARTITION BY (col, col, …)` + // clause referencing columns from the column-def list, before the + // option block (LOCATION=…, FILE_FORMAT=(…), etc.). + // https://docs.snowflake.com/en/sql-reference/sql/create-external-table + let sql = "CREATE EXTERNAL TABLE et2 (\ + col1 DATE AS (CAST(GET_PATH(PARSE_JSON(metadata$external_table_partition), 'COL1') AS DATE)), \ + col2 VARCHAR AS (CAST(GET_PATH(PARSE_JSON(metadata$external_table_partition), 'COL2') AS VARCHAR))\ + ) PARTITION BY (col1, col2) \ + LOCATION=@s2/logs/ partition_type=user_specified \ + FILE_FORMAT=(type=parquet compression=gzip)"; + snowflake().parse_sql_statements(sql).unwrap(); +} + +#[test] +fn parse_snowflake_create_sequence_comment() { + // Snowflake CREATE SEQUENCE accepts a `COMMENT = '<string>'` option + // alongside START, INCREMENT, ORDER/NOORDER, etc. + // https://docs.snowflake.com/en/sql-reference/sql/create-sequence + let cases = [ + "CREATE SEQUENCE seq START=5 comment = 'foo' INCREMENT=10", + "CREATE SEQUENCE seq3 COMMENT = 'description'", + ]; + for sql in cases { + snowflake() + .parse_sql_statements(sql) + .unwrap_or_else(|e| panic!("failed to parse `{sql}`: {e}")); + } +} + +#[test] +fn parse_snowflake_create_stage_options() { + // Snowflake CREATE STAGE accepts FILE_FORMAT in three shapes + // (parenthesised options, string shorthand, dotted-ident shorthand) + // and the option clauses (URL/CREDENTIALS/FILE_FORMAT/COPY_OPTIONS/ + // COMMENT/ENCRYPTION/STORAGE_INTEGRATION/ENDPOINT) can appear in any + // order. 
+ // https://docs.snowflake.com/en/sql-reference/sql/create-stage + let cases = [ + "CREATE STAGE stage1 FILE_FORMAT='format1'", + "CREATE STAGE stage1 FILE_FORMAT=schema1.format1", + "CREATE STAGE stage1 FILE_FORMAT=(FORMAT_NAME=stage1.format1)", + "CREATE STAGE s1 URL='s3://bucket-123' FILE_FORMAT=(TYPE='JSON') \ + CREDENTIALS=(aws_key_id='test' aws_secret_key='test')", + ]; + for sql in cases { + snowflake() + .parse_sql_statements(sql) + .unwrap_or_else(|e| panic!("failed to parse `{sql}`: {e}")); + } +} + +#[test] +fn parse_snowflake_column_comment_dollar_quoted() { + // Snowflake (and Postgres) accept dollar-quoted strings for column + // COMMENT bodies, useful when the comment contains single quotes. + snowflake() + .parse_sql_statements("CREATE TABLE foo (ID INT COMMENT $$some comment$$)") + .unwrap(); +} + +#[test] +fn parse_snowflake_column_foreign_key_references() { + // Snowflake column-level: `<col> <type> [NOT NULL] FOREIGN KEY + // REFERENCES <ref_table> [(<ref_col>)]` — same as the ANSI/Postgres + // shorter form, just with an explicit `FOREIGN KEY` prefix. + // https://docs.snowflake.com/en/sql-reference/sql/create-table + snowflake() + .parse_sql_statements( + "CREATE OR REPLACE TABLE TEST (\ + SOME_REF DECIMAL(38, 0) NOT NULL FOREIGN KEY REFERENCES SOME_OTHER_TABLE (ID))", + ) + .unwrap(); + // Existing short form still parses. + snowflake() + .parse_sql_statements("CREATE TABLE T (a INT REFERENCES other(id))") + .unwrap(); +} + +#[test] +fn parse_snowflake_table_function_tablesample() { + // TABLESAMPLE applied to a TABLE(<expr>) call. + snowflake() + .parse_sql_statements("SELECT * FROM TABLE('t1') TABLESAMPLE BERNOULLI (20.3)") + .unwrap(); +}