Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
37385b3
fix(snowflake): parse IDENTIFIER('<name>') literal in name positions
lustefaniak May 5, 2026
8e11379
fix(snowflake): treat INTERVAL as identifier before binary/clause key…
lustefaniak May 5, 2026
8b400f8
fix: concatenate adjacent string literals (ANSI SQL §5.3)
lustefaniak May 5, 2026
9a6cacc
fix(duckdb): support USING SAMPLE clause in FROM table references
lustefaniak May 5, 2026
6fdcb23
fix(mysql): support REPLACE [INTO] statement
lustefaniak May 5, 2026
05fa096
fix: handle COLUMNS(...) clause in JSON_TABLE / XMLTABLE function args
lustefaniak May 5, 2026
9add052
fix(clickhouse): accept ANY/ASOF/ALL join modifiers and GLOBAL prefix
lustefaniak May 5, 2026
489e205
fix(mssql): support system-versioned temporal table column markers
lustefaniak May 5, 2026
f1b4a84
fix(clickhouse): accept ON CLUSTER clause in DELETE statements
lustefaniak May 5, 2026
0692153
fix: accept WITH [NO] DATA [AND [NO] STATISTICS] on CREATE TABLE AS
lustefaniak May 5, 2026
def8c9c
fix(bigquery): accept digit-prefixed path segments in object names
lustefaniak May 5, 2026
2456e64
fix: support Teradata column-level attributes (FORMAT, TITLE, COMPRES…
lustefaniak May 5, 2026
271efa4
fix(snowflake): accept CREATE SCHEMA … CLONE source [AT|BEFORE (…)]
lustefaniak May 6, 2026
735b82e
fix: accept CORRESPONDING [BY (cols)] and STRICT set-op modifiers
lustefaniak May 6, 2026
3d7b48d
fix(snowflake): accept DATE_PART(<part> FROM <expr>) ANSI form
lustefaniak May 6, 2026
a6c4222
fix(snowflake): accept PARTITION BY (cols) on CREATE EXTERNAL TABLE
lustefaniak May 6, 2026
66795ff
fix(snowflake): accept COMMENT='…' option in CREATE SEQUENCE
lustefaniak May 6, 2026
4933978
fix(redshift): accept DISTSTYLE / DISTKEY / SORTKEY in any order
lustefaniak May 6, 2026
7348185
corpus-runner: route athena to HiveDialect
lustefaniak May 6, 2026
9351859
fix(hive): support WITH SERDEPROPERTIES (...) and DELIMITED suboptions
lustefaniak May 6, 2026
7324010
fix(hive,athena): accept Iceberg-style expression PARTITIONED BY
lustefaniak May 6, 2026
6e8b126
fix(hive,athena): accept table-level COMMENT and CLUSTERED BY clauses
lustefaniak May 6, 2026
1119207
fix(bigquery): accept set-op suffixes in any order, plus ON (cols)
lustefaniak May 6, 2026
d98d391
fix(redshift): accept Oracle/Snowflake (+) outer-join marker
lustefaniak May 6, 2026
be547d4
fix(snowflake): expand CREATE STAGE option grammar
lustefaniak May 6, 2026
49729bb
fix(snowflake): accept dollar-quoted strings for column COMMENT
lustefaniak May 6, 2026
0ea2a0a
fix(bigquery): accept FOR SYSTEM_TIME AS OF after table alias
lustefaniak May 6, 2026
191e8e1
fix: treat reserved-keyword followed by ) as column name, not trailin…
lustefaniak May 6, 2026
65e45f5
fix(snowflake): accept FOREIGN KEY REFERENCES inline column constraint
lustefaniak May 6, 2026
ce0db72
fix(bigquery): accept [NOT] DETERMINISTIC marker in CREATE FUNCTION body
lustefaniak May 6, 2026
3e6a36c
fix(redshift): accept GENERATED AS IDENTITY (seed, step) two-arg shor…
lustefaniak May 6, 2026
e9836fb
fix(snowflake): allow TABLESAMPLE after FROM TABLE(<expr>) reference
lustefaniak May 6, 2026
c8e7b42
fix(snowflake): accept session variable and bind parameter in IDENTIF…
lustefaniak May 6, 2026
38a0e26
fix(bigquery): parse legacy SQL [project-id:dataset.table] table refs
lustefaniak May 6, 2026
76b47c2
fix(bigquery): accept double-quoted string after AT TIME ZONE
lustefaniak May 6, 2026
5fac260
docs(CLAUDE): record corpus-loop session learnings
lustefaniak May 6, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ cargo clippy
- Use `cargo run --release --quiet --features json_example --example cli FILE --DIALECT 2>&1 1>/dev/null | grep "Error during parsing"` — DEBUG logs flood stderr by default and obscure the actual parse error.
- The release CLI (`target/release/examples/cli`) is rebuilt independently from `corpus-runner`. After parser edits, run `cargo build --release --example cli` before re-running single-file repros — otherwise you're testing the previous build and may report false positives.

### Backgrounding gotcha

- **`cmd > log 2>&1 &` in a Bash tool call with `run_in_background: true` returns "completed" immediately** because the shell exits while the `&`-detached child keeps running. Don't trust the completion notification for detached processes — verify with `pgrep -f <name>` or arm a Monitor that polls `pgrep`.

### Performance and Profiling

**Critical:** Always profile BEFORE optimizing. Assumptions about bottlenecks are often wrong.
Expand Down Expand Up @@ -115,6 +119,10 @@ node scripts/compare-corpus-reports.js target/corpus-report.json target/corpus-r
- Analyze failures: parse `target/corpus-report.json` with Python to filter/group `test_results` by dialect or error pattern
- **Always rebuild before corpus run**: `cargo build --release --bin corpus-runner` — stale binary produces stale reports
- **Refresh baseline after each accepted commit**: `cp target/corpus-report.json target/corpus-report-baseline.json`. Otherwise `compare-corpus-reports.js` keeps counting improvements from earlier commits as part of the current delta, which can hide fresh regressions.
- `compare-corpus-reports.js` only lists *added* tests under "New Tests" — deleted/pruned files don't appear there. After a kernel-cll-corpus pipeline run, also check `git status -s` in that repo to see what was removed.
- **Pipeline reprocess (`make process` in kernel-cll-corpus) takes ~10 minutes** for the full corpus. Don't poll by hand — arm a Monitor on `while pgrep -f pipeline.process; do sleep 15; done` and let it wake you when the process has exited.
- **Anonymizer-corruption signature**: `'s'<word>` (the `'s'` placeholder string directly abutting an identifier/keyword, e.g. `'s'HOUR`, `'s'id_5`) is unique to anonymizer misalignment. Filter on exactly `'s'<word>` — a broader `'<anything>'<word>` regex also matches valid multi-string SQL (`'foo','bar'`) and silently deletes hand-written sqlglot fixtures.
- **Query-log truncation heuristics** (`pipeline/process.py::_looks_truncated`) that worked without false positives: trailing punctuation (`,`/`(`/`=`/operator), trailing clause keyword (SELECT/FROM/BY/AS/…), and `CASE` count > `END` count. Removed ~4k Redshift query-log fragments.
- Adding a real dispatch in `parse_create` for a previously-unsupported `CREATE <X>` shape can flag *new* corpus failures: files that slipped through the generic skip-until-semicolon fallback are now actually parsed. Either extend support, accept on a case-by-case basis, or fall back gracefully — but expect the delta.
- **This repo is PUBLIC** (`getsynq/sqlparser-rs`, a fork of `apache/datafusion-sqlparser-rs`). Never put customer names, workspace IDs, or internal codenames into commit messages, branch names, PR titles, file names, or function names — even anonymized SQL content must be attributed generically. Every push is mirrored into GH Archive's permanent public dataset, which force-push cannot undo.
- **Pulling real SQL from production ClickHouse for regression coverage**: `SELECT sql FROM schema.latest_sql_definitions FINAL WHERE workspace='<name>' AND asset_type IN (...)`. `asset_type` codes from `proto/core/types/v1/asset_type.proto` that carry SQL bodies parseable by this library:
Expand Down
6 changes: 4 additions & 2 deletions src/bin/corpus_runner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ fn normalize_dialect_name(name: &str) -> &str {
/// Dialects without a dedicated parser fall back to a related dialect or to
/// `GenericDialect` rather than being silently skipped, so corpus stats reflect
/// every file under `tests/corpus/`. Aliases are best-effort:
/// - `presto` / `athena` use Trino-style SQL → Generic (same as our `trino`)
/// - `presto` uses Trino-style SQL → Trino
/// - `athena` uses Hive-style DDL on top of Trino-style DML → Hive
/// - `tsql` / `fabric` use T-SQL → MsSql
/// - `spark` uses Spark SQL → Databricks
/// - `materialize` is Postgres-compatible → Postgres
Expand All @@ -38,7 +39,8 @@ fn dialect_for_name(name: &str) -> Box<dyn sqlparser::dialect::Dialect> {
return d;
}
let alias: &str = match base_name.as_str() {
"presto" | "athena" => "trino",
"presto" => "trino",
"athena" => "hive",
"tsql" | "fabric" => "mssql",
"spark" => "databricks",
"materialize" => "postgres",
Expand Down
120 changes: 93 additions & 27 deletions src/dialect/snowflake.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,33 +104,86 @@ pub fn parse_create_stage(
let mut comment = None;

// [ internalStageParams | externalStageParams ]
let stage_params = parse_stage_params(parser)?;
let mut stage_params = parse_stage_params(parser)?;

// [ directoryTableParams ]
if parser.parse_keyword(Keyword::DIRECTORY) {
parser.expect_token(&Token::Eq)?;
directory_table_params = parse_parentheses_options(parser)?;
}

// [ file_format]
if parser.parse_keyword(Keyword::FILE_FORMAT) {
parser.expect_token(&Token::Eq)?;
file_format = parse_parentheses_options(parser)?;
}

// [ copy_options ]
if parser.parse_keyword(Keyword::COPY_OPTIONS) {
parser.expect_token(&Token::Eq)?;
copy_options = parse_parentheses_options(parser)?;
}

// [ comment ]
if parser.parse_keyword(Keyword::COMMENT) {
parser.expect_token(&Token::Eq)?;
comment = Some(match parser.next_token().token {
Token::SingleQuotedString(word) => Ok(word),
_ => parser.expected("a comment statement", parser.peek_token()),
}?)
// CREATE STAGE option clauses (DIRECTORY, FILE_FORMAT, COPY_OPTIONS,
// COMMENT, plus URL/CREDENTIALS/etc that may also appear after the
// initial stage-params block) can come in any order. Loop until none
// of the recognised keywords appear.
// https://docs.snowflake.com/en/sql-reference/sql/create-stage
loop {
if parser.parse_keyword(Keyword::DIRECTORY) {
parser.expect_token(&Token::Eq)?;
directory_table_params = parse_parentheses_options(parser)?;
} else if parser.parse_keyword(Keyword::FILE_FORMAT) {
parser.expect_token(&Token::Eq)?;
if parser.peek_token_is(&Token::LParen) {
file_format = parse_parentheses_options(parser)?;
} else {
// Snowflake accepts FILE_FORMAT shorthand:
// FILE_FORMAT = '<format_name>' (string)
// FILE_FORMAT = [<schema>.]<format_name> (ident)
let next_token = parser.next_token();
let value = match next_token.token {
Token::SingleQuotedString(s) => s,
Token::Word(w) => {
let mut name = w.value;
while parser.consume_token(&Token::Period) {
let part = parser.next_token();
match part.token {
Token::Word(w) => {
name.push('.');
name.push_str(&w.value);
}
_ => parser.expected("identifier after .", part)?,
}
}
name
}
_ => parser.expected("file format name", next_token)?,
};
file_format.push(DataLoadingOption {
option_name: "FORMAT_NAME".to_string(),
option_type: DataLoadingOptionType::STRING,
value,
});
}
} else if parser.parse_keyword(Keyword::COPY_OPTIONS) {
parser.expect_token(&Token::Eq)?;
copy_options = parse_parentheses_options(parser)?;
} else if parser.parse_keyword(Keyword::COMMENT) {
parser.expect_token(&Token::Eq)?;
comment = Some(match parser.next_token().token {
Token::SingleQuotedString(word) => Ok(word),
_ => parser.expected("a comment statement", parser.peek_token()),
}?);
} else if matches!(
parser.peek_token_kind(),
Token::Word(w) if matches!(w.keyword,
Keyword::URL | Keyword::CREDENTIALS | Keyword::STORAGE_INTEGRATION
| Keyword::ENDPOINT | Keyword::ENCRYPTION)
) {
// Stage-params clauses can also appear after FILE_FORMAT etc.;
// re-enter the parser and merge the result.
let extra = parse_stage_params(parser)?;
if extra.url.is_some() {
stage_params.url = extra.url;
}
if extra.storage_integration.is_some() {
stage_params.storage_integration = extra.storage_integration;
}
if extra.endpoint.is_some() {
stage_params.endpoint = extra.endpoint;
}
if !extra.credentials.options.is_empty() {
stage_params.credentials = extra.credentials;
}
if !extra.encryption.options.is_empty() {
stage_params.encryption = extra.encryption;
}
} else {
break;
}
}

Ok(Statement::CreateStage {
Expand Down Expand Up @@ -588,10 +641,23 @@ fn parse_parentheses_options(parser: &mut Parser) -> Result<Vec<DataLoadingOptio
Ok(())
}
Token::Word(word) => {
// Allow dotted object names (e.g.
// `FORMAT_NAME=schema.format`).
let mut value = word.value;
while parser.consume_token(&Token::Period) {
let part = parser.next_token();
match part.token {
Token::Word(w) => {
value.push('.');
value.push_str(&w.value);
}
_ => parser.expected("identifier after .", part)?,
}
}
options.push(DataLoadingOption {
option_name: key.value,
option_type: DataLoadingOptionType::ENUM,
value: word.value,
value,
});
Ok(())
}
Expand Down
Loading
Loading