getsynq · lustefaniak · May 6, 2026 · May 5, 2026 · May 5, 2026 · May 5, 2026
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -39,6 +39,10 @@ cargo clippy
 - Use `cargo run --release --quiet --features json_example --example cli FILE --DIALECT 2>&1 1>/dev/null | grep "Error during parsing"` — DEBUG logs flood stderr by default and obscure the actual parse error.
 - The release CLI (`target/release/examples/cli`) is rebuilt independently from `corpus-runner`. After parser edits, run `cargo build --release --example cli` before re-running single-file repros — otherwise you're testing the previous build and may report false positives.
 
+### Backgrounding gotcha
+
+- **`cmd > log 2>&1 &` in a Bash tool call with `run_in_background: true` returns "completed" immediately** because the shell exits while the `&`-detached child keeps running. Don't trust the completion notification for detached processes — verify with `pgrep -f <name>` or arm a Monitor that polls `pgrep`.
+
 ### Performance and Profiling
 
 **Critical:** Always profile BEFORE optimizing. Assumptions about bottlenecks are often wrong.
@@ -115,6 +119,10 @@ node scripts/compare-corpus-reports.js target/corpus-report.json target/corpus-r
 - Analyze failures: parse `target/corpus-report.json` with Python to filter/group `test_results` by dialect or error pattern
 - **Always rebuild before corpus run**: `cargo build --release --bin corpus-runner` — stale binary produces stale reports
 - **Refresh baseline after each accepted commit**: `cp target/corpus-report.json target/corpus-report-baseline.json`. Otherwise `compare-corpus-reports.js` credits old deltas and can hide fresh regressions.
+- `compare-corpus-reports.js` only lists *added* tests under "New Tests" — deleted/pruned files don't appear there. After a kernel-cll-corpus pipeline run, also check `git status -s` in that repo to see what was removed.
+- **Pipeline reprocess (`make process` in kernel-cll-corpus) takes ~10 minutes** for the full corpus. Don't poll by hand — arm a Monitor on `while pgrep -f pipeline.process; do sleep 15; done` and let it wake you.
+- **Anonymizer-corruption signature**: `'s'<word>` (the `'s'` placeholder string directly abutting an identifier/keyword, e.g. `'s'HOUR`, `'s'id_5`) is unique to anonymizer misalignment. Filter on exactly `'s'<word>` — a broader `'<anything>'<word>` regex misaligns on multi-string SQL (`'foo','bar'`) and silently deletes hand-written sqlglot fixtures.
+- **Query-log truncation heuristics** (`pipeline/process.py::_looks_truncated`) that worked without false positives: trailing punctuation (`,`/`(`/`=`/operator), trailing clause keyword (SELECT/FROM/BY/AS/…), and `CASE` count > `END` count. Removed ~4k Redshift query-log fragments.
 - Adding a real dispatch in `parse_create` for a previously-unsupported `CREATE <X>` shape can flag *new* corpus failures: files that slipped through the generic skip-until-semicolon fallback are now actually parsed. Either extend support, accept on a case-by-case basis, or fall back gracefully — but expect the delta.
 - **This repo is PUBLIC** (`getsynq/sqlparser-rs`, a fork of `apache/datafusion-sqlparser-rs`). Never put customer names, workspace IDs, or internal codenames into commit messages, branch names, PR titles, file names, or function names — even anonymized SQL content must be attributed generically. Every push is mirrored into GH Archive's permanent public dataset, which force-push cannot undo.
 - **Pulling real SQL from production Clickhouse for regression coverage**: `SELECT sql FROM schema.latest_sql_definitions FINAL WHERE workspace='<name>' AND asset_type IN (...)`. `asset_type` codes from `proto/core/types/v1/asset_type.proto` that carry SQL bodies parseable by this library:

diff --git a/src/bin/corpus_runner.rs b/src/bin/corpus_runner.rs
@@ -26,7 +26,8 @@ fn normalize_dialect_name(name: &str) -> &str {
 /// Dialects without a dedicated parser fall back to a related dialect or to
 /// `GenericDialect` rather than being silently skipped, so corpus stats reflect
 /// every file under `tests/corpus/`. Aliases are best-effort:
-///   - `presto` / `athena` use Trino-style SQL → Generic (same as our `trino`)
+///   - `presto` uses Trino-style SQL → Trino
+///   - `athena` uses Hive-style DDL on top of Trino-style DML → Hive
 ///   - `tsql` / `fabric` use T-SQL → MsSql
 ///   - `spark` uses Spark SQL → Databricks
 ///   - `materialize` is Postgres-compatible → Postgres
@@ -38,7 +39,8 @@ fn dialect_for_name(name: &str) -> Box<dyn sqlparser::dialect::Dialect> {
         return d;
     }
     let alias: &str = match base_name.as_str() {
-        "presto" | "athena" => "trino",
+        "presto" => "trino",
+        "athena" => "hive",
         "tsql" | "fabric" => "mssql",
         "spark" => "databricks",
         "materialize" => "postgres",

diff --git a/src/dialect/snowflake.rs b/src/dialect/snowflake.rs
@@ -104,33 +104,86 @@ pub fn parse_create_stage(
     let mut comment = None;
 
     // [ internalStageParams | externalStageParams ]
-    let stage_params = parse_stage_params(parser)?;
+    let mut stage_params = parse_stage_params(parser)?;
 
-    // [ directoryTableParams ]
-    if parser.parse_keyword(Keyword::DIRECTORY) {
-        parser.expect_token(&Token::Eq)?;
-        directory_table_params = parse_parentheses_options(parser)?;
-    }
-
-    // [ file_format]
-    if parser.parse_keyword(Keyword::FILE_FORMAT) {
-        parser.expect_token(&Token::Eq)?;
-        file_format = parse_parentheses_options(parser)?;
-    }
-
-    // [ copy_options ]
-    if parser.parse_keyword(Keyword::COPY_OPTIONS) {
-        parser.expect_token(&Token::Eq)?;
-        copy_options = parse_parentheses_options(parser)?;
-    }
-
-    // [ comment ]
-    if parser.parse_keyword(Keyword::COMMENT) {
-        parser.expect_token(&Token::Eq)?;
-        comment = Some(match parser.next_token().token {
-            Token::SingleQuotedString(word) => Ok(word),
-            _ => parser.expected("a comment statement", parser.peek_token()),
-        }?)
+    // CREATE STAGE option clauses (DIRECTORY, FILE_FORMAT, COPY_OPTIONS,
+    // COMMENT, plus URL/CREDENTIALS/etc. that may also appear after the
+    // initial stage-params block) can come in any order. Loop until none
+    // of the recognised keywords appear.
+    // https://docs.snowflake.com/en/sql-reference/sql/create-stage
+    loop {
+        if parser.parse_keyword(Keyword::DIRECTORY) {
+            parser.expect_token(&Token::Eq)?;
+            directory_table_params = parse_parentheses_options(parser)?;
+        } else if parser.parse_keyword(Keyword::FILE_FORMAT) {
+            parser.expect_token(&Token::Eq)?;
+            if parser.peek_token_is(&Token::LParen) {
+                file_format = parse_parentheses_options(parser)?;
+            } else {
+                // Snowflake accepts FILE_FORMAT shorthand:
+                //   FILE_FORMAT = '<format_name>' (string)
+                //   FILE_FORMAT = [<schema>.]<format_name> (ident)
+                let next_token = parser.next_token();
+                let value = match next_token.token {
+                    Token::SingleQuotedString(s) => s,
+                    Token::Word(w) => {
+                        let mut name = w.value;
+                        while parser.consume_token(&Token::Period) {
+                            let part = parser.next_token();
+                            match part.token {
+                                Token::Word(w) => {
+                                    name.push('.');
+                                    name.push_str(&w.value);
+                                }
+                                _ => parser.expected("identifier after .", part)?,
+                            }
+                        }
+                        name
+                    }
+                    _ => parser.expected("file format name", next_token)?,
+                };
+                file_format.push(DataLoadingOption {
+                    option_name: "FORMAT_NAME".to_string(),
+                    option_type: DataLoadingOptionType::STRING,
+                    value,
+                });
+            }
+        } else if parser.parse_keyword(Keyword::COPY_OPTIONS) {
+            parser.expect_token(&Token::Eq)?;
+            copy_options = parse_parentheses_options(parser)?;
+        } else if parser.parse_keyword(Keyword::COMMENT) {
+            parser.expect_token(&Token::Eq)?;
+            comment = Some(match parser.next_token().token {
+                Token::SingleQuotedString(word) => Ok(word),
+                _ => parser.expected("a comment statement", parser.peek_token()),
+            }?);
+        } else if matches!(
+            parser.peek_token_kind(),
+            Token::Word(w) if matches!(w.keyword,
+                Keyword::URL | Keyword::CREDENTIALS | Keyword::STORAGE_INTEGRATION
+                    | Keyword::ENDPOINT | Keyword::ENCRYPTION)
+        ) {
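+            // Stage-params clauses can also appear after FILE_FORMAT etc.;
+            // re-enter the stage-params parser and merge the result. A value
+            // set by the later clause overrides the earlier one; fields the
+            // later clause leaves unset keep their previous value.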
+            let extra = parse_stage_params(parser)?;
+            if extra.url.is_some() {
+                stage_params.url = extra.url;
+            }
+            if extra.storage_integration.is_some() {
+                stage_params.storage_integration = extra.storage_integration;
+            }
+            if extra.endpoint.is_some() {
+                stage_params.endpoint = extra.endpoint;
+            }
+            if !extra.credentials.options.is_empty() {
+                stage_params.credentials = extra.credentials;
+            }
+            if !extra.encryption.options.is_empty() {
+                stage_params.encryption = extra.encryption;
+            }
+        } else {
+            break;
+        }
     }
 
     Ok(Statement::CreateStage {
@@ -588,10 +641,23 @@ fn parse_parentheses_options(parser: &mut Parser) -> Result<Vec<DataLoadingOptio
                             Ok(())
                         }
                         Token::Word(word) => {
+                            // Allow dotted object names (e.g.
+                            // `FORMAT_NAME=schema.format`).
+                            let mut value = word.value;
+                            while parser.consume_token(&Token::Period) {
+                                let part = parser.next_token();
+                                match part.token {
+                                    Token::Word(w) => {
+                                        value.push('.');
+                                        value.push_str(&w.value);
+                                    }
+                                    _ => parser.expected("identifier after .", part)?,
+                                }
+                            }
                             options.push(DataLoadingOption {
                                 option_name: key.value,
                                 option_type: DataLoadingOptionType::ENUM,
-                                value: word.value,
+                                value,
                             });
                             Ok(())
                         }