diff --git a/.github/workflows/det-gates.yml b/.github/workflows/det-gates.yml index 02d5f773..0abe9e32 100644 --- a/.github/workflows/det-gates.yml +++ b/.github/workflows/det-gates.yml @@ -220,12 +220,17 @@ jobs: - name: Run benchmarks run: | cargo bench -p warp-benches --bench materialization_hotpath -- --output-format bencher | tee perf.log + - name: Check regression against baseline + run: | + node scripts/check_perf_regression.cjs perf-baseline.json perf.log --threshold 15 - name: Upload perf artifacts if: always() uses: actions/upload-artifact@v4 with: name: perf-artifacts - path: perf.log + path: | + perf.log + perf-report.json build-repro: name: G4 build reproducibility (wasm) diff --git a/.github/workflows/perf-baseline-update.yml b/.github/workflows/perf-baseline-update.yml new file mode 100644 index 00000000..905eecb6 --- /dev/null +++ b/.github/workflows/perf-baseline-update.yml @@ -0,0 +1,80 @@ +# SPDX-License-Identifier: Apache-2.0 +# © James Ross Ω FLYING•ROBOTS +# +# Auto-update perf-baseline.json on main after merges that touch DET-critical +# or DET-important code. Creates a PR with the new baseline so it's reviewed +# (never force-pushes or commits directly to main). 
name: Update perf baseline

on:
  push:
    branches: [main]

permissions:
  contents: write
  pull-requests: write

concurrency:
  group: perf-baseline-update
  cancel-in-progress: true

jobs:
  update-baseline:
    name: Update perf baseline
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      # Full history so the pre-push commit is available for the diff below.
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Check if perf-relevant files changed
        id: changed
        env:
          BEFORE_SHA: ${{ github.event.before }}
        run: |
          # Diff the whole push rather than only its tip commit, so a
          # multi-commit push whose last commit is docs-only still triggers a
          # baseline refresh. Fall back to HEAD~1 when the pre-push commit is
          # not present locally (e.g. after a force-push).
          BASE="HEAD~1"
          if [ -n "$BEFORE_SHA" ] && git cat-file -e "${BEFORE_SHA}^{commit}" 2>/dev/null; then
            BASE="$BEFORE_SHA"
          fi
          CHANGED=$(git diff --name-only "$BASE"..HEAD -- 'crates/**/*.rs' 'crates/**/Cargo.toml' || true)
          if [ -z "$CHANGED" ]; then
            echo "skip=true" >> "$GITHUB_OUTPUT"
            echo "No Rust source changes — skipping baseline update"
          else
            echo "skip=false" >> "$GITHUB_OUTPUT"
          fi

      - name: Setup Rust
        if: steps.changed.outputs.skip != 'true'
        uses: dtolnay/rust-toolchain@stable

      - name: Run benchmarks
        if: steps.changed.outputs.skip != 'true'
        run: |
          # Without pipefail the step takes tee's exit status, so a bench run
          # that dies half-way would still produce a (partial) baseline.
          set -o pipefail
          cargo bench -p warp-benches --bench materialization_hotpath -- --output-format bencher | tee perf.log

      - name: Generate baseline JSON
        id: generate
        if: steps.changed.outputs.skip != 'true'
        run: |
          node scripts/generate_perf_baseline.cjs perf.log > perf-baseline-new.json
          if diff -q perf-baseline.json perf-baseline-new.json >/dev/null 2>&1; then
            echo "Baseline unchanged — no PR needed"
            echo "skip=true" >> "$GITHUB_OUTPUT"
          else
            mv perf-baseline-new.json perf-baseline.json
            echo "skip=false" >> "$GITHUB_OUTPUT"
          fi

      - name: Create baseline PR
        if: steps.changed.outputs.skip != 'true' && steps.generate.outputs.skip != 'true'
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          BRANCH="chore/perf-baseline-$(date +%Y%m%d)-$(git rev-parse --short HEAD)"
          git checkout -b "$BRANCH"
          git add perf-baseline.json
          git config user.name "github-actions[bot]"
          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
          # The substitution is expanded BEFORE the commit exists, so HEAD (not
          # HEAD~1) is the main commit the benchmarks were run against.
          git commit -m "chore(perf): update perf-baseline.json from $(git rev-parse --short HEAD)"
          git push origin "$BRANCH"
          gh pr create
\ + --title "chore(perf): update perf baseline" \ + --body "Auto-generated baseline update from main push $(git rev-parse HEAD~1)." \ + --base main \ + --head "$BRANCH" diff --git a/CHANGELOG.md b/CHANGELOG.md index 77935b5f..bcd1d6ee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,25 @@ ## Unreleased +### CI: G3 perf regression gate (#280) + +- **CI:** G3 perf regression gate now compares criterion benchmark output + against a git-tracked `perf-baseline.json` and fails if any benchmark + regresses beyond 15% (configurable via `--threshold`). Structured + `perf-report.json` artifact uploaded alongside raw `perf.log`. +- **CI:** New `perf-baseline-update.yml` workflow auto-generates baseline + update PRs on main pushes that touch Rust sources. +- **Scripts:** Added `check_perf_regression.cjs` (gate comparison) and + `generate_perf_baseline.cjs` (baseline generation from bencher output). + +### Docs: Allowlist governance (#287) + +- **Policy:** Added "Determinism Allowlist Governance" section to + `docs/RELEASE_POLICY.md` documenting acceptable exemption criteria, + approval requirements, and audit cadence for `.ban-nondeterminism-allowlist`. +- **Scripts:** Added cross-reference from `ban-nondeterminism.sh` header to + the governance policy. + ### Docs Polish (#41) - **License:** Renamed SPDX identifier `MIND-UCAL-1.0` → diff --git a/docs/RELEASE_POLICY.md b/docs/RELEASE_POLICY.md index 5a1026c2..0795c1dc 100644 --- a/docs/RELEASE_POLICY.md +++ b/docs/RELEASE_POLICY.md @@ -69,6 +69,55 @@ A gate may be marked VERIFIED only with immutable pointers: No immutable evidence => gate must be INFERRED or UNVERIFIED. +## Determinism Allowlist Governance + +The static inspection gate (G1 / DET-001) uses `scripts/ban-nondeterminism.sh` +to scan DET-critical crate sources for nondeterministic API patterns. A +file-level allowlist (`.ban-nondeterminism-allowlist`) may exempt specific paths +from this scan. 
+ +### Allowlist Location and Format + +- **File:** `.ban-nondeterminism-allowlist` (project root) +- **Format:** One file path per line. Blank lines and `#`-prefixed comments are + ignored. Inline justifications may follow the path, separated by whitespace. +- **Env override:** `DETERMINISM_ALLOWLIST` (defaults to + `.ban-nondeterminism-allowlist`). + +### When an Exemption Is Acceptable + +An allowlist entry is appropriate **only** when all of the following hold: + +1. The nondeterministic API is **not reachable from the WASM deterministic + execution path** (e.g., native-only tooling, test-only I/O, build-time + configuration read once at startup). +2. The call site is **guarded** by a feature gate, `#[cfg(test)]`, or an + explicit runtime assertion that prevents it from executing in the + deterministic engine loop. +3. Refactoring to remove the API usage would introduce **worse** architectural + trade-offs (e.g., duplicating an entire module to avoid a single `std::fs` + call in a CLI-only code path). + +If the API is reachable from the deterministic engine loop under any +configuration, **do not allowlist it. Refactor instead.** + +### Approval Requirements + +- Every new allowlist entry **must** include an inline justification explaining + why the exemption is safe. +- The entry must be approved by the **Architect** or **crate owner** as defined + in `det-policy.yaml` for the affected crate. +- PRs adding allowlist entries must tag the determinism label and reference this + policy section. + +### Audit + +- Existing entries are reviewed during each milestone closeout. +- Entries whose justification no longer holds (e.g., the guarding feature gate + was removed) must be deleted and the underlying code refactored. +- The `check_task_lists.sh` pre-commit hook does **not** cover allowlist + auditing; this is a manual review gate. 
+ ## Escalation If staging/prod blocker state conflicts with recommendation: diff --git a/perf-baseline.json b/perf-baseline.json new file mode 100644 index 00000000..2c63c085 --- /dev/null +++ b/perf-baseline.json @@ -0,0 +1,2 @@ +{ +} diff --git a/scripts/ban-nondeterminism.sh b/scripts/ban-nondeterminism.sh index a603dfb6..11b3b5c7 100755 --- a/scripts/ban-nondeterminism.sh +++ b/scripts/ban-nondeterminism.sh @@ -11,6 +11,9 @@ set -euo pipefail # Optional env: # DETERMINISM_PATHS="crates/warp-core crates/warp-wasm crates/echo-wasm-abi" # DETERMINISM_ALLOWLIST=".ban-nondeterminism-allowlist" +# +# Allowlist governance: see docs/RELEASE_POLICY.md § "Determinism Allowlist Governance" +# for approval requirements, acceptable exemption criteria, and audit cadence. ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" cd "$ROOT" diff --git a/scripts/check_perf_regression.cjs b/scripts/check_perf_regression.cjs new file mode 100644 index 00000000..60b6fd87 --- /dev/null +++ b/scripts/check_perf_regression.cjs @@ -0,0 +1,169 @@ +#!/usr/bin/env node +// SPDX-License-Identifier: Apache-2.0 +// © James Ross Ω FLYING•ROBOTS +// +// G3 perf regression gate: compare current criterion bencher output against a +// git-tracked baseline and fail if any benchmark regresses beyond the allowed +// threshold. +// +// Usage: +// node scripts/check_perf_regression.cjs <baseline.json> <bencher-output> [--threshold 15] +// +// Baseline format (perf-baseline.json): +// { "<bench_name>": <median_ns>, ... } +// +// Current format (criterion --output-format bencher): +// test <name> ... 
//   bench: <median_ns> ns/iter (+/- <deviation>)
//
// Exit codes:
//   0 — no regressions above threshold (also when no baseline file exists)
//   1 — a benchmark regressed beyond the threshold, vanished from the current
//       run, or has an unusable baseline value
//   2 — usage error, or unreadable / malformed input

"use strict";

const USAGE =
  "Usage: node scripts/check_perf_regression.cjs <baseline.json> <bencher-output> [--threshold <percent>]";

const THRESHOLD_FLAG = "--threshold";

/** Own-property check, so inherited keys such as "constructor" are never read as data. */
function hasOwn(obj, key) {
  return Object.prototype.hasOwnProperty.call(obj, key);
}

/**
 * Parse the command line.
 *
 * Accepts exactly two positional arguments (baseline path, bencher output
 * path) and an optional `--threshold <percent>` / `--threshold=<percent>`.
 * Prints a diagnostic and exits with code 2 on any usage error.
 *
 * @param {string[]} argv - full `process.argv`-style array (first two entries are skipped)
 * @returns {{ baselinePath: string, currentPath: string, threshold: number }}
 */
function parseArgs(argv) {
  const args = argv.slice(2);
  const positional = [];
  let threshold = 15;

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];

    if (arg === THRESHOLD_FLAG || arg.startsWith(`${THRESHOLD_FLAG}=`)) {
      let raw;
      if (arg === THRESHOLD_FLAG) {
        // A trailing `--threshold` used to fall through to "unknown flag".
        if (i + 1 >= args.length) {
          console.error("ERROR: --threshold requires a value");
          console.error(USAGE);
          process.exit(2);
        }
        raw = args[++i];
      } else {
        raw = arg.slice(THRESHOLD_FLAG.length + 1);
      }

      threshold = Number(raw);
      // isFinite (not just !isNaN): an infinite threshold would disable the gate.
      if (!Number.isFinite(threshold) || threshold <= 0) {
        console.error("ERROR: --threshold must be a positive number");
        process.exit(2);
      }
    } else if (arg.startsWith("-")) {
      console.error(`ERROR: unknown flag: ${arg}`);
      console.error(USAGE);
      process.exit(2);
    } else {
      positional.push(arg);
    }
  }

  if (positional.length !== 2) {
    console.error(USAGE);
    process.exit(2);
  }

  return { baselinePath: positional[0], currentPath: positional[1], threshold };
}

/**
 * Parse criterion bencher output into `{ name: median_ns }`.
 *
 * Only lines of the form "test <name> ... bench: <N> ns/iter (+/- <V>)" are
 * recognised; thousands separators in <N> are stripped. If a name occurs more
 * than once the last occurrence wins.
 *
 * @param {string} text - raw bencher output
 * @returns {Object<string, number>}
 */
function parseBencherOutput(text) {
  const results = {};
  const re = /^test\s+(\S+)\s+\.\.\.\s+bench:\s+([\d,]+)\s+ns\/iter/gm;
  for (const match of text.matchAll(re)) {
    results[match[1]] = Number(match[2].replace(/,/g, ""));
  }
  return results;
}

/**
 * Compare current results against the baseline.
 *
 * Status per benchmark:
 *   NEW       — present in `current`, no baseline entry (never fails the gate)
 *   OK        — within `threshold` percent of baseline
 *   REGRESSED — slower than baseline by more than `threshold` percent
 *   INVALID   — baseline value is not a finite positive number, so no delta
 *               can be computed; counted as a failure instead of silently
 *               passing with a NaN/Infinity delta
 *   MISSING   — present in `baseline` but absent from `current`; counted as a
 *               failure so renames/removals cannot bypass the gate
 *
 * @param {Object<string, *>} baseline
 * @param {Object<string, number>} current
 * @param {number} threshold - allowed slowdown in percent
 * @returns {{ report: Array<Object>, regressions: number }}
 */
function compareBenchmarks(baseline, current, threshold) {
  const report = [];
  let regressions = 0;

  for (const name of Object.keys(current)) {
    const cur = current[name];

    if (!hasOwn(baseline, name) || baseline[name] == null) {
      report.push({ name, cur, base: null, delta: null, status: "NEW" });
      continue;
    }

    const base = baseline[name];
    if (typeof base !== "number" || !Number.isFinite(base) || base <= 0) {
      regressions++;
      report.push({ name, cur, base, delta: null, status: "INVALID" });
      continue;
    }

    const deltaPct = ((cur - base) / base) * 100;
    const regressed = deltaPct > threshold;
    if (regressed) regressions++;
    report.push({ name, cur, base, delta: deltaPct, status: regressed ? "REGRESSED" : "OK" });
  }

  // To resolve a MISSING failure: remove the stale entry from
  // perf-baseline.json (via the baseline update workflow or manually).
  for (const name of Object.keys(baseline)) {
    if (!hasOwn(current, name)) {
      regressions++;
      report.push({ name, cur: null, base: baseline[name], delta: null, status: "MISSING" });
    }
  }

  return { report, regressions };
}

/** Print the comparison table to stdout. */
function printTable(report, threshold) {
  const nameWidth = Math.max(12, ...report.map((r) => r.name.length));
  const header = `${"Benchmark".padEnd(nameWidth)}  ${"Baseline".padStart(12)}  ${"Current".padStart(12)}  ${"Delta".padStart(8)}  Status`;
  const rule = "─".repeat(header.length);

  console.log(header);
  console.log(rule);
  for (const r of report) {
    const baseStr = r.base != null ? `${r.base} ns` : "—";
    const curStr = r.cur != null ? `${r.cur} ns` : "—";
    const deltaStr = r.delta != null ? `${r.delta > 0 ? "+" : ""}${r.delta.toFixed(1)}%` : "—";
    let statusStr = r.status;
    if (r.status === "REGRESSED") statusStr = `FAIL (>${threshold}%)`;
    else if (r.status === "MISSING") statusStr = "FAIL (missing)";
    else if (r.status === "INVALID") statusStr = "FAIL (bad baseline)";
    console.log(
      `${r.name.padEnd(nameWidth)}  ${baseStr.padStart(12)}  ${curStr.padStart(12)}  ${deltaStr.padStart(8)}  ${statusStr}`
    );
  }
  console.log(rule);
}

/**
 * Entry point: read both inputs, compare, print the table, write
 * perf-report.json to the current working directory, and exit 1 on failure.
 */
function main() {
  // Loaded here so the pure helpers above can be evaluated without a
  // CommonJS `require` being available.
  const fs = require("fs");
  const { baselinePath, currentPath, threshold } = parseArgs(process.argv);

  if (!fs.existsSync(baselinePath)) {
    // Nothing is recorded by this script; baselines come from
    // generate_perf_baseline.cjs.
    console.log(`No baseline found at ${baselinePath} — nothing to compare against.`);
    console.log("G3: SKIP (no baseline to compare against)");
    process.exit(0);
  }

  // Input problems exit with 2 so they are distinguishable from a real
  // regression (exit 1); an uncaught exception would also exit 1.
  let baseline;
  try {
    baseline = JSON.parse(fs.readFileSync(baselinePath, "utf-8"));
  } catch (err) {
    console.error(`ERROR: cannot read baseline ${baselinePath}: ${err.message}`);
    process.exit(2);
  }
  if (baseline === null || typeof baseline !== "object" || Array.isArray(baseline)) {
    console.error(`ERROR: baseline ${baselinePath} must be a JSON object of { name: median_ns }`);
    process.exit(2);
  }

  let currentText;
  try {
    currentText = fs.readFileSync(currentPath, "utf-8");
  } catch (err) {
    console.error(`ERROR: cannot read benchmark output ${currentPath}: ${err.message}`);
    process.exit(2);
  }

  const current = parseBencherOutput(currentText);
  if (Object.keys(current).length === 0) {
    console.error("ERROR: no benchmark results found in current output");
    process.exit(2);
  }

  console.log(`G3 perf regression gate (threshold: ${threshold}%)`);
  console.log("─".repeat(72));

  const { report, regressions } = compareBenchmarks(baseline, current, threshold);
  printTable(report, threshold);

  const reportObj = {
    threshold_pct: threshold,
    benchmarks: report,
    regressions,
    passed: regressions === 0,
  };
  fs.writeFileSync("perf-report.json", JSON.stringify(reportObj, null, 2) + "\n");
  console.log("\nWrote perf-report.json");

  if (regressions > 0) {
    console.error(
      `\nG3: FAILED — ${regressions} benchmark(s) regressed beyond ${threshold}% threshold, are missing, or have an invalid baseline`
    );
    process.exit(1);
  }

  console.log(`\nG3: PASSED — all benchmarks within ${threshold}% of baseline`);
}

// Run only when executed directly, so the helpers can be loaded without
// side effects (e.g. from a test).
if (typeof module !== "undefined" && typeof require !== "undefined" && require.main === module) {
  main();
}

// diff --git a/scripts/generate_perf_baseline.cjs b/scripts/generate_perf_baseline.cjs
// new file mode 100644
// index 00000000..e54ce776
// --- /dev/null
// +++ b/scripts/generate_perf_baseline.cjs
// @@ -0,0 +1,41 @@
// #!/usr/bin/env node
// SPDX-License-Identifier: Apache-2.0
// © James Ross Ω FLYING•ROBOTS
//
// Parse criterion bencher output and emit a perf-baseline.json to stdout.
//
// Usage:
//   node scripts/generate_perf_baseline.cjs <bencher-output> > perf-baseline.json
//
// Exit codes:
//   0 — baseline JSON written to stdout
//   1 — the input contained no benchmark results
//   2 — usage error or unreadable input file

"use strict";

/**
 * Build a baseline object from criterion bencher output.
 *
 * Recognises lines of the form
 *   "test <name> ... bench: <N> ns/iter (+/- <V>)"
 * and strips thousands separators from <N>. If a name occurs more than once
 * the last occurrence wins. Keys are inserted in sorted order so the
 * serialized JSON produces stable diffs.
 *
 * @param {string} text - raw bencher output
 * @returns {Object<string, number>} `{ name: median_ns }`, empty if nothing matched
 */
function buildBaseline(text) {
  const found = {};
  const re = /^test\s+(\S+)\s+\.\.\.\s+bench:\s+([\d,]+)\s+ns\/iter/gm;
  for (const match of text.matchAll(re)) {
    found[match[1]] = Number(match[2].replace(/,/g, ""));
  }

  const sorted = {};
  for (const key of Object.keys(found).sort()) {
    sorted[key] = found[key];
  }
  return sorted;
}

/** Entry point: read the file named on the command line and print the baseline JSON. */
function main() {
  // Loaded here so buildBaseline can be evaluated without a CommonJS `require`.
  const fs = require("fs");
  const path = require("path");

  if (process.argv.length !== 3) {
    console.error(`Usage: node ${path.basename(__filename)} <bencher-output>`);
    process.exit(2);
  }

  const inputPath = process.argv[2];
  let text;
  try {
    text = fs.readFileSync(inputPath, "utf-8");
  } catch (err) {
    // A clear one-line error instead of an uncaught-exception stack trace.
    console.error(`ERROR: cannot read ${inputPath}: ${err.message}`);
    process.exit(2);
  }

  const baseline = buildBaseline(text);
  if (Object.keys(baseline).length === 0) {
    console.error("ERROR: no benchmark results found in input");
    process.exit(1);
  }

  console.log(JSON.stringify(baseline, null, 2));
}

// Run only when executed directly, so buildBaseline can be loaded without
// side effects (e.g. from a test).
if (typeof module !== "undefined" && typeof require !== "undefined" && require.main === module) {
  main();
}