From 1f31f7077da79fcca2ec1219252c3d0e7ce37373 Mon Sep 17 00:00:00 2001 From: satyaborg Date: Thu, 28 May 2026 13:48:51 +1000 Subject: [PATCH 1/2] feat: require evidence-rich review matrices --- README.md | 3 ++- src/devloop.ts | 4 ++-- tests/devloop.test.ts | 24 ++++++++++++++++++++---- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index dbb818b..2bbf38e 100644 --- a/README.md +++ b/README.md @@ -90,7 +90,8 @@ By default, `devloop`: - uses Codex as the coder and Claude Code as the reviewer - uses the TUI in a terminal and plain output elsewhere - writes an HTML report -- requires the reviewer to pass every acceptance criterion +- requires the reviewer to pass every acceptance criterion with implementation and test evidence +- asks the reviewer to flag silent decisions, scope drift, and missing tests - creates an isolated sibling git worktree and runs agents there - creates a local branch and commit when the run is accepted - never pushes or opens a PR diff --git a/src/devloop.ts b/src/devloop.ts index dab01e7..83a71ad 100644 --- a/src/devloop.ts +++ b/src/devloop.ts @@ -209,7 +209,7 @@ export function hasPassingMatrix(review: string, count: number) { if (!/^## Acceptance matrix\s*$/m.test(review)) return false; return Array.from( { length: count }, - (_, i) => new RegExp(`^-\\s*AC${i + 1}:\\s*PASS\\b`, "mi"), + (_, i) => new RegExp(`^\\|\\s*AC${i + 1}\\s*\\|\\s*PASS\\s*\\|`, "mi"), ).every((r) => r.test(review)); } @@ -1014,7 +1014,7 @@ function reviewPrompt(input: { criteria: string[]; strict: boolean; }) { - return `You are reviewing a ${agentLabel(input.coder)} implementation. Be a senior reviewer, not a linter.\n\nSpec: ${input.spec}\nTrack: ${input.track}\nBase: ${input.base}\nPass: ${input.pass}\nPrior reviews:\n${input.priors}\nAcceptance criteria:\n${criteriaBlock(input.criteria)}\nOutput path: ${input.output}\n\nSteps:\n1. Read the spec and track.\n2. Run: git diff ${input.base}...HEAD\n3. Read prior reviews so you do not repeat resolved findings.\n4. Write the review to ${input.output} using this exact format:\n\n# Review ${input.pass}\n\nVerdict: \n\n## Acceptance matrix\n\n- AC1: - \n\n## Findings\n\n1. [severity] - . Root cause: . Principle: .\n\n## Missing tests\n\n- \n\n## Fix instructions\n\n1. \n\n## Notes\n\n- \n\nRules:\n- The verdict line must appear verbatim.\n- ACCEPT requires every acceptance criterion PASS with concrete evidence.${input.strict ? "\n- ACCEPT also requires regression-test evidence, red/green evidence when behavior changed, passing full tests, and 100% coverage when coverage tooling exists." : ""}\n- For ACCEPT: Findings and Fix instructions bodies are "None".\n- Findings must explain WHY, not just WHAT.\n`; + return `You are reviewing a ${agentLabel(input.coder)} implementation. Be a senior reviewer, not a linter.\n\nSpec: ${input.spec}\nTrack: ${input.track}\nBase: ${input.base}\nPass: ${input.pass}\nPrior reviews:\n${input.priors}\nAcceptance criteria:\n${criteriaBlock(input.criteria)}\nOutput path: ${input.output}\n\nSteps:\n1. Read the spec and track.\n2. Run: git diff ${input.base}...HEAD\n3. Read prior reviews so you do not repeat resolved findings.\n4. Write the review to ${input.output} using this exact format:\n\n# Review ${input.pass}\n\nVerdict: \n\n## Acceptance matrix\n\n| Criterion | Status | Implementation evidence | Test evidence |\n| --- | --- | --- | --- |\n| AC1 | | | |\n\n## Review flags\n\n- Silent decision: - \n- Scope drift: - \n- Missing test: - \n\n## Findings\n\n1. [severity] - . Root cause: . Principle: .\n\n## Missing tests\n\n- \n\n## Fix instructions\n\n1. \n\n## Notes\n\n- \n\nRules:\n- The verdict line must appear verbatim.\n- ACCEPT requires every acceptance criterion PASS with concrete implementation evidence and concrete test evidence.${input.strict ? "\n- ACCEPT also requires regression-test evidence, red/green evidence when behavior changed, passing full tests, and 100% coverage when coverage tooling exists." : ""}\n- Flag a silent decision when the diff makes a tradeoff, default choice, compatibility choice, migration choice, or risk acceptance that is not recorded in the spec or track.\n- Flag scope drift when the diff changes behavior, public API, dependencies, files, or architecture outside the acceptance criteria, or includes a broad refactor not needed for the spec.\n- Flag a missing test when behavior changed without targeted test evidence, even if the full suite passed.\n- Use UNCLEAR only when spec ambiguity prevents a defensible ACCEPT or REJECT, and put the exact question in Notes.\n- For ACCEPT: Findings and Fix instructions bodies are "None".\n- Findings must explain WHY, not just WHAT.\n`; } async function synthesizeReport( diff --git a/tests/devloop.test.ts b/tests/devloop.test.ts index a6652bb..d89f2d9 100644 --- a/tests/devloop.test.ts +++ b/tests/devloop.test.ts @@ -2,7 +2,7 @@ import { afterAll, beforeEach, describe, expect, test } from "bun:test"; import { mkdir, mkdtemp, readFile, realpath, rm, stat, writeFile } from "node:fs/promises"; import { tmpdir } from "node:os"; import path from "node:path"; -import { parseArgs, parseCriteria, parseVerdict, reportFraming, runDevloop, welcome, type Event, type Options } from "../src/devloop.ts"; +import { hasPassingMatrix, parseArgs, parseCriteria, parseVerdict, reportFraming, runDevloop, welcome, type Event, type Options } from "../src/devloop.ts"; const root = await mkdtemp(path.join(tmpdir(), "devloop-test.")); let oldPath = process.env.PATH ?? ""; @@ -51,6 +51,9 @@ describe("parsing", () => { expect(parseCriteria("# Spec")).toEqual([]); expect(parseVerdict("Verdict: ACCEPT\n")).toBe("ACCEPT"); expect(parseVerdict("No verdict here\n")).toBe(""); + const review = "## Acceptance matrix\n\n| Criterion | Status | Implementation evidence | Test evidence |\n| --- | --- | --- | --- |\n| AC1 | PASS | code path | bun test |\n| AC2 | PASS | behavior | typecheck |\n"; + expect(hasPassingMatrix(review, 2)).toBe(true); + expect(hasPassingMatrix(review.replace("| AC2 | PASS |", "| AC2 | FAIL |"), 2)).toBe(false); }); test("derives report framing from the spec", () => { @@ -111,7 +114,7 @@ describe("loop", () => { expect(await readFile(path.join(worktree, ".codex/tracks/change.md"), "utf8")).toContain("- coder: codex"); expect(await readFile(path.join(worktree, ".codex/tracks/change.md"), "utf8")).toContain("- reviewer: claude"); expect(await readFile(path.join(worktree, ".codex/tracks/change.md"), "utf8")).toContain(`- source-repo: ${repo}`); - expect(await readFile(path.join(worktree, ".codex/reviews/change-r1.md"), "utf8")).toContain("- AC1: PASS"); + expect(await readFile(path.join(worktree, ".codex/reviews/change-r1.md"), "utf8")).toContain("| AC1 | PASS | mock evidence | mock test |"); const codexArgs = await readFile(path.join(state, "codex-args.log"), "utf8"); expect(codexArgs.split(/\r?\n/, 1)[0]).toBe(`exec -s read-only -C ${repo} -`); expect(codexArgs).toContain(`exec --dangerously-bypass-approvals-and-sandbox -C ${worktree} -`); @@ -134,6 +137,13 @@ describe("loop", () => { expect(reportPrompt).toContain("Haiku topic: Fixture spec - The loop runs deterministically under test."); expect(reportPrompt).toContain("rendered immediately after the subtitle before Metadata"); expect(reportPrompt).toContain("The subtitle must be specific to this work"); + expect(reportPrompt).toContain("| Criterion | Status | Implementation evidence | Test evidence |"); + expect(reportPrompt).toContain("## Review flags"); + expect(reportPrompt).toContain("Silent decision:"); + expect(reportPrompt).toContain("Scope drift:"); + expect(reportPrompt).toContain("Missing test:"); + expect(reportPrompt).toContain("Flag scope drift when the diff changes behavior"); + expect(reportPrompt).toContain("Use UNCLEAR only when spec ambiguity prevents a defensible ACCEPT or REJECT"); expect(events).toContainEqual({ type: "done", id: "naming", ok: true, detail: "feat/change" }); expect(events).toContainEqual({ type: "done", id: "worktree", ok: true, detail: worktree }); expect(events.some((event) => event.type === "gate" && event.name === "acceptance criteria" && event.ok)).toBe(true); @@ -521,8 +531,11 @@ if [[ "$prompt" == *"Output path:"* ]]; then [[ -n "\${DEVLOOP_TEST_NO_VERDICT:-}" ]] || printf 'Verdict: %s\\n\\n' "$verdict" if [[ -z "\${DEVLOOP_TEST_NO_MATRIX:-}" ]]; then printf '## Acceptance matrix\\n\\n' - printf -- '- AC1: PASS - mock evidence\\n\\n' + printf '| Criterion | Status | Implementation evidence | Test evidence |\\n' + printf '| --- | --- | --- | --- |\\n' + printf '| AC1 | PASS | mock evidence | mock test |\\n\\n' fi + printf '## Review flags\\n\\n- Silent decision: absent - None\\n- Scope drift: absent - None\\n- Missing test: absent - None\\n\\n' printf '## Findings\\n\\n' if [[ "$verdict" == "ACCEPT" ]]; then printf 'None\\n\\n'; else printf '1. [must-fix] devloop.ts:1 - repeated fixture finding. Root cause: mock review. Principle: deterministic retry behavior.\\n\\n'; fi printf '## Missing tests\\n\\n- None\\n\\n## Fix instructions\\n\\n' @@ -583,8 +596,11 @@ if [[ "$prompt" == *"Output path:"* ]]; then [[ -n "\${DEVLOOP_TEST_NO_VERDICT:-}" ]] || printf 'Verdict: %s\\n\\n' "$verdict" if [[ -z "\${DEVLOOP_TEST_NO_MATRIX:-}" ]]; then printf '## Acceptance matrix\\n\\n' - printf -- '- AC1: PASS - mock evidence\\n\\n' + printf '| Criterion | Status | Implementation evidence | Test evidence |\\n' + printf '| --- | --- | --- | --- |\\n' + printf '| AC1 | PASS | mock evidence | mock test |\\n\\n' fi + printf '## Review flags\\n\\n- Silent decision: absent - None\\n- Scope drift: absent - None\\n- Missing test: absent - None\\n\\n' printf '## Findings\\n\\n' if [[ "$verdict" == "ACCEPT" ]]; then printf 'None\\n\\n'; else printf '1. [must-fix] devloop.ts:1 - repeated fixture finding. Root cause: mock review. Principle: deterministic retry behavior.\\n\\n'; fi printf '## Missing tests\\n\\n- None\\n\\n## Fix instructions\\n\\n' From 9aa1d2b111db9988bc34a01ef73fd67eb693aae6 Mon Sep 17 00:00:00 2001 From: satyaborg Date: Thu, 28 May 2026 13:50:45 +1000 Subject: [PATCH 2/2] chore: show logo in readme --- README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2bbf38e..80e15bf 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,11 @@ -# devloop +```text + __ __ + ____/ /__ _ __/ /___ ____ ____ + / __ / _ \ | / / / __ \/ __ \/ __ \ +/ /_/ / __/ |/ / / /_/ / /_/ / /_/ / +\__,_/\___/|___/_/\____/\____/ .___/ + /_/ +``` `devloop` runs a configurable implementation and review loop. By default, Codex codes and Claude Code reviews.