From 1f31f7077da79fcca2ec1219252c3d0e7ce37373 Mon Sep 17 00:00:00 2001
From: satyaborg <satya.borg@gmail.com>
Date: Thu, 28 May 2026 13:48:51 +1000
Subject: [PATCH 1/2] feat: require evidence-rich review matrices

---
 README.md             |  3 ++-
 src/devloop.ts        |  4 ++--
 tests/devloop.test.ts | 24 ++++++++++++++++++++----
 3 files changed, 24 insertions(+), 7 deletions(-)
diff --git a/README.md b/README.md
index dbb818b..2bbf38e 100644
--- a/README.md
+++ b/README.md
@@ -90,7 +90,8 @@ By default, `devloop`:
 - uses Codex as the coder and Claude Code as the reviewer
 - uses the TUI in a terminal and plain output elsewhere
 - writes an HTML report
-- requires the reviewer to pass every acceptance criterion
+- requires the reviewer to pass every acceptance criterion with implementation and test evidence
+- asks the reviewer to flag silent decisions, scope drift, and missing tests
 - creates an isolated sibling git worktree and runs agents there
 - creates a local branch and commit when the run is accepted
 - never pushes or opens a PR
diff --git a/src/devloop.ts b/src/devloop.ts
index dab01e7..83a71ad 100644
--- a/src/devloop.ts
+++ b/src/devloop.ts
@@ -209,7 +209,7 @@ export function hasPassingMatrix(review: string, count: number) {
   if (!/^## Acceptance matrix\s*$/m.test(review)) return false;
   return Array.from(
     { length: count },
-    (_, i) => new RegExp(`^-\\s*AC${i + 1}:\\s*PASS\\b`, "mi"),
+    (_, i) => new RegExp(`^\\|\\s*AC${i + 1}\\s*\\|\\s*PASS\\s*\\|`, "mi"),
   ).every((r) => r.test(review));
 }
 
@@ -1014,7 +1014,7 @@ function reviewPrompt(input: {
   criteria: string[];
   strict: boolean;
 }) {
-  return `You are reviewing a ${agentLabel(input.coder)} implementation. Be a senior reviewer, not a linter.\n\nSpec: ${input.spec}\nTrack: ${input.track}\nBase: ${input.base}\nPass: ${input.pass}\nPrior reviews:\n${input.priors}\nAcceptance criteria:\n${criteriaBlock(input.criteria)}\nOutput path: ${input.output}\n\nSteps:\n1. Read the spec and track.\n2. Run: git diff ${input.base}...HEAD\n3. Read prior reviews so you do not repeat resolved findings.\n4. Write the review to ${input.output} using this exact format:\n\n# Review ${input.pass}\n\nVerdict: <ACCEPT | REJECT | UNCLEAR>\n\n## Acceptance matrix\n\n- AC1: <PASS | FAIL | UNCLEAR> - <evidence>\n\n## Findings\n\n1. [severity] <file:line> - <symptom>. Root cause: <why>. Principle: <principle>.\n\n## Missing tests\n\n- <gap, or None>\n\n## Fix instructions\n\n1. <standalone instruction>\n\n## Notes\n\n- <scope, disputes, lessons, or None>\n\nRules:\n- The verdict line must appear verbatim.\n- ACCEPT requires every acceptance criterion PASS with concrete evidence.${input.strict ? "\n- ACCEPT also requires regression-test evidence, red/green evidence when behavior changed, passing full tests, and 100% coverage when coverage tooling exists." : ""}\n- For ACCEPT: Findings and Fix instructions bodies are "None".\n- Findings must explain WHY, not just WHAT.\n`;
+  return `You are reviewing a ${agentLabel(input.coder)} implementation. Be a senior reviewer, not a linter.\n\nSpec: ${input.spec}\nTrack: ${input.track}\nBase: ${input.base}\nPass: ${input.pass}\nPrior reviews:\n${input.priors}\nAcceptance criteria:\n${criteriaBlock(input.criteria)}\nOutput path: ${input.output}\n\nSteps:\n1. Read the spec and track.\n2. Run: git diff ${input.base}...HEAD\n3. Read prior reviews so you do not repeat resolved findings.\n4. Write the review to ${input.output} using this exact format:\n\n# Review ${input.pass}\n\nVerdict: <ACCEPT | REJECT | UNCLEAR>\n\n## Acceptance matrix\n\n| Criterion | Status | Implementation evidence | Test evidence |\n| --- | --- | --- | --- |\n| AC1 | <PASS, FAIL, or UNCLEAR> | <code or behavior evidence> | <test, check, or explicit gap> |\n\n## Review flags\n\n- Silent decision: <present or absent> - <evidence, or None>\n- Scope drift: <present or absent> - <evidence, or None>\n- Missing test: <present or absent> - <evidence, or None>\n\n## Findings\n\n1. [severity] <file:line> - <symptom>. Root cause: <why>. Principle: <principle>.\n\n## Missing tests\n\n- <gap, or None>\n\n## Fix instructions\n\n1. <standalone instruction>\n\n## Notes\n\n- <scope, disputes, ambiguity questions, lessons, or None>\n\nRules:\n- The verdict line must appear verbatim.\n- ACCEPT requires every acceptance criterion PASS with concrete implementation evidence and concrete test evidence.${input.strict ? "\n- ACCEPT also requires regression-test evidence, red/green evidence when behavior changed, passing full tests, and 100% coverage when coverage tooling exists." : ""}\n- Flag a silent decision when the diff makes a tradeoff, default choice, compatibility choice, migration choice, or risk acceptance that is not recorded in the spec or track.\n- Flag scope drift when the diff changes behavior, public API, dependencies, files, or architecture outside the acceptance criteria, or includes a broad refactor not needed for the spec.\n- Flag a missing test when behavior changed without targeted test evidence, even if the full suite passed.\n- Use UNCLEAR only when spec ambiguity prevents a defensible ACCEPT or REJECT, and put the exact question in Notes.\n- For ACCEPT: Findings and Fix instructions bodies are "None".\n- Findings must explain WHY, not just WHAT.\n`;
 }
 
 async function synthesizeReport(
diff --git a/tests/devloop.test.ts b/tests/devloop.test.ts
index a6652bb..d89f2d9 100644
--- a/tests/devloop.test.ts
+++ b/tests/devloop.test.ts
@@ -2,7 +2,7 @@ import { afterAll, beforeEach, describe, expect, test } from "bun:test";
 import { mkdir, mkdtemp, readFile, realpath, rm, stat, writeFile } from "node:fs/promises";
 import { tmpdir } from "node:os";
 import path from "node:path";
-import { parseArgs, parseCriteria, parseVerdict, reportFraming, runDevloop, welcome, type Event, type Options } from "../src/devloop.ts";
+import { hasPassingMatrix, parseArgs, parseCriteria, parseVerdict, reportFraming, runDevloop, welcome, type Event, type Options } from "../src/devloop.ts";
 
 const root = await mkdtemp(path.join(tmpdir(), "devloop-test."));
 let oldPath = process.env.PATH ?? "";
@@ -51,6 +51,9 @@ describe("parsing", () => {
     expect(parseCriteria("# Spec")).toEqual([]);
     expect(parseVerdict("Verdict: ACCEPT\n")).toBe("ACCEPT");
     expect(parseVerdict("No verdict here\n")).toBe("");
+    const review = "## Acceptance matrix\n\n| Criterion | Status | Implementation evidence | Test evidence |\n| --- | --- | --- | --- |\n| AC1 | PASS | code path | bun test |\n| AC2 | PASS | behavior | typecheck |\n";
+    expect(hasPassingMatrix(review, 2)).toBe(true);
+    expect(hasPassingMatrix(review.replace("| AC2 | PASS |", "| AC2 | FAIL |"), 2)).toBe(false);
   });
 
   test("derives report framing from the spec", () => {
@@ -111,7 +114,7 @@ describe("loop", () => {
     expect(await readFile(path.join(worktree, ".codex/tracks/change.md"), "utf8")).toContain("- coder: codex");
     expect(await readFile(path.join(worktree, ".codex/tracks/change.md"), "utf8")).toContain("- reviewer: claude");
     expect(await readFile(path.join(worktree, ".codex/tracks/change.md"), "utf8")).toContain(`- source-repo: ${repo}`);
-    expect(await readFile(path.join(worktree, ".codex/reviews/change-r1.md"), "utf8")).toContain("- AC1: PASS");
+    expect(await readFile(path.join(worktree, ".codex/reviews/change-r1.md"), "utf8")).toContain("| AC1 | PASS | mock evidence | mock test |");
     const codexArgs = await readFile(path.join(state, "codex-args.log"), "utf8");
     expect(codexArgs.split(/\r?\n/, 1)[0]).toBe(`exec -s read-only -C ${repo} -`);
     expect(codexArgs).toContain(`exec --dangerously-bypass-approvals-and-sandbox -C ${worktree} -`);
@@ -134,6 +137,13 @@ describe("loop", () => {
     expect(reportPrompt).toContain("Haiku topic: Fixture spec - The loop runs deterministically under test.");
     expect(reportPrompt).toContain("rendered immediately after the subtitle before Metadata");
     expect(reportPrompt).toContain("The subtitle must be specific to this work");
+    expect(reportPrompt).toContain("| Criterion | Status | Implementation evidence | Test evidence |");
+    expect(reportPrompt).toContain("## Review flags");
+    expect(reportPrompt).toContain("Silent decision:");
+    expect(reportPrompt).toContain("Scope drift:");
+    expect(reportPrompt).toContain("Missing test:");
+    expect(reportPrompt).toContain("Flag scope drift when the diff changes behavior");
+    expect(reportPrompt).toContain("Use UNCLEAR only when spec ambiguity prevents a defensible ACCEPT or REJECT");
     expect(events).toContainEqual({ type: "done", id: "naming", ok: true, detail: "feat/change" });
     expect(events).toContainEqual({ type: "done", id: "worktree", ok: true, detail: worktree });
     expect(events.some((event) => event.type === "gate" && event.name === "acceptance criteria" && event.ok)).toBe(true);
@@ -521,8 +531,11 @@ if [[ "$prompt" == *"Output path:"* ]]; then
     [[ -n "\${DEVLOOP_TEST_NO_VERDICT:-}" ]] || printf 'Verdict: %s\\n\\n' "$verdict"
     if [[ -z "\${DEVLOOP_TEST_NO_MATRIX:-}" ]]; then
       printf '## Acceptance matrix\\n\\n'
-      printf -- '- AC1: PASS - mock evidence\\n\\n'
+      printf '| Criterion | Status | Implementation evidence | Test evidence |\\n'
+      printf '| --- | --- | --- | --- |\\n'
+      printf '| AC1 | PASS | mock evidence | mock test |\\n\\n'
     fi
+    printf '## Review flags\\n\\n- Silent decision: absent - None\\n- Scope drift: absent - None\\n- Missing test: absent - None\\n\\n'
     printf '## Findings\\n\\n'
     if [[ "$verdict" == "ACCEPT" ]]; then printf 'None\\n\\n'; else printf '1. [must-fix] devloop.ts:1 - repeated fixture finding. Root cause: mock review. Principle: deterministic retry behavior.\\n\\n'; fi
     printf '## Missing tests\\n\\n- None\\n\\n## Fix instructions\\n\\n'
@@ -583,8 +596,11 @@ if [[ "$prompt" == *"Output path:"* ]]; then
     [[ -n "\${DEVLOOP_TEST_NO_VERDICT:-}" ]] || printf 'Verdict: %s\\n\\n' "$verdict"
     if [[ -z "\${DEVLOOP_TEST_NO_MATRIX:-}" ]]; then
       printf '## Acceptance matrix\\n\\n'
-      printf -- '- AC1: PASS - mock evidence\\n\\n'
+      printf '| Criterion | Status | Implementation evidence | Test evidence |\\n'
+      printf '| --- | --- | --- | --- |\\n'
+      printf '| AC1 | PASS | mock evidence | mock test |\\n\\n'
     fi
+    printf '## Review flags\\n\\n- Silent decision: absent - None\\n- Scope drift: absent - None\\n- Missing test: absent - None\\n\\n'
     printf '## Findings\\n\\n'
     if [[ "$verdict" == "ACCEPT" ]]; then printf 'None\\n\\n'; else printf '1. [must-fix] devloop.ts:1 - repeated fixture finding. Root cause: mock review. Principle: deterministic retry behavior.\\n\\n'; fi
     printf '## Missing tests\\n\\n- None\\n\\n## Fix instructions\\n\\n'

From 9aa1d2b111db9988bc34a01ef73fd67eb693aae6 Mon Sep 17 00:00:00 2001
From: satyaborg <satya.borg@gmail.com>
Date: Thu, 28 May 2026 13:50:45 +1000
Subject: [PATCH 2/2] chore: show logo in readme

---
 README.md | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 2bbf38e..80e15bf 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,11 @@
-# devloop
+```text
+       __          __
+  ____/ /__ _   __/ /___  ____  ____
+ / __  / _ \ | / / / __ \/ __ \/ __ \
+/ /_/ /  __/ |/ / / /_/ / /_/ / /_/ /
+\__,_/\___/|___/_/\____/\____/ .___/
+                            /_/
+```
 
 `devloop` runs a configurable implementation and review loop. By default, Codex codes and Claude Code reviews.