rlorenzo · rlorenzo · Mar 21, 2026 · Mar 20, 2026 · Mar 21, 2026 · Mar 21, 2026
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -13,6 +13,8 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
+        with:
+          submodules: true
       - uses: actions/setup-python@v5
         with:
           python-version: "3.x"

diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,9 @@
+[submodule "test/bats/bats-core"]
+	path = test/bats/bats-core
+	url = https://github.com/bats-core/bats-core.git
+[submodule "test/bats/bats-support"]
+	path = test/bats/bats-support
+	url = https://github.com/bats-core/bats-support.git
+[submodule "test/bats/bats-assert"]
+	path = test/bats/bats-assert
+	url = https://github.com/bats-core/bats-assert.git
diff --git a/.markdownlint-cli2.yaml b/.markdownlint-cli2.yaml
@@ -1,2 +1,4 @@
 config:
   MD013: false # Line length — instructional prose runs long intentionally
+ignores:
+  - "test/fixtures/**"
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -16,3 +16,21 @@ repos:
     rev: v0.21.0
     hooks:
       - id: markdownlint-cli2
+
+  - repo: local
+    hooks:
+      - id: bats-unit-tests
+        name: BATS unit tests
+        entry: test/run
+        language: script
+        always_run: true
+        pass_filenames: false
+        stages: [pre-commit]
+
+      - id: smoke-tests
+        name: Smoke tests (real agents)
+        entry: test/smoke
+        language: script
+        always_run: true
+        pass_filenames: false
+        stages: [pre-push]
diff --git a/README.md b/README.md
@@ -134,6 +134,37 @@ The setup script only manages commands it originally installed.
 
 ## Contributing
 
+### Running Tests
+
+The test suite uses [BATS](https://github.com/bats-core/bats-core) (Bash Automated Testing System). After cloning with submodules:
+
+```bash
+git clone --recurse-submodules https://github.com/rlorenzo/ai-coding-setup.git
+cd ai-coding-setup
+test/run
+```
+
+If you already cloned without submodules:
+
+```bash
+git submodule update --init --recursive
+test/run
+```
+
+Unit tests (`test/run`) cover config parsing, prompt loading, validation, and review status checks. They run in seconds and need no API keys.
+
+### Smoke Tests
+
+Smoke tests run real AI agents against a temporary git repo to verify that CLI flags are accepted and agents can perform basic read/write tasks:
+
+```bash
+test/smoke                   # test all installed agents
+test/smoke claude codex      # test specific agents
+test/smoke --timeout 180     # override per-test timeout (default: 120s)
+```
+
+Each installed agent is tested as both editor (can it modify a file?) and reviewer (does it produce a review file?). Requires at least one AI tool installed and authenticated.
+
 ### Pre-commit hooks (optional)
 
 This repo uses [pre-commit](https://pre-commit.com/) to run linters locally before each commit. Install it once and you'll get automatic checks for shell scripts (shellcheck), markdown (markdownlint), and TOML syntax.

diff --git a/bin/code-review-loop b/bin/code-review-loop
@@ -45,7 +45,7 @@ Options:
   -r, --reviewer AGENT     Agent for code review (default: codex)
   -h, --help               Show this help message
 
-Agents: claude, codex, gemini
+Agents: claude, codex, gemini, copilot
 EOF
     exit 0
 }
@@ -84,32 +84,6 @@ REVIEWER_TOOLS="Read,Write,Bash,Grep,Glob"
 
 # ---- helpers -------------------------------------------------------------
 
-test_review_clean() {
-    [[ -f "$REVIEW_FILE" ]] || return 1
-    local content
-    content=$(<"$REVIEW_FILE")
-
-    # Only an explicit "Verdict: good to go" line signals clean — a bare
-    # substring match could false-positive on "not good to go"
-    if echo "$content" | grep -qiE "verdict[[:space:]]*:[[:space:]]*good to go"; then
-        return 0
-    fi
-
-    return 1
-}
-
-get_review_issue_counts() {
-    [[ -f "$REVIEW_FILE" ]] || { echo "No review file"; return; }
-    local content high medium low
-    content=$(<"$REVIEW_FILE")
-
-    high=$(echo "$content" | grep -oE "High[[:space:]]*:?[[:space:]]*[0-9]+" | grep -oE "[0-9]+" | head -1) || true
-    medium=$(echo "$content" | grep -oE "Medium[[:space:]]*:?[[:space:]]*[0-9]+" | grep -oE "[0-9]+" | head -1) || true
-    low=$(echo "$content" | grep -oE "Low[[:space:]]*:?[[:space:]]*[0-9]+" | grep -oE "[0-9]+" | head -1) || true
-
-    echo "High: ${high:-?}, Medium: ${medium:-?}, Low: ${low:-?}"
-}
-
 stage_review_changes() {
     # Re-stage files that are in the review scope (partially staged files are
     # rejected at startup, so git add here is safe)

diff --git a/bin/plan-review-loop b/bin/plan-review-loop
@@ -46,7 +46,7 @@ Options:
   -r, --reviewer AGENT     Agent for plan review (default: codex)
   -h, --help               Show this help message
 
-Agents: claude, codex, gemini
+Agents: claude, codex, gemini, copilot
 
 Examples:
   plan-review-loop PLAN-feature.md
@@ -97,60 +97,6 @@ REVIEWER_FOLLOWUP_PROMPT="$PROMPTS_DIR/plan-review-followup.md"
 EDITOR_TOOLS="Read,Write,Edit,Grep,Glob"
 REVIEWER_TOOLS="Read,Write,Grep,Glob"
 
-# ---- helpers -------------------------------------------------------------
-
-test_reviewer_satisfied() {
-    [[ -f "$FEEDBACK_FILE" ]] || return 1
-    # Match sentinel as a standalone line to avoid false positives from
-    # feedback that merely mentions the token in explanatory text
-    local trimmed
-    trimmed=$(sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//' -e '/^$/d' "$FEEDBACK_FILE")
-    [[ "$trimmed" == "NO_FURTHER_FEEDBACK" ]]
-}
-
-build_improvement_prompt() {
-    local plan_path="$1" feedback_path="$2" reviewer_name="$3" cycle="$4" max_cycles="$5"
-    cat <<PROMPT
-You are a senior software architect improving a plan document based on reviewer feedback.
-
-## Your Task
-
-1. **Read** the plan file: $plan_path
-2. **Read** the feedback file: $feedback_path
-3. **Evaluate** each feedback point critically. You are NOT obligated to accept every suggestion. Use your judgment:
-   - Accept and implement feedback that genuinely improves completeness, feasibility, clarity, or risk coverage.
-   - Decline feedback that is subjective preference, out of scope, or would degrade the plan.
-4. **Edit** the plan file directly to incorporate valid improvements. Do not rewrite sections that are already well-structured unless the feedback identifies a real gap.
-5. **Write** a response to $feedback_path (overwrite it) explaining your actions:
-   - For each accepted point: briefly describe what was changed and where.
-   - For each declined point: explain why (e.g., out of scope, already covered, disagree with reasoning).
-
-## Response Format
-
-Use these markers for each feedback item in your response:
-
-- Accepted: \`✅ Implemented: [brief description of what was changed]\`
-- Deferred: \`📝 Response: [reason for deferring, alternatives]\`
-- Declined: \`❌ Clarification: [reason for disagreeing]\`
-
-## Guidelines
-
-- Preserve the plan's existing structure and formatting conventions.
-- Do not add unnecessary boilerplate or padding.
-- Focus on substance: missing edge cases, risk gaps, unclear requirements, feasibility concerns.
-- If the reviewer's feedback is entirely minor or already addressed, say so clearly in your response.
-- This is cycle $cycle of $max_cycles with the $reviewer_name reviewer.
-
-## Output
-
-Your only output files are:
-- The improved plan file (edited in place)
-- The feedback response file (overwritten with your response)
-
-Do not create any other files. Do not stage or commit anything.
-PROMPT
-}
-
 # ---- validation ----------------------------------------------------------
 if [[ ! -f "$PLAN_FILE_PATH" ]]; then
     echo -e "${RED}ERROR: Plan file not found: $PLAN_FILE_PATH${NC}"
@@ -172,8 +118,8 @@ echo -e "${MAGENTA} Plan Review Loop${NC}"
 echo -e "${MAGENTA}========================================${NC}"
 echo " Plan file      : $PLAN_FILE"
 echo " Max iterations : $MAX_ITERATIONS"
-echo " Editor          : $EDITOR_AGENT"
-echo " Reviewer        : $REVIEWER_AGENT"
+echo " Editor         : $EDITOR_AGENT"
+echo " Reviewer       : $REVIEWER_AGENT"
 
 start_time=$(date +%s)
 iteration_data=""

diff --git a/lib/lib-review-loop b/lib/lib-review-loop
@@ -240,6 +240,101 @@ validate_prompts() {
     fi
 }
 
+# ---- review status checks ------------------------------------------------
+
+# Check whether a code review is clean (no actionable issues).
+# Requires REVIEW_FILE to be set by the caller.
+# Returns 0 if the review file contains "Verdict: good to go", 1 otherwise.
+test_review_clean() {
+    [[ -f "$REVIEW_FILE" ]] || return 1
+    local content
+    content=$(<"$REVIEW_FILE")
+
+    # Only an explicit "Verdict: good to go" line signals clean — a bare
+    # substring match could false-positive on "not good to go"
+    if echo "$content" | grep -qiE "verdict[[:space:]]*:[[:space:]]*good to go"; then
+        return 0
+    fi
+
+    return 1
+}
+
+# Extract issue counts from a code review file.
+# Requires REVIEW_FILE to be set by the caller.
+# Prints: "High: N, Medium: N, Low: N" (or ? for missing counts)
+get_review_issue_counts() {
+    [[ -f "$REVIEW_FILE" ]] || { echo "No review file"; return; }
+    local content high medium low
+    content=$(<"$REVIEW_FILE")
+
+    high=$(echo "$content" | grep -oE "High[[:space:]]*:?[[:space:]]*[0-9]+" | grep -oE "[0-9]+" | head -1) || true
+    medium=$(echo "$content" | grep -oE "Medium[[:space:]]*:?[[:space:]]*[0-9]+" | grep -oE "[0-9]+" | head -1) || true
+    low=$(echo "$content" | grep -oE "Low[[:space:]]*:?[[:space:]]*[0-9]+" | grep -oE "[0-9]+" | head -1) || true
+
+    echo "High: ${high:-?}, Medium: ${medium:-?}, Low: ${low:-?}"
+}
+
+# Check whether a plan reviewer is satisfied (no further feedback).
+# Requires FEEDBACK_FILE to be set by the caller.
+# Returns 0 if the feedback file contains only "NO_FURTHER_FEEDBACK", 1 otherwise.
+test_reviewer_satisfied() {
+    [[ -f "$FEEDBACK_FILE" ]] || return 1
+    # Match sentinel as a standalone line to avoid false positives from
+    # feedback that merely mentions the token in explanatory text
+    local trimmed
+    trimmed=$(sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//' -e '/^$/d' "$FEEDBACK_FILE")
+    [[ "$trimmed" == "NO_FURTHER_FEEDBACK" ]]
+}
+
+# Build the improvement prompt for the plan editor.
+#   $1 — plan file path
+#   $2 — feedback file path
+#   $3 — reviewer agent name
+#   $4 — current cycle number
+#   $5 — max cycles
+build_improvement_prompt() {
+    local plan_path="$1" feedback_path="$2" reviewer_name="$3" cycle="$4" max_cycles="$5"
+    cat <<PROMPT
+You are a senior software architect improving a plan document based on reviewer feedback.
+
+## Your Task
+
+1. **Read** the plan file: $plan_path
+2. **Read** the feedback file: $feedback_path
+3. **Evaluate** each feedback point critically. You are NOT obligated to accept every suggestion. Use your judgment:
+   - Accept and implement feedback that genuinely improves completeness, feasibility, clarity, or risk coverage.
+   - Decline feedback that is subjective preference, out of scope, or would degrade the plan.
+4. **Edit** the plan file directly to incorporate valid improvements. Do not rewrite sections that are already well-structured unless the feedback identifies a real gap.
+5. **Write** a response to $feedback_path (overwrite it) explaining your actions:
+   - For each accepted point: briefly describe what was changed and where.
+   - For each declined point: explain why (e.g., out of scope, already covered, disagree with reasoning).
+
+## Response Format
+
+Use these markers for each feedback item in your response:
+
+- Accepted: \`✅ Implemented: [brief description of what was changed]\`
+- Deferred: \`📝 Response: [reason for deferring, alternatives]\`
+- Declined: \`❌ Clarification: [reason for disagreeing]\`
+
+## Guidelines
+
+- Preserve the plan's existing structure and formatting conventions.
+- Do not add unnecessary boilerplate or padding.
+- Focus on substance: missing edge cases, risk gaps, unclear requirements, feasibility concerns.
+- If the reviewer's feedback is entirely minor or already addressed, say so clearly in your response.
+- This is cycle $cycle of $max_cycles with the $reviewer_name reviewer.
+
+## Output
+
+Your only output files are:
+- The improved plan file (edited in place)
+- The feedback response file (overwritten with your response)
+
+Do not create any other files. Do not stage or commit anything.
+PROMPT
+}
+
 # ---- temp directory with cleanup -----------------------------------------
 
 # Create a temp directory and register a cleanup trap.

diff --git a/test/bats/bats-assert b/test/bats/bats-assert
diff --git a/test/bats/bats-core b/test/bats/bats-core
diff --git a/test/bats/bats-support b/test/bats/bats-support
diff --git a/test/code-review-loop.bats b/test/code-review-loop.bats
@@ -0,0 +1,35 @@
+#!/usr/bin/env bats
+# Tests for bin/code-review-loop — argument parsing and validation.
+
+load test_helper
+
+BIN="$PROJECT_ROOT/bin/code-review-loop"
+
+@test "code-review-loop --help prints usage and exits 0" {
+    run "$BIN" --help
+    assert_success
+    assert_output --partial "Usage: code-review-loop"
+}
+
+@test "code-review-loop -h prints usage and exits 0" {
+    run "$BIN" -h
+    assert_success
+    assert_output --partial "Usage: code-review-loop"
+}
+
+@test "code-review-loop rejects unknown options" {
+    run "$BIN" --bogus
+    assert_output --partial "Unknown option"
+}
+
+@test "code-review-loop --max-iterations without value shows error" {
+    run "$BIN" -m
+    assert_output --partial "requires a value"
+}
+
+@test "code-review-loop rejects invalid agent name" {
+    # Prompt files won't exist, but agent validation happens first
+    run "$BIN" --editor gpt4
+    assert_failure
+    assert_output --partial "Unknown agent"
+}
diff --git a/test/fixtures/prompt-blank-lines.md b/test/fixtures/prompt-blank-lines.md
@@ -0,0 +1,5 @@
+
+
+  Indented content here.
+
+Another line.
diff --git a/test/fixtures/prompt-no-frontmatter.md b/test/fixtures/prompt-no-frontmatter.md
@@ -0,0 +1 @@
+Review the code for correctness and clarity.
diff --git a/test/fixtures/prompt-with-frontmatter.md b/test/fixtures/prompt-with-frontmatter.md
@@ -0,0 +1,6 @@
+---
+name: test-prompt
+description: A test prompt with frontmatter
+---
+
+Review the code for correctness and clarity.
diff --git a/test/fixtures/review-clean.md b/test/fixtures/review-clean.md
@@ -0,0 +1,9 @@
+# Code Review
+
+All checks passed. No issues found.
+
+Verdict: good to go
+
+High: 0
+Medium: 0
+Low: 0
diff --git a/test/fixtures/review-issues.md b/test/fixtures/review-issues.md
@@ -0,0 +1,9 @@
+# Code Review
+
+Several issues found during review.
+
+Verdict: needs work
+
+High: 2
+Medium: 3
+Low: 1