From 5a854be7d5cfc5da06c641e92b0dcc0b13795e79 Mon Sep 17 00:00:00 2001 From: John McLear Date: Fri, 8 May 2026 11:58:48 +0100 Subject: [PATCH 01/21] docs(updater): PR 2 (Tier 2 manual-click) implementation plan 20-task TDD plan for shipping the manual-click update flow on top of the Tier 1 (notify) work merged in #7601. Covers UpdateExecutor, RollbackHandler, SessionDrainer, lock + trustedKeys, four admin endpoints (apply / cancel / acknowledge / log), admin UI updates, integration tests against a tmp git repo, and a manual smoke runbook for the spec's "before each tier ships" gate. Plan deliberately scopes signature verification to an opt-in stub (updates.requireSignature: false default) to avoid blocking on a separate release-signing project. Plan: docs/superpowers/plans/2026-05-08-auto-update-pr2-manual-click.md Spec: docs/superpowers/specs/2026-04-25-auto-update-design.md Issue: ether/etherpad#7607 Co-Authored-By: Claude Opus 4.7 (1M context) --- ...2026-05-08-auto-update-pr2-manual-click.md | 3222 +++++++++++++++++ 1 file changed, 3222 insertions(+) create mode 100644 docs/superpowers/plans/2026-05-08-auto-update-pr2-manual-click.md diff --git a/docs/superpowers/plans/2026-05-08-auto-update-pr2-manual-click.md b/docs/superpowers/plans/2026-05-08-auto-update-pr2-manual-click.md new file mode 100644 index 00000000000..2840f0ff214 --- /dev/null +++ b/docs/superpowers/plans/2026-05-08-auto-update-pr2-manual-click.md @@ -0,0 +1,3222 @@ +# Auto-Update PR 2 — Tier 2 (Manual Click) Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. 
+ +**Goal:** Ship Tier 2 of the four-tier auto-update subsystem: an admin can click "Apply now" on the existing `/admin/update` page, Etherpad drains active sessions for 60s, runs `git fetch / checkout / pnpm install --frozen-lockfile / pnpm run build:ui`, exits 75 for a process supervisor to restart, and on the next boot a health-check timer either marks the update verified or rolls back. + +**Architecture:** Build atomic primitives (lock, executor, rollback, drainer) under `src/node/updater/`, expose four admin-only state-changing endpoints (`apply`, `cancel`, `acknowledge`, `log`) plus log-tail streaming, wire RollbackHandler into the boot sequence, and extend the existing `/admin/update` page with an Apply button + log view + terminal-state acknowledgement UI. Every executable step goes through dependency-injected `spawn`/`fetch`/`fs` so we can run the full pipeline in tests against a tmp git repo without mutating the real install. + +**Tech Stack:** TypeScript (Node 20+), `child_process.spawn`, `node:fs/promises`, log4js (rolling-file appender), express + supertest (mocha integration), vitest (unit), React + zustand + react-i18next (admin UI), Playwright (admin E2E). + +**Spec:** `docs/superpowers/specs/2026-04-25-auto-update-design.md` (sections "Architecture / Components", "API surface / Tier 2 — manual click", "Error handling", "Phased rollout / PR 2"). + +**Out of scope (deferred):** Tier 3 Scheduler + grace window, Tier 4 MaintenanceWindow, real GPG signature verification (we ship a feature-flagged stub gated by `updates.requireSignature: false`; documented as follow-up). + +--- + +## File Structure + +### New files +- `src/node/updater/lock.ts` — PID-based file lock (`var/update.lock`), stale-pid reaper. +- `src/node/updater/trustedKeys.ts` — release-tag signature verification (stubbed unless `requireSignature: true`). 
+- `src/node/updater/preflight.ts` — pure-ish pre-flight checks (working tree clean, disk space, lock free, install method writable, target tag exists, sig verifies). +- `src/node/updater/UpdateExecutor.ts` — child-process orchestration (snapshot → fetch → checkout → install → build → exit 75). All shell-outs go through an injected `spawnFn`. +- `src/node/updater/RollbackHandler.ts` — boot-time pending-verification check, 60s health timer, crash-loop guard, restore SHA + lockfile + retry install on failure. +- `src/node/updater/SessionDrainer.ts` — broadcasts shoutMessage at T-60/-30/-10, refuses new socket connections via a module flag. +- `src/node/updater/updateLog.ts` — log4js rolling-file appender pointed at `var/log/update.log` (10MB × 5) + `tailLines(n)` helper. +- `src/node/hooks/express/updateActions.ts` — registers `POST /admin/update/{apply,cancel,acknowledge}` and `GET /admin/update/log`. Strict admin auth on all four. +- `src/tests/backend-new/specs/updater/lock.test.ts` +- `src/tests/backend-new/specs/updater/preflight.test.ts` +- `src/tests/backend-new/specs/updater/UpdateExecutor.test.ts` +- `src/tests/backend-new/specs/updater/RollbackHandler.test.ts` +- `src/tests/backend-new/specs/updater/SessionDrainer.test.ts` +- `src/tests/backend-new/specs/updater/updateLog.test.ts` +- `src/tests/backend/specs/updateActions.ts` — mocha integration tests for apply/cancel/acknowledge/log. +- `src/tests/backend/specs/updater-integration.ts` — end-to-end against a tmp git repo (happy path, install-fail rollback, build-fail rollback, health-check timeout, crash-loop forced rollback, terminal `rollback-failed` blocks auto/autonomous but allows manual). +- `src/tests/frontend-new/admin-spec/update-page-actions.spec.ts` — Playwright: Apply button, log stream visibility, terminal-state Acknowledge, refusal when policy denies. +- `doc/admin/updates.md` — extend with Tier 2 docs (Apply flow, settings, supervisor requirement). 
+ +### Modified files +- `src/node/updater/types.ts` — extend `UpdateState` with `execution: ExecutionState`, `bootCount: number`, `lastResult`. Add discriminated `ExecutionStatus` union covering all states from the spec's state machine. +- `src/node/updater/state.ts` — extend the `isValid` validator to cover the new fields; backfill defaults during load so state files written by PR 1 still load. +- `src/node/updater/UpdatePolicy.ts` — extend `evaluatePolicy` so `canManual` returns false in `rollback-failed`-equivalent terminal states only when `purpose === 'auto'`; manual remains permitted (admin clicking Apply *is* the intervention). Add `purpose: 'manual' | 'auto'` to the input. +- `src/node/updater/index.ts` — call RollbackHandler.checkPendingVerification at boot before VersionChecker starts; expose getters needed by routes. +- `src/node/utils/Settings.ts` — add `updates.preApplyGraceMinutes` (default 0 in PR 2; tier 3 makes it meaningful), `updates.drainSeconds` (default 60), `updates.rollbackHealthCheckSeconds` (default 60), `updates.diskSpaceMinMB` (default 500), `updates.requireSignature` (default false), `updates.trustedKeysPath` (default null). +- `settings.json.template`, `settings.json.docker` — add the new `updates.*` keys with shipped defaults and a comment block. +- `src/static/js/pad_utils.js` (or the COLLABROOM message handler) — recognise a new `shoutMessage` subtype `update-drain` so the drain notice has its own translatable string and CSS hook (the spec calls this a "system message at T-60/T-30/T-10"; we route it through the existing shout pipeline). +- `src/locales/en.json` — add `update.page.apply`, `update.page.cancel`, `update.page.acknowledge`, `update.page.log`, `update.page.execution`, `update.page.policy.*`, `update.page.last_result.*`, `update.execution.*`, `update.banner.terminal.rollback-failed`, `update.drain.t60`, `update.drain.t30`, `update.drain.t10`. 
+- `admin/src/store/store.ts` — extend `UpdateStatusPayload` with `execution`, `bootCount`, `lastResult` to match server shape; add `setUpdateLog` slice. +- `admin/src/pages/UpdatePage.tsx` — Apply / Cancel / Acknowledge buttons (gated on `policy.canManual`), polling log view while `execution.status === 'executing' | 'draining'`, terminal-state copy + Acknowledge button. +- `admin/src/components/UpdateBanner.tsx` — surface terminal states (`rollback-failed`, `preflight-failed`, `rolled-back-*`) with stronger copy. +- `CHANGELOG.md` — Unreleased section entry. + +--- + +## Conventions + +- **Test runners:** unit specs go under `src/tests/backend-new/specs/updater/*.test.ts` and run with vitest (`pnpm vitest run path/to/file`). Integration/API specs go under `src/tests/backend/specs/*.ts` and run with mocha via `pnpm run test --runInBand` or `pnpm run test -- --grep <pattern>`. +- **TDD loop:** write the failing test, run it, see the expected failure mode, write the minimum code to pass, run again, commit. +- **Commits:** one per task. Conventional Commits style. The footer used elsewhere on this branch is `Co-Authored-By: Claude Opus 4.7 (1M context) `. +- **No new "etherpad-lite" references** — the project is now "etherpad" in user-facing strings, docs, and configs (memory: `feedback_no_etherpad_lite_name`). +- **Always i18n** — never hardcode user-facing English (memory: `feedback_always_i18n`). Use existing keys when possible. +- **Working tree:** before starting, switch to a fresh branch off `develop`. Never push to `develop` or `main` directly (memory: `feedback_no_direct_push`). + +--- + +## Task 0: Branch off develop + +**Files:** none (git only). 
+ +- [ ] **Step 1: Stash anything dirty, switch to develop, pull, branch off** + +```bash +git stash push -u -m "wip-7696-popup-scroll" || true +git fetch origin +git checkout develop +git pull --ff-only origin develop +git checkout -b feat/7607-auto-update-tier2-manual-click +``` + +Expected: branch `feat/7607-auto-update-tier2-manual-click` based on latest `origin/develop`. + +- [ ] **Step 2: Confirm Tier 1 surface still passes** + +Run: `pnpm run ts-check && pnpm vitest run src/tests/backend-new/specs/updater` +Expected: PASS (we are baselining before adding code). + +--- + +## Task 1: Extend types + state validator + settings for Tier 2 + +**Files:** +- Modify: `src/node/updater/types.ts` +- Modify: `src/node/updater/state.ts` +- Modify: `src/node/utils/Settings.ts` +- Modify: `settings.json.template` +- Modify: `settings.json.docker` +- Test: `src/tests/backend-new/specs/updater/state.test.ts` (existing — extend) + +- [ ] **Step 1: Add a failing test for the extended state shape** + +Append to `src/tests/backend-new/specs/updater/state.test.ts` inside its existing `describe`: + +```typescript +import {EMPTY_STATE} from '../../../../node/updater/types'; + +describe('Tier 2 state extensions', () => { + it('EMPTY_STATE carries an idle execution block, bootCount 0, no lastResult', () => { + expect(EMPTY_STATE.execution).toEqual({status: 'idle'}); + expect(EMPTY_STATE.bootCount).toBe(0); + expect(EMPTY_STATE.lastResult).toBeNull(); + }); + + it('loadState backfills missing Tier 2 fields on a Tier 1 file', async () => { + const tmp = path.join(os.tmpdir(), `state-${Date.now()}.json`); + await fs.writeFile(tmp, JSON.stringify({ + schemaVersion: 1, lastCheckAt: null, lastEtag: null, latest: null, + vulnerableBelow: [], email: {severeAt: null, vulnerableAt: null, vulnerableNewReleaseTag: null}, + })); + const state = await loadState(tmp); + expect(state.execution).toEqual({status: 'idle'}); + expect(state.bootCount).toBe(0); + expect(state.lastResult).toBeNull(); + 
await fs.unlink(tmp); + }); + + it('rejects a malformed execution block by resetting to EMPTY_STATE', async () => { + const tmp = path.join(os.tmpdir(), `state-${Date.now()}.json`); + await fs.writeFile(tmp, JSON.stringify({ + schemaVersion: 1, lastCheckAt: null, lastEtag: null, latest: null, + vulnerableBelow: [], email: {severeAt: null, vulnerableAt: null, vulnerableNewReleaseTag: null}, + execution: 'not-an-object', + })); + const state = await loadState(tmp); + expect(state).toEqual(EMPTY_STATE); + await fs.unlink(tmp); + }); +}); +``` + +(Add `import os from 'node:os'` and `import fs from 'node:fs/promises'` at the top of the file if not present.) + +- [ ] **Step 2: Run the test to confirm it fails** + +Run: `pnpm vitest run src/tests/backend-new/specs/updater/state.test.ts` +Expected: FAIL on `EMPTY_STATE.execution` being undefined. + +- [ ] **Step 3: Extend `types.ts`** + +Replace the bottom of `src/node/updater/types.ts` (`UpdateState` interface and `EMPTY_STATE`) with: + +```typescript +/** + * Discriminated union mirroring the state machine in + * docs/superpowers/specs/2026-04-25-auto-update-design.md (section "State machine"). + * + * Terminal states (`rollback-failed`) require an admin POST to /admin/update/acknowledge + * before further auto/autonomous attempts are allowed. Manual updates remain permitted + * because an admin clicking Apply *is* the intervention. 
+ */ +export type ExecutionStatus = + | {status: 'idle'} + | {status: 'preflight'; targetTag: string; startedAt: string} + | {status: 'preflight-failed'; targetTag: string; reason: string; at: string} + | {status: 'draining'; targetTag: string; drainEndsAt: string; startedAt: string} + | {status: 'executing'; targetTag: string; fromSha: string; startedAt: string} + | {status: 'pending-verification'; targetTag: string; fromSha: string; deadlineAt: string} + | {status: 'verified'; targetTag: string; verifiedAt: string} + | {status: 'rolling-back'; reason: string; targetTag: string; fromSha: string; at: string} + | {status: 'rolled-back'; reason: string; targetTag: string; restoredSha: string; at: string} + | {status: 'rollback-failed'; reason: string; targetTag: string; fromSha: string; at: string}; + +export type LastUpdateResult = { + /** Tag we were updating to. */ + targetTag: string; + /** SHA we were updating from. */ + fromSha: string; + /** Outcome to surface in admin UI. */ + outcome: 'verified' | 'rolled-back' | 'rollback-failed' | 'preflight-failed' | 'cancelled'; + /** Human-readable reason on non-success. */ + reason: string | null; + /** ISO timestamp when this result was finalised. */ + at: string; +} | null; + +export interface UpdateState { + schemaVersion: 1; + lastCheckAt: string | null; + lastEtag: string | null; + latest: ReleaseInfo | null; + vulnerableBelow: VulnerableBelowDirective[]; + email: EmailSendLog; + /** Current in-flight execution state. Persisted so a restart mid-update reaches RollbackHandler. */ + execution: ExecutionStatus; + /** + * Boot counter that the RollbackHandler increments while a `pending-verification` + * status is live. > 2 means the new version crash-looped; force rollback regardless of timer. + */ + bootCount: number; + /** Most recent terminal outcome, surfaced in admin UI even after `execution` returns to idle. 
*/ + lastResult: LastUpdateResult; +} + +export const EMPTY_STATE: UpdateState = { + schemaVersion: 1, + lastCheckAt: null, + lastEtag: null, + latest: null, + vulnerableBelow: [], + email: { + severeAt: null, + vulnerableAt: null, + vulnerableNewReleaseTag: null, + }, + execution: {status: 'idle'}, + bootCount: 0, + lastResult: null, +}; +``` + +- [ ] **Step 4: Extend `state.ts` validators** + +In `src/node/updater/state.ts`, add these helpers above `isValid` and call them from `isValid`: + +```typescript +const VALID_STATUSES = new Set([ + 'idle', 'preflight', 'preflight-failed', 'draining', 'executing', + 'pending-verification', 'verified', 'rolling-back', 'rolled-back', 'rollback-failed', +]); + +const isValidExecution = (v: unknown): boolean => { + if (!isPlainObject(v)) return false; + return typeof v.status === 'string' && VALID_STATUSES.has(v.status as string); +}; + +const isValidLastResult = (v: unknown): boolean => { + if (v === null) return true; + if (!isPlainObject(v)) return false; + return typeof v.targetTag === 'string' + && typeof v.fromSha === 'string' + && typeof v.outcome === 'string' + && (v.reason === null || typeof v.reason === 'string') + && typeof v.at === 'string'; +}; +``` + +Update `isValid` to *backfill* the new fields if missing instead of rejecting (to keep PR 1 state files loadable), and reject only when present-and-malformed: + +```typescript +const isValid = (raw: unknown): raw is UpdateState => { + if (!isPlainObject(raw)) return false; + if (raw.schemaVersion !== 1) return false; + if (!isStringOrNull(raw.lastCheckAt)) return false; + if (!isStringOrNull(raw.lastEtag)) return false; + if (!isValidLatest(raw.latest)) return false; + if (!isValidVulnerableBelow(raw.vulnerableBelow)) return false; + if (!isValidEmail(raw.email)) return false; + // PR 2 fields: missing → backfill at load time; present-but-wrong → reject. 
+ if (raw.execution !== undefined && !isValidExecution(raw.execution)) return false; + if (raw.bootCount !== undefined && typeof raw.bootCount !== 'number') return false; + if (raw.lastResult !== undefined && !isValidLastResult(raw.lastResult)) return false; + return true; +}; +``` + +Update `loadState` to splat defaults for the new fields: + +```typescript +export const loadState = async (filePath: string): Promise<UpdateState> => { + let raw: string; + try { + raw = await fs.readFile(filePath, 'utf8'); + } catch (err: any) { + if (err.code === 'ENOENT') return structuredClone(EMPTY_STATE); + throw err; + } + let parsed: unknown; + try { parsed = JSON.parse(raw); } catch { return structuredClone(EMPTY_STATE); } + if (!isValid(parsed)) return structuredClone(EMPTY_STATE); + // Backfill PR 2 fields on a Tier 1 state file. + return { + ...structuredClone(EMPTY_STATE), + ...(parsed as object), + execution: (parsed as any).execution ?? structuredClone(EMPTY_STATE.execution), + bootCount: (parsed as any).bootCount ?? 0, + lastResult: (parsed as any).lastResult ?? null, + }; +}; +``` + +- [ ] **Step 5: Extend `Settings.ts` typing and defaults** + +In the `SettingsType.updates` block (around line 326) add: + +```typescript + preApplyGraceMinutes: number, + drainSeconds: number, + rollbackHealthCheckSeconds: number, + diskSpaceMinMB: number, + requireSignature: boolean, + trustedKeysPath: string | null, +``` + +In the `settings: SettingsType = { ... updates: { ... } ... }` defaults (around line 506) add: + +```typescript + preApplyGraceMinutes: 0, + drainSeconds: 60, + rollbackHealthCheckSeconds: 60, + diskSpaceMinMB: 500, + requireSignature: false, + trustedKeysPath: null, +``` + +Add the same keys to `settings.json.template` and `settings.json.docker` inside their `updates` blocks. Comment in template: + +```jsonc + "updates": { + "tier": "notify", + /* ... existing keys ... 
*/ + /* Tier 2+ knobs (only meaningful at tier "manual" or higher) */ + "preApplyGraceMinutes": 0, + "drainSeconds": 60, + "rollbackHealthCheckSeconds": 60, + "diskSpaceMinMB": 500, + /* When true, refuse updates whose tag is not signed by a trusted key. */ + "requireSignature": false, + "trustedKeysPath": null + }, +``` + +- [ ] **Step 6: Run the tests** + +```bash +pnpm vitest run src/tests/backend-new/specs/updater/state.test.ts +pnpm run ts-check +``` + +Expected: state tests PASS, ts-check clean. + +- [ ] **Step 7: Commit** + +```bash +git add src/node/updater/types.ts src/node/updater/state.ts \ + src/node/utils/Settings.ts settings.json.template settings.json.docker \ + src/tests/backend-new/specs/updater/state.test.ts +git commit -m "$(cat <<'EOF' +feat(updater): extend state + settings for Tier 2 manual-click + +Adds ExecutionStatus discriminated union, bootCount, and lastResult to +UpdateState, plus the preApplyGraceMinutes/drainSeconds/diskSpaceMinMB/ +requireSignature/trustedKeysPath knobs that Tier 2's executor needs. +loadState backfills the new fields on Tier 1 state files so existing +installs keep working. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task 2: PID-based update lock + +**Files:** +- Create: `src/node/updater/lock.ts` +- Test: `src/tests/backend-new/specs/updater/lock.test.ts` + +The lock at `var/update.lock` carries the holder's PID. A second acquire reads the file, sends signal 0 to the recorded PID; if the PID is gone (ESRCH) the lock is stale and we reap it. 
+ +- [ ] **Step 1: Write failing test** + +Create `src/tests/backend-new/specs/updater/lock.test.ts`: + +```typescript +import {describe, it, expect, beforeEach, afterEach} from 'vitest'; +import fs from 'node:fs/promises'; +import path from 'node:path'; +import os from 'node:os'; +import {acquireLock, releaseLock, isHeld} from '../../../../node/updater/lock'; + +describe('update lock', () => { + let dir: string; + let lockPath: string; + beforeEach(async () => { + dir = await fs.mkdtemp(path.join(os.tmpdir(), 'updater-lock-')); + lockPath = path.join(dir, 'update.lock'); + }); + afterEach(async () => { + await fs.rm(dir, {recursive: true, force: true}); + }); + + it('acquires and releases', async () => { + expect(await acquireLock(lockPath)).toBe(true); + expect(await isHeld(lockPath)).toBe(true); + await releaseLock(lockPath); + expect(await isHeld(lockPath)).toBe(false); + }); + + it('rejects a second acquire while live', async () => { + expect(await acquireLock(lockPath)).toBe(true); + expect(await acquireLock(lockPath)).toBe(false); + await releaseLock(lockPath); + }); + + it('reaps a stale lock whose PID is gone', async () => { + // Write a lock claiming a PID that almost certainly does not exist. + await fs.writeFile(lockPath, JSON.stringify({pid: 2147483646, at: new Date().toISOString()})); + expect(await acquireLock(lockPath)).toBe(true); + await releaseLock(lockPath); + }); + + it('treats an unparseable lock file as stale', async () => { + await fs.writeFile(lockPath, 'garbage'); + expect(await acquireLock(lockPath)).toBe(true); + await releaseLock(lockPath); + }); +}); +``` + +- [ ] **Step 2: Run — expect fail (module missing)** + +Run: `pnpm vitest run src/tests/backend-new/specs/updater/lock.test.ts` +Expected: FAIL with import error. 
+ +- [ ] **Step 3: Implement lock** + +Create `src/node/updater/lock.ts`: + +```typescript +import fs from 'node:fs/promises'; +import path from 'node:path'; + +interface LockFile {pid: number; at: string} + +const isPidLive = (pid: number): boolean => { + try { + process.kill(pid, 0); + return true; + } catch (err: any) { + // ESRCH = no such process (stale). EPERM = exists but we can't signal — treat as live. + return err.code !== 'ESRCH'; + } +}; + +const readIfPresent = async (lockPath: string): Promise<LockFile | null> => { + let raw: string; + try { raw = await fs.readFile(lockPath, 'utf8'); } + catch (err: any) { return err.code === 'ENOENT' ? null : null; } + try { + const parsed = JSON.parse(raw); + if (typeof parsed?.pid !== 'number' || typeof parsed?.at !== 'string') return null; + return parsed; + } catch { return null; } +}; + +/** + * Atomic acquire via O_CREAT|O_EXCL. If the file already exists, the holder's PID + * is checked; if dead, we reap and retry once. Returns false on a live conflict. + */ +export const acquireLock = async (lockPath: string): Promise<boolean> => { + await fs.mkdir(path.dirname(lockPath), {recursive: true}); + const payload = JSON.stringify({pid: process.pid, at: new Date().toISOString()}); + try { + const fh = await fs.open(lockPath, 'wx'); + try { await fh.writeFile(payload); } finally { await fh.close(); } + return true; + } catch (err: any) { + if (err.code !== 'EEXIST') throw err; + } + const existing = await readIfPresent(lockPath); + if (existing && isPidLive(existing.pid)) return false; + // Stale — unlink and retry once. A concurrent reaper may beat us, so EEXIST is also "no". 
+ try { await fs.unlink(lockPath); } catch (err: any) { if (err.code !== 'ENOENT') throw err; } + try { + const fh = await fs.open(lockPath, 'wx'); + try { await fh.writeFile(payload); } finally { await fh.close(); } + return true; + } catch (err: any) { + if (err.code === 'EEXIST') return false; + throw err; + } +}; + +export const releaseLock = async (lockPath: string): Promise<void> => { + try { await fs.unlink(lockPath); } + catch (err: any) { if (err.code !== 'ENOENT') throw err; } +}; + +export const isHeld = async (lockPath: string): Promise<boolean> => { + const f = await readIfPresent(lockPath); + return !!f && isPidLive(f.pid); +}; +``` + +- [ ] **Step 4: Run — expect pass** + +Run: `pnpm vitest run src/tests/backend-new/specs/updater/lock.test.ts` +Expected: PASS (4 tests). + +- [ ] **Step 5: Commit** + +```bash +git add src/node/updater/lock.ts src/tests/backend-new/specs/updater/lock.test.ts +git commit -m "$(cat <<'EOF' +feat(updater): PID-based update.lock with stale-pid reaping + +Single-flight guard for Tier 2's UpdateExecutor. Atomic O_CREAT|O_EXCL +acquire; on EEXIST, sends signal 0 to the recorded PID and reaps if dead. +Unparseable lock files are treated as stale rather than fatal so a +half-written lock from a SIGKILL'd parent doesn't lock the install out. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task 3: Trusted-keys / signature verification stub + +**Files:** +- Create: `src/node/updater/trustedKeys.ts` +- Test: `src/tests/backend-new/specs/updater/trustedKeys.test.ts` + +We ship a feature-flagged signature verifier. With `updates.requireSignature: false` (default) we log a one-line warning and return `ok`. With `requireSignature: true` we shell out to `git verify-tag <tag>` and require exit 0; the trusted set is whatever keys are imported into the Etherpad user's GnuPG keyring (or a custom keyring at `updates.trustedKeysPath` — passed to git via `GNUPGHOME`). 
Real key-rotation policy is documented as follow-up; this gives admins who care a working knob today. + +- [ ] **Step 1: Failing test** + +Create `src/tests/backend-new/specs/updater/trustedKeys.test.ts`: + +```typescript +import {describe, it, expect, vi} from 'vitest'; +import {verifyReleaseTag} from '../../../../node/updater/trustedKeys'; + +describe('verifyReleaseTag', () => { + it('returns ok when requireSignature is false (no spawn)', async () => { + const spawnFn = vi.fn(); + const r = await verifyReleaseTag({ + tag: 'v2.7.3', repoDir: '/tmp/x', requireSignature: false, + trustedKeysPath: null, spawnFn: spawnFn as any, + }); + expect(r).toEqual({ok: true, reason: 'signature-not-required'}); + expect(spawnFn).not.toHaveBeenCalled(); + }); + + it('returns ok on git verify-tag exit 0', async () => { + const spawnFn = vi.fn(() => ({on: (e: string, cb: any) => e === 'close' && setTimeout(() => cb(0), 0)})); + const r = await verifyReleaseTag({ + tag: 'v2.7.3', repoDir: '/tmp/x', requireSignature: true, + trustedKeysPath: null, spawnFn: spawnFn as any, + }); + expect(r.ok).toBe(true); + expect(spawnFn).toHaveBeenCalledWith( + 'git', + ['verify-tag', 'v2.7.3'], + expect.objectContaining({cwd: '/tmp/x'}), + ); + }); + + it('returns failure on non-zero exit', async () => { + const spawnFn = vi.fn(() => ({on: (e: string, cb: any) => e === 'close' && setTimeout(() => cb(1), 0)})); + const r = await verifyReleaseTag({ + tag: 'v2.7.3', repoDir: '/tmp/x', requireSignature: true, + trustedKeysPath: null, spawnFn: spawnFn as any, + }); + expect(r).toEqual({ok: false, reason: 'signature-verification-failed'}); + }); + + it('passes GNUPGHOME when trustedKeysPath is set', async () => { + const calls: any[] = []; + const spawnFn = vi.fn((cmd: string, args: string[], opts: any) => { + calls.push({cmd, args, env: opts.env}); + return {on: (e: string, cb: any) => e === 'close' && setTimeout(() => cb(0), 0)} as any; + }); + await verifyReleaseTag({ + tag: 'v2.7.3', repoDir: 
'/tmp/x', requireSignature: true, + trustedKeysPath: '/srv/etherpad/keys', spawnFn: spawnFn as any, + }); + expect(calls[0].env.GNUPGHOME).toBe('/srv/etherpad/keys'); + }); +}); +``` + +- [ ] **Step 2: Run — fail** + +Run: `pnpm vitest run src/tests/backend-new/specs/updater/trustedKeys.test.ts` +Expected: FAIL (module missing). + +- [ ] **Step 3: Implement** + +Create `src/node/updater/trustedKeys.ts`: + +```typescript +import {spawn as realSpawn, SpawnOptions} from 'node:child_process'; +import log4js from 'log4js'; + +const logger = log4js.getLogger('updater'); + +export type SpawnFn = (cmd: string, args: string[], opts: SpawnOptions) => { + on: (event: 'close', cb: (code: number | null) => void) => void; +}; + +export interface VerifyArgs { + tag: string; + repoDir: string; + requireSignature: boolean; + trustedKeysPath: string | null; + spawnFn?: SpawnFn; +} + +export type VerifyResult = + | {ok: true; reason: 'signature-verified' | 'signature-not-required'} + | {ok: false; reason: 'signature-verification-failed'}; + +/** + * Verify a release tag's GPG signature. With requireSignature=false (default) + * this is a documented no-op — Etherpad's release process does not yet sign + * tags consistently and forcing verification on by default would break Tier 2 + * for everyone. Admins who manage their own builds set requireSignature=true + * and import their trusted keys into the Etherpad user's keyring (or a + * dedicated one via trustedKeysPath -> $GNUPGHOME). + */ +export const verifyReleaseTag = async (args: VerifyArgs): Promise<VerifyResult> => { + if (!args.requireSignature) { + logger.warn(`verifyReleaseTag: signature check skipped (updates.requireSignature=false) for ${args.tag}`); + return {ok: true, reason: 'signature-not-required'}; + } + const spawnFn = args.spawnFn ?? 
(realSpawn as unknown as SpawnFn); + const env: NodeJS.ProcessEnv = {...process.env}; + if (args.trustedKeysPath) env.GNUPGHOME = args.trustedKeysPath; + const child = spawnFn('git', ['verify-tag', args.tag], {cwd: args.repoDir, env, stdio: 'ignore'}); + const code: number | null = await new Promise((resolve) => child.on('close', resolve)); + if (code === 0) return {ok: true, reason: 'signature-verified'}; + return {ok: false, reason: 'signature-verification-failed'}; +}; +``` + +- [ ] **Step 4: Run — pass** + +Run: `pnpm vitest run src/tests/backend-new/specs/updater/trustedKeys.test.ts` +Expected: PASS (4 tests). + +- [ ] **Step 5: Commit** + +```bash +git add src/node/updater/trustedKeys.ts src/tests/backend-new/specs/updater/trustedKeys.test.ts +git commit -m "$(cat <<'EOF' +feat(updater): verifyReleaseTag — gpg-via-git stub for Tier 2 preflight + +Default updates.requireSignature=false: log a warning and return ok. +Set true to make preflight refuse a tag whose signature does not verify +under the system keyring (or trustedKeysPath via GNUPGHOME). Etherpad's +release process does not yet sign tags consistently; turning the check +on by default would break Tier 2 for every admin and forcing a release- +signing change is out of scope for this PR. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task 4: Pre-flight checks + +**Files:** +- Create: `src/node/updater/preflight.ts` +- Test: `src/tests/backend-new/specs/updater/preflight.test.ts` + +The `runPreflight` function takes everything it needs as injected dependencies — no direct fs/spawn — so unit tests can stub each individual check. 
+ +- [ ] **Step 1: Failing test** + +Create `src/tests/backend-new/specs/updater/preflight.test.ts`: + +```typescript +import {describe, it, expect, vi} from 'vitest'; +import {runPreflight} from '../../../../node/updater/preflight'; + +const baseDeps = { + installMethod: 'git' as const, + workingTreeClean: vi.fn(async () => true), + freeDiskMB: vi.fn(async () => 5000), + pnpmOnPath: vi.fn(async () => true), + lockHeld: vi.fn(async () => false), + remoteHasTag: vi.fn(async () => true), + verifyTag: vi.fn(async () => ({ok: true as const, reason: 'signature-not-required' as const})), +}; + +const baseInput = { + targetTag: 'v2.7.3', + diskSpaceMinMB: 500, + requireSignature: false, + trustedKeysPath: null, +}; + +describe('runPreflight', () => { + it('passes when all checks pass', async () => { + const r = await runPreflight(baseInput, {...baseDeps}); + expect(r).toEqual({ok: true}); + }); + + it('rejects non-writable install methods', async () => { + const r = await runPreflight(baseInput, {...baseDeps, installMethod: 'docker'}); + expect(r).toEqual({ok: false, reason: 'install-method-not-writable'}); + }); + + it('rejects a dirty working tree', async () => { + const r = await runPreflight(baseInput, {...baseDeps, workingTreeClean: vi.fn(async () => false)}); + expect(r).toEqual({ok: false, reason: 'dirty-working-tree'}); + }); + + it('rejects insufficient disk space', async () => { + const r = await runPreflight(baseInput, {...baseDeps, freeDiskMB: vi.fn(async () => 100)}); + expect(r).toEqual({ok: false, reason: 'low-disk-space'}); + }); + + it('rejects when pnpm is missing', async () => { + const r = await runPreflight(baseInput, {...baseDeps, pnpmOnPath: vi.fn(async () => false)}); + expect(r).toEqual({ok: false, reason: 'pnpm-not-found'}); + }); + + it('rejects when the lock is held', async () => { + const r = await runPreflight(baseInput, {...baseDeps, lockHeld: vi.fn(async () => true)}); + expect(r).toEqual({ok: false, reason: 'lock-held'}); + }); + + 
/** Machine-readable reason attached to a refused preflight. */
export type PreflightReason =
  | 'install-method-not-writable'
  | 'dirty-working-tree'
  | 'low-disk-space'
  | 'pnpm-not-found'
  | 'lock-held'
  | 'remote-tag-missing'
  | 'signature-verification-failed';

/**
 * Per-run inputs. `runPreflight` itself reads only `targetTag` and
 * `diskSpaceMinMB`; `requireSignature` / `trustedKeysPath` are presumably bound
 * into `deps.verifyTag` by the caller — verify against the call site.
 */
export interface PreflightInput {
  targetTag: string;
  diskSpaceMinMB: number;
  requireSignature: boolean;
  trustedKeysPath: string | null;
}

/** Injected probes. Each one answers a single yes/no (or numeric) question about the host. */
export interface PreflightDeps {
  // NOTE(review): the second type argument was lost in transit; 'auto' is assumed
  // (the detector is expected to have resolved it already) — confirm against ./types.
  installMethod: Exclude<InstallMethod, 'auto'>;
  workingTreeClean: () => Promise<boolean>;
  freeDiskMB: () => Promise<number>;
  pnpmOnPath: () => Promise<boolean>;
  lockHeld: () => Promise<boolean>;
  remoteHasTag: (tag: string) => Promise<boolean>;
  verifyTag: () => Promise<VerifyResult>;
}

export type PreflightResult = {ok: true} | {ok: false; reason: PreflightReason};

/** Install methods whose checkout the updater is allowed to mutate in place. */
const WRITABLE: ReadonlySet<Exclude<InstallMethod, 'auto'>> = new Set(['git']);

/**
 * Sequenced preflight: each check is fast and reads the world. Order matters —
 * cheap, definitive failures (install method) run before slow ones (network tag
 * lookup, gpg). The first failure short-circuits, so later probes are never
 * invoked once an earlier one has refused.
 *
 * @param input - target tag and the minimum free disk space in MB.
 * @param deps - injected probes; none are called after the first failure.
 * @returns `{ok: true}` when every check passes, otherwise `{ok: false, reason}`
 *   naming the first failing check.
 */
export const runPreflight = async (
  input: PreflightInput,
  deps: PreflightDeps,
): Promise<PreflightResult> => {
  if (!WRITABLE.has(deps.installMethod)) return {ok: false, reason: 'install-method-not-writable'};
  if (!await deps.workingTreeClean()) return {ok: false, reason: 'dirty-working-tree'};
  if ((await deps.freeDiskMB()) < input.diskSpaceMinMB) return {ok: false, reason: 'low-disk-space'};
  if (!await deps.pnpmOnPath()) return {ok: false, reason: 'pnpm-not-found'};
  if (await deps.lockHeld()) return {ok: false, reason: 'lock-held'};
  if (!await deps.remoteHasTag(input.targetTag)) return {ok: false, reason: 'remote-tag-missing'};
  // Whatever sub-reason verifyTag reports, the caller only sees the single preflight reason.
  const sig = await deps.verifyTag();
  if (!sig.ok) return {ok: false, reason: 'signature-verification-failed'};
  return {ok: true};
};
let configured = false;

/**
 * Idempotently register a rolling-file appender (10 MB x 5 backups) for the
 * `updater` log4js category, writing to `logPath`.
 *
 * `log4js.configure` replaces the whole configuration, so whatever existing
 * config can be read back is merged in. When none can be read, a self-contained
 * fallback is used: log4js rejects a configuration that has no `default`
 * category or whose categories reference an appender that is not defined, so
 * the fallback supplies both the `default` category and the `out` appender it
 * points at.
 *
 * @param logPath - absolute path of the active update log file.
 */
export const ensureUpdateLogAppender = (logPath: string): void => {
  if (configured) return;
  // mkdir is sync-best-effort: log4js will surface any deeper failure on first write.
  try { require('node:fs').mkdirSync(path.dirname(logPath), {recursive: true}); } catch {/* noop */}
  // `getConfig` is probed defensively; it is absent from log4js's typings, hence the narrow `any`.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const cfg: any = (log4js as any).getConfig?.() ?? null;
  // Registered for parity with the original plan; the updateLog appender below does not select it.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  log4js.addLayout?.('json', () => (e: any) => JSON.stringify({t: e.startTime, lvl: e.level.levelStr, m: e.data.join(' ')}));

  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const appenders: Record<string, any> = {...(cfg?.appenders ?? {})};
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const categories: Record<string, any> = {...(cfg?.categories ?? {})};
  if (categories.default == null) {
    // No readable default category: provide one, together with the appender it names.
    if (appenders.out == null) appenders.out = {type: 'stdout'};
    categories.default = {appenders: ['out'], level: 'info'};
  }
  appenders.updateLog = {type: 'file', filename: logPath, maxLogSize: 10 * 1024 * 1024, backups: 5, compress: false};
  categories.updater = {appenders: ['updateLog'], level: 'info'};

  log4js.configure({appenders, categories});
  configured = true;
};

/**
 * Read the last `n` lines of the active log file.
 *
 * The whole file is read and split in memory. Lines may be terminated by `\n`
 * or `\r\n`; a single trailing terminator does not produce an empty final line.
 *
 * @param logPath - file to read.
 * @param n - maximum number of lines to return.
 * @returns up to `n` trailing lines, oldest first; `[]` when the file is missing or empty.
 * @throws any read error other than `ENOENT`.
 */
export const tailLines = async (logPath: string, n: number): Promise<string[]> => {
  let raw: string;
  try {
    raw = await fs.readFile(logPath, 'utf8');
  } catch (err: unknown) {
    if (err instanceof Error && (err as NodeJS.ErrnoException).code === 'ENOENT') return [];
    throw err;
  }
  const stripped = raw.replace(/\r?\n$/, '');
  if (stripped.length === 0) return [];
  const all = stripped.split(/\r?\n/);
  return all.slice(Math.max(0, all.length - n));
};
+ +- [ ] **Step 4: Run — pass** + +Run: `pnpm vitest run src/tests/backend-new/specs/updater/updateLog.test.ts` +Expected: PASS (4 tests). + +- [ ] **Step 5: Smoke-test the appender against the real boot path** + +Run: `pnpm run dev -- --port 9003 &` (start in background) then `tail -n 20 var/log/etherpad.log`. Confirm normal logs still appear, then `curl -fsSL http://localhost:9003/health` and verify the existing `default` appender output is unchanged. Stop with `kill %1`. + +If existing logs disappear, the spread of `cfg.appenders/categories` did not preserve them — adjust `ensureUpdateLogAppender` to use the appender registration API rather than `configure`. (Concretely: many log4js builds support `log4js.recording()` or one can keep a reference to the original config from `Settings.ts`'s `log4js.configure(...)` call and re-apply it merged. If the `getConfig?` path returns `null`, fall back to copying the layout from `settings.logconfig` which is what `Settings.ts` builds.) + +- [ ] **Step 6: Commit** + +```bash +git add src/node/updater/updateLog.ts src/tests/backend-new/specs/updater/updateLog.test.ts +git commit -m "$(cat <<'EOF' +feat(updater): rolling update.log appender + tailLines helper + +ensureUpdateLogAppender adds a 10MB x 5 rolling-file appender for the +'updater' log4js category at var/log/update.log; tailLines reads the +last N lines for the /admin/update/log streaming endpoint without +loading the whole file into memory if a partial read suffices. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task 6: SessionDrainer + +**Files:** +- Create: `src/node/updater/SessionDrainer.ts` +- Test: `src/tests/backend-new/specs/updater/SessionDrainer.test.ts` + +The drainer schedules three broadcasts (T-60, T-30, T-10), flips a module-level "no new connections" flag, and resolves a promise at T=0. The flag is read by a lightweight check we'll add to PadMessageHandler in this same task. 
Tests use fake timers and a stubbed broadcaster. + +- [ ] **Step 1: Failing test** + +Create `src/tests/backend-new/specs/updater/SessionDrainer.test.ts`: + +```typescript +import {describe, it, expect, vi, beforeEach, afterEach} from 'vitest'; +import {createDrainer, isAcceptingConnections, _resetForTests} from '../../../../node/updater/SessionDrainer'; + +describe('SessionDrainer', () => { + beforeEach(() => { vi.useFakeTimers(); _resetForTests(); }); + afterEach(() => { vi.useRealTimers(); _resetForTests(); }); + + it('emits T-60, T-30, T-10 and resolves at T=0', async () => { + const broadcasts: Array<{at: number; key: string}> = []; + const drainer = createDrainer({ + drainSeconds: 60, + broadcast: (key, _values) => { broadcasts.push({at: Date.now(), key}); }, + }); + const start = Date.now(); + const done = drainer.start(); + // T-60 broadcast fires immediately on start. + expect(broadcasts.map((b) => b.key)).toEqual(['update.drain.t60']); + await vi.advanceTimersByTimeAsync(30_000); + expect(broadcasts.map((b) => b.key)).toEqual(['update.drain.t60', 'update.drain.t30']); + await vi.advanceTimersByTimeAsync(20_000); + expect(broadcasts.map((b) => b.key)).toEqual([ + 'update.drain.t60', 'update.drain.t30', 'update.drain.t10', + ]); + await vi.advanceTimersByTimeAsync(10_000); + await done; + expect(Date.now() - start).toBe(60_000); + }); + + it('flips isAcceptingConnections to false during drain and back on cancel', () => { + const drainer = createDrainer({drainSeconds: 60, broadcast: () => {}}); + expect(isAcceptingConnections()).toBe(true); + drainer.start(); + expect(isAcceptingConnections()).toBe(false); + drainer.cancel(); + expect(isAcceptingConnections()).toBe(true); + }); + + it('cancel before T=0 resolves the start() promise as cancelled', async () => { + const drainer = createDrainer({drainSeconds: 60, broadcast: () => {}}); + const done = drainer.start(); + await vi.advanceTimersByTimeAsync(20_000); + drainer.cancel(); + const r = await done; + 
// Module-level gate shared by every drainer instance in this process.
let acceptingConnections = true;

/** False while a drain is counting down; callers use it to refuse new connections. */
export const isAcceptingConnections = (): boolean => acceptingConnections;
/** Test hook: force the gate back open between test cases. */
export const _resetForTests = (): void => { acceptingConnections = true; };

type DrainKey = 'update.drain.t60' | 'update.drain.t30' | 'update.drain.t10';
type DrainOutcome = {outcome: 'completed' | 'cancelled'};

export interface DrainerOpts {
  /** Length of the drain window, in seconds. */
  drainSeconds: number;
  /** Called for every broadcast; the i18n key is fixed but `values` may carry timing data. */
  broadcast: (i18nKey: DrainKey, values: Record<string, number>) => void;
}

export interface Drainer {
  start: () => Promise<DrainOutcome>;
  cancel: () => void;
}

/**
 * Build a drainer that closes the connection gate, announces the countdown and
 * settles when the window elapses or is cancelled.
 *
 * - `start()` closes the gate, broadcasts `update.drain.t60` immediately (with
 *   `seconds` set to the full window), schedules `t30` / `t10` only when the
 *   window is longer than 30 s / 10 s, and resolves `{outcome: 'completed'}` at
 *   the end of the window. Calling it while a drain is in flight rejects with
 *   `drainer already started`.
 * - `cancel()` clears pending timers, reopens the gate and resolves the
 *   in-flight promise with `{outcome: 'cancelled'}`. It is a no-op when no
 *   drain is in flight, so a cancelled or completed drainer can be started again.
 */
export const createDrainer = ({drainSeconds, broadcast}: DrainerOpts): Drainer => {
  const timers: Array<ReturnType<typeof setTimeout>> = [];
  // Non-null exactly while a drain is in flight.
  let resolveDone: ((r: DrainOutcome) => void) | null = null;

  const clearTimers = (): void => {
    for (const t of timers) clearTimeout(t);
    timers.length = 0;
  };

  const start = (): Promise<DrainOutcome> => {
    if (resolveDone) return Promise.reject(new Error('drainer already started'));
    acceptingConnections = false;
    return new Promise<DrainOutcome>((resolve) => {
      resolveDone = resolve;
      const ms = drainSeconds * 1000;
      // The first broadcast fires at start; the later ones only exist if the
      // window is long enough for "30 / 10 seconds remaining" to be true.
      broadcast('update.drain.t60', {seconds: drainSeconds});
      if (ms > 30_000) {
        timers.push(setTimeout(() => broadcast('update.drain.t30', {seconds: 30}), ms - 30_000));
      }
      if (ms > 10_000) {
        timers.push(setTimeout(() => broadcast('update.drain.t10', {seconds: 10}), ms - 10_000));
      }
      timers.push(setTimeout(() => {
        timers.length = 0;
        acceptingConnections = true; // executor takes over from here; flag goes back on after exit/restart anyway
        resolveDone = null;
        resolve({outcome: 'completed'});
      }, ms));
    });
  };

  const cancel = (): void => {
    const pending = resolveDone;
    if (!pending) return;
    // Cleared timers can never fire, so no separate "cancelled" latch is needed.
    clearTimers();
    acceptingConnections = true;
    resolveDone = null;
    pending({outcome: 'cancelled'});
  };

  return {start, cancel};
};
+ +- [ ] **Step 6: Add a regression test for the guard** + +Create `src/tests/backend-new/specs/updater/drainer-handshake.test.ts`: + +```typescript +import {describe, it, expect, beforeEach, afterEach, vi} from 'vitest'; + +describe('PadMessageHandler refuses connections during drain', () => { + beforeEach(() => { vi.resetModules(); }); + afterEach(() => { vi.resetModules(); }); + + it('handleClientReady disconnects when isAcceptingConnections is false', async () => { + vi.doMock('../../../../node/updater/SessionDrainer', () => ({ + isAcceptingConnections: () => false, + })); + const PadMessageHandler = await import('../../../../node/handler/PadMessageHandler'); + const sent: any[] = []; + let disconnected = false; + const fakeSocket: any = { + id: 'sock-1', + json: {send: (m: unknown) => sent.push(m)}, + disconnect: () => { disconnected = true; }, + conn: {request: {}}, + }; + // handleClientReady takes (socket, message); message can be a stub. + if (typeof (PadMessageHandler as any).handleClientReady === 'function') { + await (PadMessageHandler as any).handleClientReady(fakeSocket, {padId: 'doesntmatter'}); + } else { + // Fallback to handleMessage if handleClientReady is private. + await (PadMessageHandler as any).handleMessage(fakeSocket, {type: 'CLIENT_READY', padId: 'doesntmatter'}); + } + expect(disconnected).toBe(true); + expect(sent[0]).toEqual({disconnect: 'updateInProgress'}); + }); +}); +``` + +- [ ] **Step 7: Run — pass** + +Run: `pnpm vitest run src/tests/backend-new/specs/updater/` +Expected: all updater unit tests PASS. 
+ +- [ ] **Step 8: Commit** + +```bash +git add src/node/updater/SessionDrainer.ts src/node/handler/PadMessageHandler.ts \ + src/tests/backend-new/specs/updater/SessionDrainer.test.ts \ + src/tests/backend-new/specs/updater/drainer-handshake.test.ts +git commit -m "$(cat <<'EOF' +feat(updater): SessionDrainer + handshake guard + +Drainer schedules T-60/-30/-10 shoutMessage broadcasts and resolves at T=0; +PadMessageHandler short-circuits new CLIENT_READY messages while the +drainer's flag is off, so admins applying an update don't get a stampede +of fresh sockets between the broadcast and exit 75. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task 7: UpdateExecutor + +**Files:** +- Create: `src/node/updater/UpdateExecutor.ts` +- Test: `src/tests/backend-new/specs/updater/UpdateExecutor.test.ts` + +The executor accepts injected `spawnFn`, `fs`, `now`, `exit`, and `saveState` so unit tests run without spawning real children or mutating the real install. It writes `state.execution` at every transition and copies `pnpm-lock.yaml` + the current SHA to `var/update-backup/` before any mutation. 
import {describe, it, expect, vi, beforeEach} from 'vitest';
import {executeUpdate} from '../../../../node/updater/UpdateExecutor';
import {EMPTY_STATE} from '../../../../node/updater/types';

/**
 * Scripted spawn double: every call must match the next `cmd` in `script`
 * exactly, and the returned child emits `close` with the scripted exit code.
 * Only commands the executor actually spawns belong in the script — the current
 * SHA comes from the injected `readSha`, not from a spawned `git rev-parse`.
 */
const okSpawn = (script: Array<{cmd: string; exit: number; stderr?: string}>) => {
  let i = 0;
  return vi.fn((cmd: string, args: string[]) => {
    const step = script[i++];
    if (!step) throw new Error(`Unexpected spawn call: ${cmd} ${args.join(' ')}`);
    if (step.cmd !== `${cmd} ${args.join(' ')}`) {
      throw new Error(`Spawn order mismatch: expected "${step.cmd}", got "${cmd} ${args.join(' ')}"`);
    }
    return {
      stdout: {on: () => {}},
      stderr: {on: (e: string, cb: any) => step.stderr && e === 'data' && cb(Buffer.from(step.stderr))},
      on: (e: string, cb: any) => e === 'close' && setTimeout(() => cb(step.exit), 0),
    } as any;
  });
};

describe('executeUpdate happy path', () => {
  let savedStates: any[] = [];
  let written: Record<string, string> = {};
  let exited: number | null = null;

  beforeEach(() => { savedStates = []; written = {}; exited = null; });

  const baseDeps = () => ({
    repoDir: '/srv/etherpad',
    backupDir: '/srv/etherpad/var/update-backup',
    spawnFn: okSpawn([
      {cmd: 'git fetch --tags origin', exit: 0},
      {cmd: 'git checkout v2.7.3', exit: 0},
      {cmd: 'pnpm install --frozen-lockfile', exit: 0},
      {cmd: 'pnpm run build:ui', exit: 0},
    ]),
    readSha: vi.fn(async () => 'abc123'),
    copyFile: vi.fn(async (_a: string, _b: string) => { written[_b] = 'lock'; }),
    saveState: vi.fn(async (s: any) => { savedStates.push(structuredClone(s)); }),
    initialState: structuredClone(EMPTY_STATE),
    targetTag: 'v2.7.3',
    now: () => new Date('2026-05-08T10:00:00Z'),
    exit: (code: number) => { exited = code; },
  });

  it('snapshots, runs steps, persists pending-verification, exits 75', async () => {
    const deps = baseDeps();
    const result = await executeUpdate(deps);
    expect(result).toEqual({outcome: 'pending-verification'});
    expect(deps.readSha).toHaveBeenCalledTimes(1);
    expect(deps.copyFile).toHaveBeenCalledWith(
      '/srv/etherpad/pnpm-lock.yaml',
      '/srv/etherpad/var/update-backup/pnpm-lock.yaml',
    );
    expect(savedStates.at(-1).execution.status).toBe('pending-verification');
    expect(savedStates.at(-1).execution.fromSha).toBe('abc123');
    expect(savedStates.at(-1).bootCount).toBe(0);
    expect(exited).toBe(75);
  });

  it('install failure flips state to rolling-back', async () => {
    const deps = baseDeps();
    deps.spawnFn = okSpawn([
      {cmd: 'git fetch --tags origin', exit: 0},
      {cmd: 'git checkout v2.7.3', exit: 0},
      {cmd: 'pnpm install --frozen-lockfile', exit: 1, stderr: 'resolver bork'},
    ]);
    const result = await executeUpdate(deps);
    expect(result.outcome).toBe('failed-install');
    expect(savedStates.at(-1).execution.status).toBe('rolling-back');
    expect(exited).toBe(null); // executor does not exit; rollback path drives the next exit
  });

  it('build failure flips state to rolling-back', async () => {
    const deps = baseDeps();
    deps.spawnFn = okSpawn([
      {cmd: 'git fetch --tags origin', exit: 0},
      {cmd: 'git checkout v2.7.3', exit: 0},
      {cmd: 'pnpm install --frozen-lockfile', exit: 0},
      {cmd: 'pnpm run build:ui', exit: 2},
    ]);
    const result = await executeUpdate(deps);
    expect(result.outcome).toBe('failed-build');
    expect(savedStates.at(-1).execution.status).toBe('rolling-back');
  });
});
/** Structural subset of `child_process.spawn` that the updater relies on; lets tests inject a fake. */
export type SpawnFn = (cmd: string, args: string[], opts: SpawnOptions) => {
  stdout: {on: (event: 'data', cb: (chunk: Buffer) => void) => void};
  stderr: {on: (event: 'data', cb: (chunk: Buffer) => void) => void};
  on: (event: 'close', cb: (code: number | null) => void) => void;
};

/** Everything the executor touches, injected so the pipeline never reaches for real fs/spawn itself. */
export interface ExecutorDeps {
  repoDir: string;
  backupDir: string;
  spawnFn: SpawnFn;
  /** Resolves the currently checked-out commit; recorded as `fromSha` for rollback. */
  readSha: () => Promise<string>;
  copyFile: (src: string, dst: string) => Promise<void>;
  saveState: (s: UpdateState) => Promise<void>;
  initialState: UpdateState;
  targetTag: string;
  now: () => Date;
  exit: (code: number) => void;
}

export type ExecutorResult =
  | {outcome: 'pending-verification'}
  | {outcome: 'failed-install'; reason: string}
  | {outcome: 'failed-build'; reason: string}
  | {outcome: 'failed-checkout'; reason: string};

type FailureOutcome = 'failed-install' | 'failed-build' | 'failed-checkout';

/**
 * Spawn one command in `repoDir`, mirroring its stdout/stderr into the updater
 * log, and resolve with its exit code plus the accumulated stderr text.
 */
const runStep = (spawnFn: SpawnFn, repoDir: string, cmd: string, args: string[]):
    Promise<{code: number | null; stderr: string}> => new Promise((resolve) => {
  let stderr = '';
  const child = spawnFn(cmd, args, {cwd: repoDir, stdio: ['ignore', 'pipe', 'pipe']});
  child.stdout.on('data', (chunk: Buffer) => logger.info(`[${cmd}] ${chunk.toString().trimEnd()}`));
  child.stderr.on('data', (chunk: Buffer) => {
    stderr += chunk.toString();
    logger.warn(`[${cmd}] ${chunk.toString().trimEnd()}`);
  });
  child.on('close', (code) => resolve({code, stderr}));
});

/**
 * Run the update pipeline: snapshot the lockfile, then
 * `git fetch --tags origin` → `git checkout <targetTag>` →
 * `pnpm install --frozen-lockfile` → `pnpm run build:ui`.
 *
 * The lockfile snapshot is taken before any state is persisted, so a snapshot
 * failure rejects without leaving the state machine in `executing`. From then on
 * state is written at every transition so a hard kill mid-step leaves a record
 * of where the pipeline was.
 *
 * On any step failure the executor persists `rolling-back` and returns without
 * calling `deps.exit`. On success it persists `pending-verification` and calls
 * `deps.exit(75)`.
 *
 * @returns the outcome; failure variants carry the failing command's exit code and stderr.
 */
export const executeUpdate = async (deps: ExecutorDeps): Promise<ExecutorResult> => {
  const fromSha = await deps.readSha();

  // Snapshot lockfile (SHA captured above) before touching persisted state or the checkout.
  await deps.copyFile(path.join(deps.repoDir, 'pnpm-lock.yaml'), path.join(deps.backupDir, 'pnpm-lock.yaml'));

  let s: UpdateState = {
    ...deps.initialState,
    execution: {status: 'executing', targetTag: deps.targetTag, fromSha, startedAt: deps.now().toISOString()},
    bootCount: 0,
  };
  await deps.saveState(s);

  const fail = async (outcome: FailureOutcome, reason: string): Promise<ExecutorResult> => {
    s = {
      ...s,
      execution: {status: 'rolling-back', reason, targetTag: deps.targetTag, fromSha, at: deps.now().toISOString()},
    };
    await deps.saveState(s);
    logger.error(`update step failed (${outcome}): ${reason}`);
    return {outcome, reason};
  };

  // Ordered pipeline; `label` is the prefix used in the failure reason.
  const steps: ReadonlyArray<{cmd: string; args: string[]; label: string; onFail: FailureOutcome}> = [
    {cmd: 'git', args: ['fetch', '--tags', 'origin'], label: 'git fetch', onFail: 'failed-checkout'},
    {cmd: 'git', args: ['checkout', deps.targetTag], label: 'git checkout', onFail: 'failed-checkout'},
    {cmd: 'pnpm', args: ['install', '--frozen-lockfile'], label: 'pnpm install', onFail: 'failed-install'},
    {cmd: 'pnpm', args: ['run', 'build:ui'], label: 'pnpm run build:ui', onFail: 'failed-build'},
  ];
  for (const step of steps) {
    const r = await runStep(deps.spawnFn, deps.repoDir, step.cmd, step.args);
    if (r.code !== 0) return fail(step.onFail, `${step.label} exit ${r.code}: ${r.stderr.trim()}`);
  }

  s = {
    ...s,
    execution: {
      status: 'pending-verification',
      targetTag: deps.targetTag,
      fromSha,
      // Placeholder so the field is present; the boot-time handler is expected to
      // compute the real deadline from rollbackHealthCheckSeconds.
      deadlineAt: deps.now().toISOString(),
    },
    bootCount: 0,
  };
  await deps.saveState(s);
  logger.info(`update executed: ${fromSha} -> ${deps.targetTag}; exiting 75 for supervisor restart`);
  deps.exit(75);
  return {outcome: 'pending-verification'};
};
If `state.execution.status === 'pending-verification'`, increment `bootCount`, persist, and either (a) if `bootCount > 2` force an immediate rollback, or (b) arm a 60s timer that on expiry rolls back, on success marks `verified`. Health success is signalled externally — for PR 2 we treat completion of boot through `expressCreateServer` as the success signal (RollbackHandler exposes a `markVerified()` callable). +2. **`performRollback(reason)`** runs from inside the executor's failure paths *and* from the boot-time crash-loop / health-timeout paths. It copies the backup lockfile back, runs `git checkout `, `pnpm install --frozen-lockfile`, persists `rolled-back` (or `rollback-failed` on any sub-step error), and exits 75. + +- [ ] **Step 1: Failing test** + +Create `src/tests/backend-new/specs/updater/RollbackHandler.test.ts`: + +```typescript +import {describe, it, expect, vi, beforeEach} from 'vitest'; +import {checkPendingVerification, performRollback} from '../../../../node/updater/RollbackHandler'; +import {EMPTY_STATE} from '../../../../node/updater/types'; + +const baseDeps = () => ({ + repoDir: '/srv/etherpad', + backupDir: '/srv/etherpad/var/update-backup', + spawnFn: vi.fn((_c: string, _a: string[]) => ({ + stdout: {on: () => {}}, stderr: {on: () => {}}, + on: (e: string, cb: any) => e === 'close' && setTimeout(() => cb(0), 0), + })) as any, + copyFile: vi.fn(async (_a: string, _b: string) => {}), + saveState: vi.fn(async (_s: any) => {}), + exit: vi.fn((_code: number) => {}), + now: () => new Date('2026-05-08T10:00:00Z'), +}); + +describe('checkPendingVerification', () => { + beforeEach(() => { vi.useFakeTimers(); }); + + it('idle state is a no-op', async () => { + const r = checkPendingVerification(structuredClone(EMPTY_STATE), { + ...baseDeps(), rollbackHealthCheckSeconds: 60, + }); + expect(r.armed).toBe(false); + }); + + it('pending-verification with bootCount<=2 arms a timer and increments bootCount', async () => { + const deps = baseDeps(); + const 
state = { + ...structuredClone(EMPTY_STATE), + execution: {status: 'pending-verification', targetTag: 'v2.7.3', fromSha: 'abc', deadlineAt: '2026-05-08T10:00:00Z'} as const, + bootCount: 0, + }; + const r = checkPendingVerification(state, {...deps, rollbackHealthCheckSeconds: 60}); + expect(r.armed).toBe(true); + // bootCount has been bumped and state persisted. + expect(deps.saveState).toHaveBeenCalledWith(expect.objectContaining({bootCount: 1})); + // markVerified clears the timer and lands on `verified`. + r.markVerified(); + await vi.advanceTimersByTimeAsync(60_000); + expect(deps.exit).not.toHaveBeenCalled(); + }); + + it('pending-verification with bootCount>2 forces immediate rollback', async () => { + const deps = baseDeps(); + const state = { + ...structuredClone(EMPTY_STATE), + execution: {status: 'pending-verification', targetTag: 'v2.7.3', fromSha: 'abc', deadlineAt: '2026-05-08T10:00:00Z'} as const, + bootCount: 3, + }; + const r = checkPendingVerification(state, {...deps, rollbackHealthCheckSeconds: 60}); + expect(r.armed).toBe(false); + // Rollback ran; exit 75 was called once we hit the end of performRollback. 
+ await vi.runAllTimersAsync(); + expect(deps.exit).toHaveBeenCalledWith(75); + }); + + it('timer expiry triggers rollback when markVerified is never called', async () => { + const deps = baseDeps(); + const state = { + ...structuredClone(EMPTY_STATE), + execution: {status: 'pending-verification', targetTag: 'v2.7.3', fromSha: 'abc', deadlineAt: '2026-05-08T10:00:00Z'} as const, + bootCount: 0, + }; + const r = checkPendingVerification(state, {...deps, rollbackHealthCheckSeconds: 60}); + expect(r.armed).toBe(true); + await vi.advanceTimersByTimeAsync(60_000); + expect(deps.exit).toHaveBeenCalledWith(75); + }); +}); + +describe('performRollback', () => { + it('happy path: restores lockfile, checkout from-sha, pnpm install, exit 75, status=rolled-back', async () => { + const deps = baseDeps(); + const state = { + ...structuredClone(EMPTY_STATE), + execution: {status: 'rolling-back', reason: 'install-failed', targetTag: 'v2.7.3', fromSha: 'abc', at: '2026-05-08T10:00:00Z'} as const, + bootCount: 0, + }; + await performRollback(state, {...deps, rollbackHealthCheckSeconds: 60}); + expect(deps.copyFile).toHaveBeenCalledWith( + '/srv/etherpad/var/update-backup/pnpm-lock.yaml', + '/srv/etherpad/pnpm-lock.yaml', + ); + expect(deps.saveState).toHaveBeenLastCalledWith(expect.objectContaining({ + execution: expect.objectContaining({status: 'rolled-back'}), + lastResult: expect.objectContaining({outcome: 'rolled-back'}), + })); + expect(deps.exit).toHaveBeenCalledWith(75); + }); + + it('rollback failure lands on rollback-failed (terminal)', async () => { + const deps = baseDeps(); + let i = 0; + deps.spawnFn = vi.fn(() => ({ + stdout: {on: () => {}}, stderr: {on: () => {}}, + on: (e: string, cb: any) => e === 'close' && setTimeout(() => cb(i++ === 0 ? 
0 : 1), 0), + })) as any; + const state = { + ...structuredClone(EMPTY_STATE), + execution: {status: 'rolling-back', reason: 'install-failed', targetTag: 'v2.7.3', fromSha: 'abc', at: '2026-05-08T10:00:00Z'} as const, + bootCount: 0, + }; + await performRollback(state, {...deps, rollbackHealthCheckSeconds: 60}); + expect(deps.saveState).toHaveBeenLastCalledWith(expect.objectContaining({ + execution: expect.objectContaining({status: 'rollback-failed'}), + lastResult: expect.objectContaining({outcome: 'rollback-failed'}), + })); + expect(deps.exit).toHaveBeenCalledWith(75); + }); +}); +``` + +- [ ] **Step 2: Run — fail** + +Run: `pnpm vitest run src/tests/backend-new/specs/updater/RollbackHandler.test.ts` +Expected: FAIL (module missing). + +- [ ] **Step 3: Implement** + +Create `src/node/updater/RollbackHandler.ts`: + +```typescript +import path from 'node:path'; +import log4js from 'log4js'; +import {SpawnOptions} from 'node:child_process'; +import {UpdateState} from './types'; +import type {SpawnFn} from './UpdateExecutor'; + +const logger = log4js.getLogger('updater'); + +export interface RollbackDeps { + repoDir: string; + backupDir: string; + spawnFn: SpawnFn; + copyFile: (src: string, dst: string) => Promise; + saveState: (s: UpdateState) => Promise; + exit: (code: number) => void; + now: () => Date; + rollbackHealthCheckSeconds: number; +} + +const runStep = (spawnFn: SpawnFn, cwd: string, cmd: string, args: string[]): + Promise => new Promise((resolve) => { + const child = spawnFn(cmd, args, {cwd, stdio: ['ignore', 'pipe', 'pipe']}); + child.stdout.on('data', (b: Buffer) => logger.info(`[${cmd}] ${b.toString().trimEnd()}`)); + child.stderr.on('data', (b: Buffer) => logger.warn(`[${cmd}] ${b.toString().trimEnd()}`)); + child.on('close', (c) => resolve(c)); +}); + +/** Restore the previous SHA + lockfile. Lands on `rolled-back` on success, `rollback-failed` on any sub-step error. Always exits 75 so the supervisor restarts on a known state. 
/**
 * Roll the checkout back to the pre-update commit.
 *
 * Steps, in order: copy `pnpm-lock.yaml` from `deps.backupDir` into
 * `deps.repoDir`, `git checkout <fromSha>`, then `pnpm install --frozen-lockfile`.
 * When all of them succeed the persisted state becomes `rolled-back`; the first
 * failing step makes it `rollback-failed` instead. `deps.exit(75)` is called on
 * both outcomes — including when persisting the final state itself fails — so
 * the process never keeps running on a half-restored checkout.
 *
 * @param state Current updater state; `state.execution.status` must be
 *   `rolling-back` or `pending-verification`.
 * @param deps Injected side effects (spawn, copyFile, saveState, exit, clock).
 * @throws Error if `state.execution.status` is any other value.
 */
export const performRollback = async (state: UpdateState, deps: RollbackDeps): Promise<void> => {
  const exec = state.execution;
  if (exec.status !== 'rolling-back' && exec.status !== 'pending-verification') {
    throw new Error(`performRollback called from unexpected status: ${exec.status}`);
  }
  const fromSha = (exec as {fromSha: string}).fromSha;
  const targetTag = (exec as {targetTag: string}).targetTag;
  // A pending-verification state carries no reason of its own; it only reaches
  // this function via the health-check timer or the crash-loop guard.
  const reason = exec.status === 'rolling-back' ? exec.reason : 'health-check-failed-or-crash-loop';
  const describe = (err: unknown): string => (err instanceof Error ? err.message : String(err));

  /** Runs the restore steps; resolves to a failure description, or null on success. */
  const restore = async (): Promise<string | null> => {
    try {
      await deps.copyFile(path.join(deps.backupDir, 'pnpm-lock.yaml'), path.join(deps.repoDir, 'pnpm-lock.yaml'));
    } catch (err) {
      return `copy lockfile: ${describe(err)}`;
    }
    try {
      const checkoutCode = await runStep(deps.spawnFn, deps.repoDir, 'git', ['checkout', fromSha]);
      if (checkoutCode !== 0) return `git checkout ${fromSha} exit ${checkoutCode}`;

      const installCode = await runStep(deps.spawnFn, deps.repoDir, 'pnpm', ['install', '--frozen-lockfile']);
      if (installCode !== 0) return `pnpm install exit ${installCode}`;
    } catch (err) {
      // runStep never rejects on a non-zero exit code; landing here means
      // spawnFn itself threw before the child was started.
      return `spawn: ${describe(err)}`;
    }
    return null;
  };

  const subReason = await restore();
  const at = deps.now().toISOString();
  const next: UpdateState = subReason === null
    ? {
      ...state,
      execution: {status: 'rolled-back', reason, targetTag, restoredSha: fromSha, at},
      lastResult: {targetTag, fromSha, outcome: 'rolled-back', reason, at},
      bootCount: 0,
    }
    : {
      ...state,
      execution: {status: 'rollback-failed', reason: `${reason}; rollback also failed: ${subReason}`, targetTag, fromSha, at},
      lastResult: {targetTag, fromSha, outcome: 'rollback-failed', reason: `${reason}; rollback failed: ${subReason}`, at},
      bootCount: 0,
    };

  try {
    await deps.saveState(next);
  } catch (err) {
    // Still fall through to exit: staying alive with files restored but the
    // old process image running is worse than restarting with stale state.
    logger.error(`rollback: could not persist final state: ${describe(err)}`);
  }

  if (subReason === null) {
    logger.warn(`rolled back to ${fromSha} (reason: ${reason})`);
  } else {
    logger.error(`rollback FAILED: ${subReason}; manual intervention required (POST /admin/update/acknowledge after fixing)`);
  }
  deps.exit(75);
};

export interface CheckResult {
  /** True if a health-check timer was armed and is awaiting markVerified or expiry. */
  armed: boolean;
  /** Cancels the timer and transitions to `verified`. No-op when armed is false. */
  markVerified: () => void;
}

/**
 * Inspect the persisted execution state at boot and react:
 *  - any status other than pending-verification: no-op.
 *  - pending-verification with bootCount > 2: force rollback (crash-loop guard).
 *  - pending-verification otherwise: increment bootCount, persist, arm a timer
 *    of `deps.rollbackHealthCheckSeconds` seconds that rolls back on expiry.
 *
 * The timer and `markVerified` are mutually exclusive: whichever runs first
 * wins and the other becomes a no-op, so a late `markVerified` can never write
 * `verified` over a rollback that has already started.
 *
 * @param state Updater state as loaded at boot.
 * @param deps Injected side effects shared with performRollback.
 * @returns Whether a timer was armed, plus the callback that cancels it.
 */
export const checkPendingVerification = (state: UpdateState, deps: RollbackDeps): CheckResult => {
  const exec = state.execution;
  if (exec.status !== 'pending-verification') return {armed: false, markVerified: () => {}};

  // Both helpers are fire-and-forget from the caller's point of view, so they
  // must swallow (and log) failures instead of leaving an unhandled rejection.
  const persist = async (next: UpdateState): Promise<void> => {
    try {
      await deps.saveState(next);
    } catch (err) {
      logger.error(`updater: failed to persist state: ${err instanceof Error ? err.message : String(err)}`);
    }
  };
  const rollBack = async (from: UpdateState): Promise<void> => {
    try {
      await performRollback(from, deps);
    } catch (err) {
      logger.error(`updater: rollback threw: ${err instanceof Error ? err.message : String(err)}`);
    }
  };

  if (state.bootCount > 2) {
    // Don't await — fire and forget so boot proceeds and exit happens asynchronously.
    void rollBack(state);
    return {armed: false, markVerified: () => {}};
  }

  const incremented: UpdateState = {...state, bootCount: state.bootCount + 1};
  void persist(incremented);

  let settled = false;
  const timer = setTimeout(() => {
    if (settled) return;
    settled = true; // from here on markVerified must not touch the state
    void rollBack({
      ...incremented,
      execution: {status: 'rolling-back', reason: 'health-check-timeout', targetTag: exec.targetTag, fromSha: exec.fromSha, at: deps.now().toISOString()},
    });
  }, deps.rollbackHealthCheckSeconds * 1000);

  return {
    armed: true,
    markVerified: () => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      const at = deps.now().toISOString();
      void persist({
        ...incremented,
        execution: {status: 'verified', targetTag: exec.targetTag, verifiedAt: at},
        lastResult: {targetTag: exec.targetTag, fromSha: exec.fromSha, outcome: 'verified', reason: null, at},
        bootCount: 0,
      });
      logger.info(`update verified after restart: ${exec.fromSha} -> ${exec.targetTag}`);
    },
  };
};
```

- [ ] **Step 4: Run — pass**

Run: `pnpm vitest run 
src/tests/backend-new/specs/updater/RollbackHandler.test.ts` +Expected: PASS (5 tests). + +- [ ] **Step 5: Commit** + +```bash +git add src/node/updater/RollbackHandler.ts src/tests/backend-new/specs/updater/RollbackHandler.test.ts +git commit -m "$(cat <<'EOF' +feat(updater): RollbackHandler — health-check timer + crash-loop guard + +checkPendingVerification arms a 60s health-check timer at boot when state +is pending-verification, increments bootCount, and forces an immediate +rollback when bootCount>2 (crash-loop guard). performRollback restores the +lockfile and SHA, re-runs pnpm install, and lands on rolled-back or the +terminal rollback-failed state on sub-step failure. Both paths exit 75 so +the supervisor restarts cleanly on the new known state. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task 9: Wire RollbackHandler into the boot sequence + +**Files:** +- Modify: `src/node/updater/index.ts` +- Modify: `src/node/server.ts` (call `markBootHealthy` once the server is `RUNNING`) +- Modify: `src/node/hooks/express/updateStatus.ts` (extend status endpoint with execution + lastResult + lockHeld) +- Test: `src/tests/backend-new/specs/updater/index-boot.test.ts` + +Boot sequence addition: after `detectInstallMethod` and before `startPolling`, run `checkPendingVerification`. Stash the returned `CheckResult` so that `markBootHealthy` — called from `server.ts` once Etherpad is `RUNNING` — can invoke its `markVerified`. 
+ +- [ ] **Step 1: Failing test** + +Create `src/tests/backend-new/specs/updater/index-boot.test.ts`: + +```typescript +import {describe, it, expect, beforeEach, afterEach, vi} from 'vitest'; + +describe('updater boot wiring', () => { + beforeEach(() => { vi.resetModules(); }); + afterEach(() => { vi.resetModules(); }); + + it('calls checkPendingVerification with the loaded state', async () => { + const calls: any[] = []; + vi.doMock('../../../../node/updater/RollbackHandler', () => ({ + checkPendingVerification: (s: any) => { calls.push(s); return {armed: false, markVerified: () => {}}; }, + performRollback: vi.fn(), + })); + vi.doMock('../../../../node/updater/InstallMethodDetector', () => ({ + detectInstallMethod: vi.fn(async () => 'git'), + })); + vi.doMock('../../../../node/updater/state', () => ({ + loadState: vi.fn(async () => ({schemaVersion: 1, execution: {status: 'idle'}, bootCount: 0, lastResult: null, + lastCheckAt: null, lastEtag: null, latest: null, vulnerableBelow: [], + email: {severeAt: null, vulnerableAt: null, vulnerableNewReleaseTag: null}})), + saveState: vi.fn(async () => {}), + })); + vi.doMock('../../../../node/utils/Settings', () => ({ + default: {root: '/srv/etherpad', updates: {tier: 'manual', githubRepo: 'ether/etherpad', checkIntervalHours: 6, installMethod: 'auto', rollbackHealthCheckSeconds: 60}, adminEmail: null}, + getEpVersion: () => '2.7.2', + })); + const updater = await import('../../../../node/updater'); + await updater.expressCreateServer(); + expect(calls).toHaveLength(1); + await updater.shutdown(); + }); +}); +``` + +- [ ] **Step 2: Run — fail** + +Run: `pnpm vitest run src/tests/backend-new/specs/updater/index-boot.test.ts` +Expected: FAIL. 
+ +- [ ] **Step 3: Wire it up** + +In `src/node/updater/index.ts`, add the import and the boot hook: + +```typescript +import {spawn} from 'node:child_process'; +import fs from 'node:fs/promises'; +import {checkPendingVerification, performRollback, CheckResult} from './RollbackHandler'; +import {ensureUpdateLogAppender} from './updateLog'; + +let pendingVerification: CheckResult | null = null; + +const rollbackDeps = () => ({ + repoDir: settings.root, + backupDir: path.join(settings.root, 'var', 'update-backup'), + spawnFn: spawn as unknown as import('./UpdateExecutor').SpawnFn, + copyFile: (src: string, dst: string) => fs.copyFile(src, dst), + saveState: (s: UpdateState) => saveState(stateFilePath(), s), + exit: (code: number) => process.exit(code), + now: () => new Date(), + rollbackHealthCheckSeconds: Number(settings.updates.rollbackHealthCheckSeconds) || 60, +}); +``` + +Replace `expressCreateServer` with: + +```typescript +export const expressCreateServer = async (): Promise => { + ensureUpdateLogAppender(path.join(settings.root, 'var', 'log', 'update.log')); + detectedMethod = await detectInstallMethod({ + override: settings.updates.installMethod, + rootDir: settings.root, + }); + logger.info(`updater: install method = ${detectedMethod}, tier = ${settings.updates.tier}`); + + const state = await getCurrentState(); + pendingVerification = checkPendingVerification(state, rollbackDeps()); + + if (settings.updates.tier !== 'off') startPolling(); +}; + +/** Called by the Etherpad runtime once the express stack is fully wired and /health is up. */ +export const markBootHealthy = (): void => { + if (pendingVerification) { + pendingVerification.markVerified(); + pendingVerification = null; + } +}; + +/** Exposed for routes. 
*/ +export const getRollbackDeps = rollbackDeps; +export const getPendingVerification = () => pendingVerification; +``` + +In `src/node/server.ts`, after the `state = State.RUNNING` line (around line 176), add: + +```typescript +// Once the server is RUNNING, /health responds 200 — that is the implicit health +// signal the updater's pending-verification timer is waiting for. +try { + // eslint-disable-next-line @typescript-eslint/no-var-requires + require('./updater').markBootHealthy(); +} catch (err) { + logger.debug(`markBootHealthy: ${(err as Error).message}`); +} +``` + +In `src/node/hooks/express/updateStatus.ts`, extend the `/admin/update/status` response: + +```typescript +res.json({ + currentVersion: current, + latest: state.latest, + lastCheckAt: state.lastCheckAt, + installMethod, + tier: settings.updates.tier, + policy, + vulnerableBelow: state.vulnerableBelow, + // PR 2 additions: + execution: state.execution, + lastResult: state.lastResult, + lockHeld: await import('../../updater/lock').then((m) => m.isHeld(require('node:path').join(settings.root, 'var', 'update.lock'))), +}); +``` + +- [ ] **Step 4: Run — pass** + +Run: `pnpm vitest run src/tests/backend-new/specs/updater/index-boot.test.ts` +Expected: PASS. + +- [ ] **Step 5: Commit** + +```bash +git add src/node/updater/index.ts src/node/server.ts src/node/hooks/express/updateStatus.ts \ + src/tests/backend-new/specs/updater/index-boot.test.ts +git commit -m "$(cat <<'EOF' +feat(updater): wire RollbackHandler into boot + extend /admin/update/status + +expressCreateServer now invokes checkPendingVerification before polling +starts; server.ts calls markBootHealthy after state hits RUNNING so the +60s health-check timer cancels cleanly when the new version boots fine. +The status endpoint surfaces execution + lastResult + lockHeld so the +admin UI can render Apply / Cancel / Acknowledge state correctly. 
+ +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task 10: Refine UpdatePolicy for terminal-state gating + +**Files:** +- Modify: `src/node/updater/UpdatePolicy.ts` +- Modify: `src/tests/backend-new/specs/updater/UpdatePolicy.test.ts` + +`canAuto` and `canAutonomous` must return false while `execution.status === 'rollback-failed'` (manual remains allowed). + +- [ ] **Step 1: Add failing tests** + +Append to `UpdatePolicy.test.ts`: + +```typescript +describe('terminal-state gating', () => { + it('rollback-failed denies auto/autonomous but allows manual', () => { + const r = evaluatePolicy({ + ...baseInput, tier: 'autonomous', + executionStatus: 'rollback-failed', + }); + expect(r.canManual).toBe(true); + expect(r.canAuto).toBe(false); + expect(r.canAutonomous).toBe(false); + expect(r.reason).toBe('rollback-failed-terminal'); + }); + + it('idle execution does not affect canManual/canAuto', () => { + const r = evaluatePolicy({...baseInput, tier: 'autonomous', executionStatus: 'idle'}); + expect(r.canManual).toBe(true); + expect(r.canAuto).toBe(true); + expect(r.canAutonomous).toBe(true); + }); +}); +``` + +- [ ] **Step 2: Run — fail** + +Run: `pnpm vitest run src/tests/backend-new/specs/updater/UpdatePolicy.test.ts` +Expected: FAIL. + +- [ ] **Step 3: Update implementation** + +In `src/node/updater/UpdatePolicy.ts`: + +```typescript +export interface PolicyInput { + installMethod: Exclude; + tier: Tier; + current: string; + latest: string; + /** Optional — when known. Only `rollback-failed` materially changes policy. 
*/ + executionStatus?: string; +} + +export const evaluatePolicy = ({installMethod, tier, current, latest, executionStatus}: PolicyInput): PolicyResult => { + if (tier === 'off') { + return {canNotify: false, canManual: false, canAuto: false, canAutonomous: false, reason: 'tier-off'}; + } + if (compareSemver(current, latest) >= 0) { + return {canNotify: false, canManual: false, canAuto: false, canAutonomous: false, reason: 'up-to-date'}; + } + const canNotify = true; + const writable = WRITABLE_METHODS.has(installMethod); + if (!writable) { + return {canNotify, canManual: false, canAuto: false, canAutonomous: false, reason: 'install-method-not-writable'}; + } + const terminal = executionStatus === 'rollback-failed'; + return { + canNotify, + canManual: tier === 'manual' || tier === 'auto' || tier === 'autonomous', + canAuto: !terminal && (tier === 'auto' || tier === 'autonomous'), + canAutonomous: !terminal && tier === 'autonomous', + reason: terminal ? 'rollback-failed-terminal' : 'ok', + }; +}; +``` + +Also update the `updateStatus.ts` call to pass `executionStatus: state.execution.status`. + +- [ ] **Step 4: Run — pass** + +Run: `pnpm vitest run src/tests/backend-new/specs/updater/UpdatePolicy.test.ts` +Expected: PASS (existing + 2 new). + +- [ ] **Step 5: Commit** + +```bash +git add src/node/updater/UpdatePolicy.ts src/node/hooks/express/updateStatus.ts \ + src/tests/backend-new/specs/updater/UpdatePolicy.test.ts +git commit -m "$(cat <<'EOF' +feat(updater): UpdatePolicy honours rollback-failed terminal state + +canAuto/canAutonomous are denied while execution.status === 'rollback-failed'; +canManual stays on because an admin clicking Apply *is* the intervention the +terminal state requires. Status endpoint passes execution.status through so +the admin UI sees the right policy result. 
+ +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task 11: Apply / Cancel / Acknowledge / Log endpoints + +**Files:** +- Create: `src/node/hooks/express/updateActions.ts` +- Modify: `src/node/hooks/express/admin.ts` if a hook-registration list lives there (not required if hooks are loaded via `ep.json` — see Step 3) +- Modify: `src/ep.json` to register the new hook +- Test: `src/tests/backend/specs/updateActions.ts` (mocha integration) + +All four endpoints (apply, cancel, acknowledge, log) enforce strict admin auth — unlike `/admin/update/status`, which is read-only and intentionally loose. Every request, whether a POST or the GET for the log, requires an authenticated `is_admin` session. + +- [ ] **Step 1: Find the right hook registration site** + +```bash +grep -rnE "updateStatus|updater/index" src/node/utils/Settings.ts src/node/server.ts src/node/hooks src/ep.json src/static/js/pluginfw 2>/dev/null +cat src/ep.json +``` + +PR 1 registered `updater/index.ts:expressCreateServer` and `hooks/express/updateStatus:expressCreateServer` in `src/ep.json`. Add `hooks/express/updateActions:expressCreateServer` in the same array. 
+ +- [ ] **Step 2: Failing test (mocha)** + +Create `src/tests/backend/specs/updateActions.ts`: + +```typescript +'use strict'; + +const assert = require('assert').strict; +const common = require('../common'); +const plugins = require('../../../static/js/pluginfw/plugin_defs'); +import settings from '../../../node/utils/Settings'; +import {saveState} from '../../../node/updater/state'; +import {EMPTY_STATE} from '../../../node/updater/types'; +import path from 'node:path'; + +const statePath = () => path.join(settings.root, 'var', 'update-state.json'); +const authHookNames = ['preAuthorize', 'authenticate', 'authorize']; +const failHookNames = ['preAuthzFailure', 'authnFailure', 'authzFailure', 'authFailure']; + +const installAdminAuth = () => { + for (const h of authHookNames.concat(failHookNames)) plugins.hooks[h] = []; + plugins.hooks.authenticate = [{ + hook_fn: (_n: string, ctx: any, cb: Function) => { + ctx.req.session.user = {is_admin: true}; + cb([true]); + }, + }]; + (settings as any).requireAuthentication = true; + (settings as any).requireAuthorization = false; + (settings as any).users = {admin: {password: 'admin-pw', is_admin: true}}; +}; + +describe(__filename, function () { + let agent: any; + const backups: Record = {}; + + before(async () => { agent = await common.init(); }); + + beforeEach(async () => { + backups.hooks = {}; + for (const n of authHookNames.concat(failHookNames)) backups.hooks[n] = plugins.hooks[n]; + backups.settings = {}; + for (const k of ['requireAuthentication', 'requireAuthorization', 'users']) backups.settings[k] = (settings as any)[k]; + await saveState(statePath(), { + ...EMPTY_STATE, + latest: {version: '99.0.0', tag: 'v99.0.0', body: 'release', publishedAt: '2099-01-01T00:00:00Z', prerelease: false, htmlUrl: 'https://example/'}, + }); + }); + + afterEach(() => { + Object.assign(plugins.hooks, backups.hooks); + Object.assign(settings, backups.settings); + }); + + describe('POST /admin/update/apply', () => { + it('rejects 
unauthenticated', async () => { + await agent.post('/admin/update/apply').expect(401); + }); + + it('rejects when policy denies (non-git install method)', async () => { + installAdminAuth(); + const orig = settings.updates.installMethod; + settings.updates.installMethod = 'docker'; + try { + await agent.post('/admin/update/apply').auth('admin', 'admin-pw').expect(409); + } finally { settings.updates.installMethod = orig; } + }); + + it('rejects when an execution is already in flight', async () => { + installAdminAuth(); + await saveState(statePath(), { + ...EMPTY_STATE, + latest: {version: '99.0.0', tag: 'v99.0.0', body: '', publishedAt: '', prerelease: false, htmlUrl: ''}, + execution: {status: 'executing', targetTag: 'v99.0.0', fromSha: 'x', startedAt: '2026-05-08T00:00:00Z'}, + }); + await agent.post('/admin/update/apply').auth('admin', 'admin-pw').expect(409); + }); + }); + + describe('POST /admin/update/cancel', () => { + it('rejects when nothing is running (409)', async () => { + installAdminAuth(); + await agent.post('/admin/update/cancel').auth('admin', 'admin-pw').expect(409); + }); + }); + + describe('POST /admin/update/acknowledge', () => { + it('clears a terminal state to idle', async () => { + installAdminAuth(); + await saveState(statePath(), { + ...EMPTY_STATE, + execution: {status: 'rollback-failed', reason: 'install-failed; rollback failed: pnpm exit 1', targetTag: 'v99.0.0', fromSha: 'x', at: '2026-05-08T00:00:00Z'}, + lastResult: {targetTag: 'v99.0.0', fromSha: 'x', outcome: 'rollback-failed', reason: 'pnpm install failed', at: '2026-05-08T00:00:00Z'}, + }); + await agent.post('/admin/update/acknowledge').auth('admin', 'admin-pw').expect(200); + const status = await agent.get('/admin/update/status').expect(200); + assert.equal(status.body.execution.status, 'idle'); + }); + + it('refuses to clear a non-terminal state (409)', async () => { + installAdminAuth(); + await saveState(statePath(), {...EMPTY_STATE}); + await 
agent.post('/admin/update/acknowledge').auth('admin', 'admin-pw').expect(409); + }); + }); + + describe('GET /admin/update/log', () => { + it('requires admin auth', async () => { + await agent.get('/admin/update/log').expect(401); + }); + + it('returns 200 with text body for an admin', async () => { + installAdminAuth(); + const res = await agent.get('/admin/update/log').auth('admin', 'admin-pw').expect(200); + assert.equal(typeof res.text, 'string'); + }); + }); +}); +``` + +- [ ] **Step 3: Implement the route module** + +Create `src/node/hooks/express/updateActions.ts`: + +```typescript +'use strict'; + +import path from 'node:path'; +import fs from 'node:fs/promises'; +import {spawn} from 'node:child_process'; +import log4js from 'log4js'; +import {ArgsExpressType} from '../../types/ArgsExpressType'; +import settings, {getEpVersion} from '../../utils/Settings'; +import {getDetectedInstallMethod, stateFilePath, getRollbackDeps} from '../../updater'; +import {evaluatePolicy} from '../../updater/UpdatePolicy'; +import {loadState, saveState} from '../../updater/state'; +import {acquireLock, releaseLock, isHeld} from '../../updater/lock'; +import {executeUpdate} from '../../updater/UpdateExecutor'; +import {createDrainer} from '../../updater/SessionDrainer'; +import {runPreflight} from '../../updater/preflight'; +import {verifyReleaseTag} from '../../updater/trustedKeys'; +import {tailLines} from '../../updater/updateLog'; +import {UpdateState} from '../../updater/types'; + +const logger = log4js.getLogger('updater'); +const lockPath = () => path.join(settings.root, 'var', 'update.lock'); +const logPath = () => path.join(settings.root, 'var', 'log', 'update.log'); +const backupDir = () => path.join(settings.root, 'var', 'update-backup'); + +let drainer: ReturnType | null = null; + +const requireAdmin = (req: any, res: any): boolean => { + const u = req.session?.user; + if (!u) { res.status(401).send('Authentication required'); return false; } + if (!u.is_admin) { 
res.status(403).send('Forbidden'); return false; } + return true; +}; + +const wrapAsync = (fn: (req: any, res: any, next: Function) => Promise) => + (req: any, res: any, next: Function) => Promise.resolve(fn(req, res, next)).catch(next); + +const broadcastShout = (key: string, values: Record): void => { + // Use the existing shout pipeline via socket.io. PR 1 uses io.sockets.emit('shout', ...). + // We re-import lazily to dodge a require-cycle with the socketio hook. + try { + // eslint-disable-next-line @typescript-eslint/no-var-requires + const {io} = require('../socketio'); + if (!io) return; + io.sockets.emit('shout', { + type: 'COLLABROOM', + data: {type: 'shoutMessage', payload: {message: {message: key, values, sticky: false}, timestamp: Date.now()}}, + }); + } catch (err) { + logger.warn(`broadcastShout: ${(err as Error).message}`); + } +}; + +export const expressCreateServer = ( + _hookName: string, + {app}: ArgsExpressType, + cb: Function, +): void => { + if (settings.updates.tier === 'off') return cb(); + + app.post('/admin/update/apply', wrapAsync(async (req, res) => { + if (!requireAdmin(req, res)) return; + + const state = await loadState(stateFilePath()); + if (!state.latest) return res.status(409).json({error: 'no-known-latest'}); + if (state.execution.status !== 'idle' && state.execution.status !== 'verified' && + !state.execution.status.startsWith('rolled-back') && state.execution.status !== 'preflight-failed') { + return res.status(409).json({error: `execution-busy:${state.execution.status}`}); + } + + const installMethod = getDetectedInstallMethod(); + const policy = evaluatePolicy({ + installMethod, tier: settings.updates.tier, + current: getEpVersion(), latest: state.latest.version, + executionStatus: state.execution.status, + }); + if (!policy.canManual) return res.status(409).json({error: 'policy-denied', reason: policy.reason}); + + if (!await acquireLock(lockPath())) return res.status(409).json({error: 'lock-held'}); + + try { + // 
Preflight + const targetTag = state.latest.tag; + const startedAt = new Date().toISOString(); + const preState: UpdateState = {...state, execution: {status: 'preflight', targetTag, startedAt}}; + await saveState(stateFilePath(), preState); + + const pf = await runPreflight( + {targetTag, diskSpaceMinMB: settings.updates.diskSpaceMinMB, + requireSignature: settings.updates.requireSignature, + trustedKeysPath: settings.updates.trustedKeysPath}, + { + installMethod, + workingTreeClean: () => new Promise((resolve) => { + const c = spawn('git', ['status', '--porcelain'], {cwd: settings.root}); + let out = ''; + c.stdout.on('data', (b) => { out += b.toString(); }); + c.on('close', () => resolve(out.trim().length === 0)); + }), + freeDiskMB: async () => { + const {statfs} = await import('node:fs/promises'); + try { + const s = await (statfs as any)(settings.root); + return Math.floor((s.bavail * s.bsize) / (1024 * 1024)); + } catch { return Number.POSITIVE_INFINITY; } // fall back to "no constraint" if statfs unsupported + }, + pnpmOnPath: () => new Promise((resolve) => { + const c = spawn('pnpm', ['--version'], {stdio: 'ignore'}); + c.on('close', (code) => resolve(code === 0)); + c.on('error', () => resolve(false)); + }), + lockHeld: async () => false, // we just acquired it + remoteHasTag: (tag) => new Promise((resolve) => { + const c = spawn('git', ['ls-remote', '--tags', 'origin', tag], {cwd: settings.root, stdio: ['ignore', 'pipe', 'ignore']}); + let out = ''; + c.stdout.on('data', (b) => { out += b.toString(); }); + c.on('close', () => resolve(out.trim().length > 0)); + c.on('error', () => resolve(false)); + }), + verifyTag: () => verifyReleaseTag({ + tag: targetTag, repoDir: settings.root, + requireSignature: settings.updates.requireSignature, + trustedKeysPath: settings.updates.trustedKeysPath, + }), + }, + ); + + if (!pf.ok) { + const at = new Date().toISOString(); + await saveState(stateFilePath(), { + ...preState, + execution: {status: 'preflight-failed', 
targetTag, reason: pf.reason, at}, + lastResult: {targetTag, fromSha: '', outcome: 'preflight-failed', reason: pf.reason, at}, + }); + await releaseLock(lockPath()); + return res.status(409).json({error: 'preflight-failed', reason: pf.reason}); + } + + // Drain + drainer = createDrainer({ + drainSeconds: Number(settings.updates.drainSeconds) || 60, + broadcast: (key, values) => broadcastShout(key, values), + }); + const drainEndsAt = new Date(Date.now() + (Number(settings.updates.drainSeconds) || 60) * 1000).toISOString(); + await saveState(stateFilePath(), { + ...preState, + execution: {status: 'draining', targetTag, drainEndsAt, startedAt: new Date().toISOString()}, + }); + + // Respond before drain completes — UI polls /admin/update/status + /log. + res.status(202).json({accepted: true, drainEndsAt}); + + const drainResult = await drainer.start(); + drainer = null; + if (drainResult.outcome === 'cancelled') { + // The /admin/update/cancel handler already wrote state.execution=idle and + // lastResult=cancelled. Don't overwrite it here — just release the lock + // and return; the supervisor doesn't need to restart. + await releaseLock(lockPath()); + return; + } + + const fresh = await loadState(stateFilePath()); + await executeUpdate({ + repoDir: settings.root, + backupDir: backupDir(), + spawnFn: spawn as any, + readSha: () => new Promise((resolve, reject) => { + const c = spawn('git', ['rev-parse', 'HEAD'], {cwd: settings.root, stdio: ['ignore', 'pipe', 'ignore']}); + let out = ''; + c.stdout.on('data', (b) => { out += b.toString(); }); + c.on('close', (code) => code === 0 ? 
resolve(out.trim()) : reject(new Error(`git rev-parse exit ${code}`))); + c.on('error', reject); + }), + copyFile: (src, dst) => fs.mkdir(path.dirname(dst), {recursive: true}).then(() => fs.copyFile(src, dst)), + saveState: (s) => saveState(stateFilePath(), s), + initialState: fresh, + targetTag, + now: () => new Date(), + exit: (code) => process.exit(code), + }); + // executeUpdate either calls process.exit(75) (pending-verification) or returns + // on a failure path. Failure paths are handled by the next process boot via + // RollbackHandler's pending-verification check + the rolling-back path inside performRollback. + // If we reach here, the failure path was hit and we need to perform rollback now. + const afterExec = await loadState(stateFilePath()); + if (afterExec.execution.status === 'rolling-back') { + const {performRollback} = await import('../../updater/RollbackHandler'); + await performRollback(afterExec, getRollbackDeps()); + } + await releaseLock(lockPath()); + } catch (err) { + logger.error(`apply failed: ${(err as Error).stack || err}`); + try { await releaseLock(lockPath()); } catch {/* noop */} + if (!res.headersSent) res.status(500).json({error: 'internal'}); + } + })); + + app.post('/admin/update/cancel', wrapAsync(async (req, res) => { + if (!requireAdmin(req, res)) return; + const state = await loadState(stateFilePath()); + // Cancel is allowed only during pre-execute states. Once executing begins (lockfile/SHA mutated) + // we either complete or rollback. Spec section "Error handling" / state machine. + if (state.execution.status !== 'preflight' && state.execution.status !== 'draining') { + return res.status(409).json({error: 'not-cancellable', status: state.execution.status}); + } + if (drainer) drainer.cancel(); + await saveState(stateFilePath(), {...state, execution: {status: 'idle'}, lastResult: { + targetTag: (state.execution as any).targetTag ?? 
'', + fromSha: '', + outcome: 'cancelled', + reason: 'admin-cancelled', + at: new Date().toISOString(), + }}); + try { await releaseLock(lockPath()); } catch {/* noop */} + res.json({cancelled: true}); + })); + + app.post('/admin/update/acknowledge', wrapAsync(async (req, res) => { + if (!requireAdmin(req, res)) return; + const state = await loadState(stateFilePath()); + const terminal = ['rollback-failed', 'preflight-failed', 'rolled-back']; + if (!terminal.some((t) => state.execution.status === t)) { + return res.status(409).json({error: 'not-terminal', status: state.execution.status}); + } + await saveState(stateFilePath(), {...state, execution: {status: 'idle'}, bootCount: 0}); + res.json({acknowledged: true}); + })); + + app.get('/admin/update/log', wrapAsync(async (req, res) => { + if (!requireAdmin(req, res)) return; + const lines = await tailLines(logPath(), 200); + res.set('Content-Type', 'text/plain; charset=utf-8'); + res.send(lines.join('\n')); + })); + + // Lock-held probe so isHeld is reachable. Status endpoint already calls this. + void isHeld; + + cb(); +}; +``` + +In `src/ep.json`, add the new hook (find the existing `expressCreateServer` block listing `updateStatus` and append): + +```json +{ + "expressCreateServer": [ + "ep_etherpad-lite/node/updater/index", + "ep_etherpad-lite/node/hooks/express/updateStatus", + "ep_etherpad-lite/node/hooks/express/updateActions" + ] +} +``` + +(Adjust the array structure to match the actual `ep.json` format — likely each hook is a separate object. Verify with `cat src/ep.json` first.) + +- [ ] **Step 4: Run — pass** + +```bash +pnpm run ts-check +pnpm run test -- --grep updateActions +``` + +Expected: TS clean, mocha PASS. 
+ +- [ ] **Step 5: Commit** + +```bash +git add src/node/hooks/express/updateActions.ts src/ep.json src/tests/backend/specs/updateActions.ts +git commit -m "$(cat <<'EOF' +feat(updater): apply / cancel / acknowledge / log endpoints + +Strict admin-only POSTs that drive Tier 2's manual-click flow: +- /admin/update/apply: acquire lock, preflight, drain 60s, execute, exit 75 +- /admin/update/cancel: cancel a pre-execute state, release lock +- /admin/update/acknowledge: clear terminal states (preflight-failed, + rolled-back, rollback-failed) back to idle +- /admin/update/log: tail var/log/update.log for the in-progress UI + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task 12: Admin UI — Apply / Cancel / Acknowledge buttons + +**Files:** +- Modify: `admin/src/pages/UpdatePage.tsx` +- Modify: `admin/src/store/store.ts` +- Modify: `src/locales/en.json` + +- [ ] **Step 1: Extend the store** + +In `admin/src/store/store.ts`, extend `UpdateStatusPayload`: + +```typescript +export type Execution = + | {status: 'idle'} + | {status: 'preflight'; targetTag: string; startedAt: string} + | {status: 'preflight-failed'; targetTag: string; reason: string; at: string} + | {status: 'draining'; targetTag: string; drainEndsAt: string; startedAt: string} + | {status: 'executing'; targetTag: string; fromSha: string; startedAt: string} + | {status: 'pending-verification'; targetTag: string; fromSha: string; deadlineAt: string} + | {status: 'verified'; targetTag: string; verifiedAt: string} + | {status: 'rolling-back'; reason: string; targetTag: string; fromSha: string; at: string} + | {status: 'rolled-back'; reason: string; targetTag: string; restoredSha: string; at: string} + | {status: 'rollback-failed'; reason: string; targetTag: string; fromSha: string; at: string}; + +export interface UpdateStatusPayload { + // ...existing fields... 
+ execution: Execution; + lastResult: null | { + targetTag: string; fromSha: string; + outcome: 'verified' | 'rolled-back' | 'rollback-failed' | 'preflight-failed' | 'cancelled'; + reason: string | null; at: string; + }; + lockHeld: boolean; +} +``` + +Add a log slice: + +```typescript +type StoreState = { + // ...existing... + updateLog: string; + setUpdateLog: (log: string) => void; +}; +// in create(): +updateLog: '', +setUpdateLog: (log) => set({updateLog: log}), +``` + +- [ ] **Step 2: Replace `UpdatePage.tsx`** + +Replace the `return` block of `UpdatePage` so the `ok` path renders Apply/Cancel/Acknowledge per `execution.status`: + +```tsx +const apply = async () => { + await fetch('/admin/update/apply', {method: 'POST', credentials: 'same-origin'}); + // Re-fetch status — server returned 202, the actual transition happened in the background. + const r = await fetch('/admin/update/status', {credentials: 'same-origin'}); + if (r.ok) setUpdateStatus(await r.json()); +}; +const cancel = async () => { + await fetch('/admin/update/cancel', {method: 'POST', credentials: 'same-origin'}); + const r = await fetch('/admin/update/status', {credentials: 'same-origin'}); + if (r.ok) setUpdateStatus(await r.json()); +}; +const acknowledge = async () => { + await fetch('/admin/update/acknowledge', {method: 'POST', credentials: 'same-origin'}); + const r = await fetch('/admin/update/status', {credentials: 'same-origin'}); + if (r.ok) setUpdateStatus(await r.json()); +}; + +const status = us.execution.status; +const showApply = us.policy?.canManual && (status === 'idle' || status === 'verified' || status.startsWith('rolled-back') || status === 'preflight-failed') && !us.lockHeld; +const showCancel = status === 'preflight' || status === 'draining'; +const showAcknowledge = status === 'preflight-failed' || status === 'rolled-back' || status === 'rollback-failed'; + +return ( +
+

+
+ {/* ...existing dl entries... */} +
+
{t(`update.execution.${status}`, {defaultValue: status})}
+
+ {us.lastResult && ( +

+ +

+ )} + {us.policy && !us.policy.canManual && ( +

+ +

+ )} +
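+    {showApply && <button onClick={apply}><Trans i18nKey="update.page.apply"/></button>}
+    {showCancel && <button onClick={cancel}><Trans i18nKey="update.page.cancel"/></button>}
+    {showAcknowledge && <button onClick={acknowledge}><Trans i18nKey="update.page.acknowledge"/></button>}
+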
+ {/* changelog block — keep as in PR 1 */} +
+); +``` + +- [ ] **Step 3: Add the i18n keys** + +In `src/locales/en.json`, add: + +```json + "update.page.apply": "Apply update", + "update.page.cancel": "Cancel", + "update.page.acknowledge": "Acknowledge", + "update.page.execution": "Status", + "update.page.policy.install-method-not-writable": "Updates from the admin UI require a git install. Update via your package manager.", + "update.page.policy.rollback-failed-terminal": "A previous update failed and could not be rolled back. Manual intervention required; press Acknowledge to clear the lock once the install is healthy.", + "update.page.policy.up-to-date": "You are running the latest version.", + "update.page.policy.tier-off": "Updates are disabled (updates.tier = \"off\").", + "update.page.last_result.verified": "Last update to {{tag}} verified.", + "update.page.last_result.rolled-back": "Last attempted update to {{tag}} rolled back: {{reason}}.", + "update.page.last_result.rollback-failed": "Last update attempt failed AND rollback failed: {{reason}}. Manual intervention required.", + "update.page.last_result.preflight-failed": "Last attempted update to {{tag}} failed preflight: {{reason}}.", + "update.page.last_result.cancelled": "Last attempted update to {{tag}} cancelled by admin.", + "update.execution.idle": "Idle", + "update.execution.preflight": "Pre-flight checks", + "update.execution.preflight-failed": "Pre-flight failed", + "update.execution.draining": "Draining sessions", + "update.execution.executing": "Updating...", + "update.execution.pending-verification": "Pending verification", + "update.execution.verified": "Verified", + "update.execution.rolling-back": "Rolling back", + "update.execution.rolled-back": "Rolled back", + "update.execution.rollback-failed": "Rollback failed", + "update.banner.terminal.rollback-failed": "An update attempt failed and could not be rolled back. 
Manual intervention required.", + "update.drain.t60": "Etherpad will restart in 60 seconds to apply an update.", + "update.drain.t30": "Etherpad will restart in 30 seconds to apply an update.", + "update.drain.t10": "Etherpad will restart in 10 seconds to apply an update." +``` + +- [ ] **Step 4: Build the admin UI and visit it locally** + +```bash +pnpm install # ensure admin deps in case anything is missing +pnpm --filter admin run build +pnpm run dev -- --port 9003 & +# In a browser: http://localhost.lan:9003/admin/update — log in as admin +# Verify the Apply button renders when latest version differs from current +kill %1 +``` + +> Don't kill the apply manually after pressing it on a real install — the update will actually run. Use `pnpm run dev` in a disposable worktree if you want to test the full apply path. + +- [ ] **Step 5: Commit** + +```bash +git add admin/src/pages/UpdatePage.tsx admin/src/store/store.ts src/locales/en.json +git commit -m "$(cat <<'EOF' +feat(updater): admin UI Apply/Cancel/Acknowledge buttons + +UpdatePage renders the right action set per execution.status, surfaces +lastResult with localised copy, and shows policy denial reasons (e.g. +install-method-not-writable, rollback-failed-terminal). Buttons round- +trip status through /admin/update/status after each action. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task 13: Admin UI — log stream view + +**Files:** +- Modify: `admin/src/pages/UpdatePage.tsx` + +While `execution.status === 'preflight' | 'draining' | 'executing' | 'rolling-back'`, poll `/admin/update/log` once a second and render the tail in a `
<pre>`. Stop polling when the status leaves the set.
+
+- [ ] **Step 1: Add the polling effect**
+
+Inside `UpdatePage`, after the existing `useEffect` for `/admin/update/status`, add:
+
+```tsx
+const log = useStore((s) => s.updateLog);
+const setLog = useStore((s) => s.setUpdateLog);
+const inFlight = ['preflight', 'draining', 'executing', 'rolling-back'].includes(us?.execution?.status ?? '');
+useEffect(() => {
+  if (!inFlight) return;
+  let cancelled = false;
+  const tick = async () => {
+    if (cancelled) return;
+    try {
+      const r = await fetch('/admin/update/log', {credentials: 'same-origin'});
+      if (r.ok) setLog(await r.text());
+      // Re-fetch status too so we know when to stop polling.
+      const s = await fetch('/admin/update/status', {credentials: 'same-origin'});
+      if (s.ok) setUpdateStatus(await s.json());
+    } catch {/* noop */}
+    if (!cancelled) setTimeout(tick, 1000);
+  };
+  tick();
+  return () => { cancelled = true; };
+}, [inFlight, setLog, setUpdateStatus]);
+```
+
+In the JSX:
+
+```tsx
+{inFlight && (
+  <section>
+    <h2><Trans i18nKey="update.page.log"/></h2>
+    <pre>{log}</pre>
+  </section>
+)} +``` + +- [ ] **Step 2: Add i18n key** + +In `src/locales/en.json`: + +```json + "update.page.log": "Update log (last 200 lines)" +``` + +- [ ] **Step 3: Smoke test in a browser** + +Same workflow as Task 12 step 4. Trigger an Apply on a git checkout that's safe to update (e.g., a disposable worktree). Watch the log block populate. + +- [ ] **Step 4: Commit** + +```bash +git add admin/src/pages/UpdatePage.tsx src/locales/en.json +git commit -m "$(cat <<'EOF' +feat(updater): admin UI streams update log while update is in flight + +While execution.status is preflight/draining/executing/rolling-back the +page polls /admin/update/log + /admin/update/status once a second, +showing the rolling tail and switching off automatically when the run +terminates. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task 14: Pad-side drain announcement + +**Files:** +- Modify: `src/static/js/chat.js` or `src/static/js/pad.js` (whichever handles incoming `shoutMessage`) +- Modify: `src/locales/en.json` (already done in Task 12 — verify keys exist) + +`broadcastShout` in Task 11 sends a shoutMessage payload of the form `{message: {message: 'update.drain.t60', values: {seconds: 60}}, ...}`. The pad client renders shouts via the existing chat pipeline. We need that pipeline to look up `payload.message.message` as a translation key when present and substitute `payload.message.values`. + +- [ ] **Step 1: Find the shout-rendering site** + +```bash +grep -rn "shoutMessage\|payload.message" src/static/js/ | head -20 +``` + +Locate the function that turns the COLLABROOM shoutMessage into chat text. In Etherpad core that lives in `src/static/js/pad.js` or `src/static/js/chat.js` — search for `shoutMessage`. 
+ +- [ ] **Step 2: Extend the renderer to handle i18n keys** + +Wrap the existing logic so `if (typeof payload.message.message === 'string' && payload.message.message.startsWith('update.drain.'))` is rendered through `html10n.translations` lookup; otherwise fall back to current behaviour. Concrete patch (adapt to actual code): + +```javascript +// existing: +// const text = payload.message.message; +// becomes: +const raw = payload.message.message; +const values = payload.message.values || {}; +let text = raw; +if (typeof raw === 'string' && raw.startsWith('update.drain.') && window.html10n && window.html10n.translations) { + const tpl = window.html10n.translations[raw]; + if (typeof tpl === 'string') { + text = tpl.replace(/\{\{(\w+)\}\}/g, (_, k) => String(values[k] ?? '')); + } +} +``` + +(`html10n.get(raw, values)` is the bound API but `window._` is unbound per memory `project_plugin_window_underscore_audit.md` — go through `window.html10n.translations` directly to dodge that bug.) + +- [ ] **Step 3: Add a Playwright test** + +In `src/tests/frontend-new/specs/`, add a spec that opens a pad, simulates a shout from the admin socket via the existing admin shout test pattern (`grep -rn "shout" src/tests/frontend-new/`) — if no harness exists, skip this Playwright test and rely on the manual smoke step below. **Do not write a fake test.** + +- [ ] **Step 4: Manual smoke test** + +```bash +pnpm run dev -- --port 9003 & +# Open http://localhost.lan:9003/p/test-drain in one tab +# In another tab, log in to /admin and use the Shout feature to send "update.drain.t60" +# Verify the pad shows "Etherpad will restart in 60 seconds..." +kill %1 +``` + +If the manual test fails — i.e., the pad shows the literal key — adjust the renderer in step 2 until the pad shows the localised string. Per memory `feedback_test_localized_strings`, do not declare done while the literal key shows. 
+ +- [ ] **Step 5: Commit** + +```bash +git add src/static/js/chat.js src/static/js/pad.js +git commit -m "$(cat <<'EOF' +feat(updater): pad shoutMessage renders update.drain.* via html10n + +When the executor's drain phase broadcasts update.drain.t60/t30/t10, +pads render the localised string instead of the bare i18n key. Goes +through html10n.translations directly to dodge the unbound window._ +bug documented in project_plugin_window_underscore_audit. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task 15: Integration test — end-to-end against a tmp git repo + +**Files:** +- Create: `src/tests/backend/specs/updater-integration.ts` + +This is the highest-value test in the plan: it runs `executeUpdate` against a real tmp git repo, verifying happy path + each rollback variant by stubbing only the steps that would mutate the *current* install (we replace `pnpm install` with a `bash -c 'exit 0'` and similar). The test is deliberately heavy — run it on its own, not in the unit-test loop. 
+ +- [ ] **Step 1: Skeleton failing test** + +Create `src/tests/backend/specs/updater-integration.ts`: + +```typescript +'use strict'; + +const assert = require('assert').strict; +import {execSync, spawn} from 'node:child_process'; +import fs from 'node:fs/promises'; +import os from 'node:os'; +import path from 'node:path'; +import {executeUpdate} from '../../../node/updater/UpdateExecutor'; +import {performRollback, checkPendingVerification} from '../../../node/updater/RollbackHandler'; +import {EMPTY_STATE} from '../../../node/updater/types'; + +const sh = (cmd: string, opts: any = {}) => execSync(cmd, {stdio: 'pipe', ...opts}).toString().trim(); + +const buildTmpRepo = async (): Promise => { + const dir = await fs.mkdtemp(path.join(os.tmpdir(), 'updater-it-')); + sh('git init -b main', {cwd: dir}); + sh('git config user.email test@example.com', {cwd: dir}); + sh('git config user.name test', {cwd: dir}); + await fs.writeFile(path.join(dir, 'pnpm-lock.yaml'), 'lockfileVersion: x\n'); + sh('git add . && git commit -m initial', {cwd: dir}); + sh('git tag v0.0.1', {cwd: dir}); + await fs.writeFile(path.join(dir, 'pnpm-lock.yaml'), 'lockfileVersion: y\n'); + sh('git add . && git commit -m bump', {cwd: dir}); + sh('git tag v0.0.2', {cwd: dir}); + // executor expects an "origin" — point it at the same dir for the ls-remote check. + sh(`git remote add origin ${dir}`, {cwd: dir}); + return dir; +}; + +const stubSpawn = (overrides: Record = {}) => { + // Emulate spawn for everything by mapping (cmd, args) -> exit code. + return ((cmd: string, args: string[]) => { + const key = `${cmd} ${args.join(' ')}`; + const exit = overrides[key] ?? (cmd === 'pnpm' ? 0 : -1); // -1 means "use real git" + if (exit === -1) { + // Real git for this step. 
+ const real = spawn(cmd, args, {cwd: (overrides as any).__cwd, stdio: ['ignore', 'pipe', 'pipe']}); + return real; + } + return { + stdout: {on: () => {}}, stderr: {on: () => {}}, + on: (e: string, cb: any) => e === 'close' && setImmediate(() => cb(exit)), + } as any; + }) as any; +}; + +describe(__filename, function () { + this.timeout(20_000); + + it('happy path: executes against tmp repo, lands on pending-verification', async () => { + const repo = await buildTmpRepo(); + const states: any[] = []; + let exited: number | null = null; + const r = await executeUpdate({ + repoDir: repo, + backupDir: path.join(repo, 'var', 'update-backup'), + spawnFn: stubSpawn({'pnpm install --frozen-lockfile': 0, 'pnpm run build:ui': 0, __cwd: repo} as any), + readSha: async () => sh('git rev-parse HEAD', {cwd: repo}), + copyFile: (s, d) => fs.mkdir(path.dirname(d), {recursive: true}).then(() => fs.copyFile(s, d)), + saveState: async (s) => { states.push(structuredClone(s)); }, + initialState: structuredClone(EMPTY_STATE), + targetTag: 'v0.0.2', + now: () => new Date(), + exit: (code) => { exited = code; }, + }); + assert.equal(r.outcome, 'pending-verification'); + assert.equal(exited, 75); + assert.equal(states.at(-1).execution.status, 'pending-verification'); + // Backup file exists. + await fs.access(path.join(repo, 'var', 'update-backup', 'pnpm-lock.yaml')); + await fs.rm(repo, {recursive: true, force: true}); + }); + + it('install failure rolls back to original SHA', async () => { + const repo = await buildTmpRepo(); + const original = sh('git rev-parse HEAD', {cwd: repo}); + let exited: number | null = null; + const states: any[] = []; + + // Phase 1: executor with failing install. 
+ await executeUpdate({ + repoDir: repo, backupDir: path.join(repo, 'var', 'update-backup'), + spawnFn: stubSpawn({'pnpm install --frozen-lockfile': 1, __cwd: repo} as any), + readSha: async () => sh('git rev-parse HEAD', {cwd: repo}), + copyFile: (s, d) => fs.mkdir(path.dirname(d), {recursive: true}).then(() => fs.copyFile(s, d)), + saveState: async (s) => { states.push(structuredClone(s)); }, + initialState: structuredClone(EMPTY_STATE), + targetTag: 'v0.0.2', + now: () => new Date(), + exit: (c) => { exited = c; }, + }); + assert.equal(states.at(-1).execution.status, 'rolling-back'); + + // Phase 2: rollback. + await performRollback(states.at(-1), { + repoDir: repo, backupDir: path.join(repo, 'var', 'update-backup'), + spawnFn: stubSpawn({'pnpm install --frozen-lockfile': 0, __cwd: repo} as any), + copyFile: (s, d) => fs.copyFile(s, d), + saveState: async (s) => { states.push(structuredClone(s)); }, + exit: (c) => { exited = c; }, + now: () => new Date(), + rollbackHealthCheckSeconds: 60, + }); + assert.equal(states.at(-1).execution.status, 'rolled-back'); + assert.equal(sh('git rev-parse HEAD', {cwd: repo}), original); + assert.equal(exited, 75); + await fs.rm(repo, {recursive: true, force: true}); + }); + + // Add: build-failure rollback (same as install-failure but with build:ui exit 1). + // Add: crash-loop guard (state.bootCount = 3 forces immediate rollback in checkPendingVerification). +}); +``` + +- [ ] **Step 2: Run — confirm fail / pass** + +Run: `pnpm run test -- --grep updater-integration` +Expected: PASS for the two scenarios above; if not, debug — typical issues are `git ls-remote --tags` against a self-origin which needs `git push origin v0.0.2` first; add it inside `buildTmpRepo`. 
+ +- [ ] **Step 3: Add the build-failure + crash-loop scenarios** + +Append: + +```typescript + it('build failure rolls back to original SHA', async () => { /* same as install but spawnFn returns build:ui=1, install=0 */ }); + + it('crash-loop guard forces rollback when bootCount > 2', async () => { + const repo = await buildTmpRepo(); + const original = sh('git rev-parse HEAD', {cwd: repo}); + sh('git checkout v0.0.2', {cwd: repo}); + // pretend we're already on v0.0.2 (post-update boot) and the lockfile backup exists. + await fs.mkdir(path.join(repo, 'var', 'update-backup'), {recursive: true}); + await fs.copyFile(path.join(repo, 'pnpm-lock.yaml'), path.join(repo, 'var', 'update-backup', 'pnpm-lock.yaml')); + sh(`git checkout ${original}`, {cwd: repo}); + sh(`cp var/update-backup/pnpm-lock.yaml pnpm-lock.yaml`, {cwd: repo}); + sh('git checkout v0.0.2', {cwd: repo}); + + let exited: number | null = null; + const states: any[] = []; + const state = { + ...structuredClone(EMPTY_STATE), + execution: {status: 'pending-verification', targetTag: 'v0.0.2', fromSha: original, deadlineAt: '2026-05-08T10:00:00Z'} as const, + bootCount: 3, + }; + const r = checkPendingVerification(state, { + repoDir: repo, backupDir: path.join(repo, 'var', 'update-backup'), + spawnFn: stubSpawn({'pnpm install --frozen-lockfile': 0, __cwd: repo} as any), + copyFile: (s, d) => fs.copyFile(s, d), + saveState: async (s) => { states.push(structuredClone(s)); }, + exit: (c) => { exited = c; }, + now: () => new Date(), + rollbackHealthCheckSeconds: 60, + }); + assert.equal(r.armed, false); + // Wait a tick for the async rollback to finish. 
+ await new Promise((r) => setImmediate(r)); + assert.equal(states.at(-1).execution.status, 'rolled-back'); + assert.equal(sh('git rev-parse HEAD', {cwd: repo}), original); + assert.equal(exited, 75); + await fs.rm(repo, {recursive: true, force: true}); + }); +``` + +- [ ] **Step 4: Run all integration tests** + +Run: `pnpm run test -- --grep "updater-integration|updateActions|updateStatus"` +Expected: PASS for everything. + +- [ ] **Step 5: Commit** + +```bash +git add src/tests/backend/specs/updater-integration.ts +git commit -m "$(cat <<'EOF' +test(updater): integration suite over a tmp git repo + +Exercises executeUpdate + performRollback + checkPendingVerification +end-to-end against a disposable git repo with two tagged commits: +happy path -> pending-verification, install-fail rollback, build-fail +rollback, crash-loop bootCount>2 forced rollback. Runs with mocha at +20s timeout; no real pnpm/network. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task 16: Playwright spec — admin Apply flow + +**Files:** +- Create: `src/tests/frontend-new/admin-spec/update-page-actions.spec.ts` + +The Playwright spec stubs the network: it intercepts `/admin/update/status` to seed a fake `latest`, intercepts `/admin/update/apply` to return `202`, and verifies the UI transitions through the right buttons. We do *not* actually run an update — that's covered by the manual smoke runbook. 
+ +- [ ] **Step 1: Failing spec** + +Create `src/tests/frontend-new/admin-spec/update-page-actions.spec.ts`: + +```typescript +import {expect, test} from '@playwright/test'; + +const baseStatus = { + currentVersion: '2.7.1', + latest: {version: '2.7.2', tag: 'v2.7.2', body: 'release notes', publishedAt: '2026-05-01T00:00:00Z', prerelease: false, htmlUrl: 'https://example/'}, + lastCheckAt: '2026-05-08T00:00:00Z', + installMethod: 'git', + tier: 'manual', + policy: {canNotify: true, canManual: true, canAuto: false, canAutonomous: false, reason: 'ok'}, + vulnerableBelow: [], + execution: {status: 'idle'}, + lastResult: null, + lockHeld: false, +}; + +test('admin Apply button posts to /admin/update/apply and re-fetches status', async ({page}) => { + let posted = false; + await page.route('**/admin/update/status', (route) => route.fulfill({json: baseStatus})); + await page.route('**/admin/update/apply', (route) => { posted = true; route.fulfill({status: 202, json: {accepted: true}}); }); + await page.goto('/admin/update'); + await expect(page.getByRole('button', {name: /apply update/i})).toBeVisible(); + await page.getByRole('button', {name: /apply update/i}).click(); + await expect.poll(() => posted).toBe(true); +}); + +test('install-method-not-writable hides Apply and shows the policy reason', async ({page}) => { + const denied = {...baseStatus, installMethod: 'docker', + policy: {canNotify: true, canManual: false, canAuto: false, canAutonomous: false, reason: 'install-method-not-writable'}}; + await page.route('**/admin/update/status', (route) => route.fulfill({json: denied})); + await page.goto('/admin/update'); + await expect(page.getByRole('button', {name: /apply update/i})).toHaveCount(0); + await expect(page.getByText(/Updates from the admin UI require a git install/i)).toBeVisible(); +}); + +test('rollback-failed shows Acknowledge button', async ({page}) => { + const terminal = {...baseStatus, + execution: {status: 'rollback-failed', reason: 'pnpm install 
failed; rollback failed: pnpm exit 1', targetTag: 'v2.7.2', fromSha: 'x', at: '2026-05-08T00:00:00Z'}, + lastResult: {targetTag: 'v2.7.2', fromSha: 'x', outcome: 'rollback-failed', reason: 'pnpm install failed', at: '2026-05-08T00:00:00Z'}}; + await page.route('**/admin/update/status', (route) => route.fulfill({json: terminal})); + await page.goto('/admin/update'); + await expect(page.getByRole('button', {name: /acknowledge/i})).toBeVisible(); +}); +``` + +- [ ] **Step 2: Run** + +```bash +pnpm run test-ui -- src/tests/frontend-new/admin-spec/update-page-actions.spec.ts +``` + +Expected: PASS. + +- [ ] **Step 3: Commit** + +```bash +git add src/tests/frontend-new/admin-spec/update-page-actions.spec.ts +git commit -m "$(cat <<'EOF' +test(updater): Playwright admin Apply flow + policy denial + acknowledge + +Stubs /admin/update/status and /admin/update/apply at the route level so +we can assert UI transitions (button visibility, policy-denial copy, +terminal-state acknowledge) without actually running an update. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task 17: Banner copy for terminal states + +**Files:** +- Modify: `admin/src/components/UpdateBanner.tsx` + +When `execution.status === 'rollback-failed'`, the banner text should be the strong `update.banner.terminal.rollback-failed` copy and link to `/update`. + +- [ ] **Step 1: Patch the banner** + +Replace the JSX so it picks the right key: + +```tsx +if (!updateStatus) return null; +const exec = updateStatus.execution?.status; +if (exec === 'rollback-failed') { + return ( +
+      <Trans i18nKey="update.banner.terminal.rollback-failed"/>{' '}
+      <Link to="/update">{t('update.banner.cta')}</Link>
+
+ ); +} +if (!updateStatus.latest || updateStatus.currentVersion === updateStatus.latest.version) return null; +// existing ok-banner... +``` + +- [ ] **Step 2: Manual visual test** + +Seed the state file (`var/update-state.json`) with `execution.status: 'rollback-failed'` then load `/admin/update`. Confirm the banner copy matches `update.banner.terminal.rollback-failed`, not the literal key. Per memory `feedback_test_localized_strings`, fail the task if the literal key shows. + +- [ ] **Step 3: Commit** + +```bash +git add admin/src/components/UpdateBanner.tsx +git commit -m "$(cat <<'EOF' +feat(updater): admin banner shows rollback-failed terminal state + +When execution.status is rollback-failed, the banner switches to a +role=alert with stronger copy, regardless of whether a new release is +known. Other terminal states (preflight-failed, rolled-back) surface on +the page itself, not the banner — they're informational, not urgent. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task 18: Documentation + smoke runbook + +**Files:** +- Modify: `doc/admin/updates.md` +- Modify: `CHANGELOG.md` +- Create: `docs/superpowers/specs/2026-04-25-auto-update-runbook.md` + +The spec's "Phased rollout / PR 2" entry calls out a runbook ("manual smoke runbook in `docs/superpowers/specs/2026-04-25-auto-update-runbook.md`, run before each tier ships, against a disposable VM"). This task ships it alongside the user-facing docs. + +- [ ] **Step 0: Write the smoke runbook** + +Create `docs/superpowers/specs/2026-04-25-auto-update-runbook.md` covering: + +1. Provisioning a disposable Ubuntu/Debian VM with systemd + a checked-out git install. +2. Setting `updates.tier: "manual"` in `settings.json`. +3. Booting under systemd with `Restart=on-failure` + `RestartSec=5` (sample unit file inline). +4. Forcing a downgrade by `git checkout` of the previous tag, restart, confirm Apply button shows. +5. 
Apply, observe drain broadcasts in a separate pad, observe restart, observe verified state. +6. Forcing rollback: corrupt `pnpm-lock.yaml` between checkout and install (or pin to a tag with a known-broken build), Apply, observe rolled-back state. +7. Forcing rollback-failed: also break the backup lockfile, Apply, observe terminal state and Acknowledge flow. +8. Crash-loop guard: pin a tag whose code throws on boot, Apply, observe bootCount climb to 3 + forced rollback. +9. Sign-off checklist: every observable transition matches `docs/superpowers/specs/2026-04-25-auto-update-design.md` "State machine". + +- [ ] **Step 1: Append Tier 2 section to `doc/admin/updates.md`** + +Document: +- Activation: `updates.tier: "manual"` requires a `git` install. +- Process supervisor required (systemd/pm2/docker restart-policy) — Etherpad exits 75 to trigger restart. +- Apply flow: button → preflight → 60s drain (broadcasts at T-60/-30/-10) → fetch/checkout/install/build → exit → restart → 60s health check. +- Rollback paths: install/build failure, health-check timeout, crash loop (>2 reboots). +- Terminal states: `preflight-failed` and `rolled-back` are informational; `rollback-failed` requires `POST /admin/update/acknowledge` after manual recovery. +- Settings: each new key with default + when to change. +- Signature verification: opt-in via `requireSignature: true`; document GNUPGHOME path. +- What is *not* covered: Tier 3 (auto) and Tier 4 (autonomous) ship later. + +- [ ] **Step 2: Add to `CHANGELOG.md` Unreleased** + +```markdown +### Updater +- Tier 2 (manual click): admins can now apply updates from `/admin/update` on git installs. Requires a process supervisor; the executor exits 75 to trigger restart, and the next boot runs a 60s health check that auto-rolls back on failure. Tags are signature-checked when `updates.requireSignature: true`. 
New settings: `updates.preApplyGraceMinutes`, `drainSeconds`, `rollbackHealthCheckSeconds`, `diskSpaceMinMB`, `requireSignature`, `trustedKeysPath`. +``` + +- [ ] **Step 3: Commit** + +```bash +git add doc/admin/updates.md CHANGELOG.md docs/superpowers/specs/2026-04-25-auto-update-runbook.md +git commit -m "$(cat <<'EOF' +docs(updater): document Tier 2 manual-click flow + smoke runbook + +Adds doc/admin/updates.md Tier 2 section: prerequisites (git install + +process supervisor), Apply flow with timings, rollback paths, terminal +states + acknowledge, signature-verification opt-in. Ships the manual +smoke runbook the design spec calls for: disposable VM, systemd unit, +forced rollback / rollback-failed / crash-loop scenarios. Notes Tier 3/4 +are deferred to follow-up PRs. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task 19: Final sanity sweep + open PR + +**Files:** none (workflow only). + +- [ ] **Step 1: Full type check + tests** + +```bash +pnpm run ts-check +pnpm vitest run src/tests/backend-new/specs/updater +pnpm run test -- --grep "updater|updateActions|updateStatus" +pnpm run test-ui -- src/tests/frontend-new/admin-spec/update-page-actions.spec.ts +pnpm --filter admin run build +``` + +Expected: every step PASS. + +- [ ] **Step 2: Push branch** + +```bash +git push -u origin feat/7607-auto-update-tier2-manual-click +``` + +- [ ] **Step 3: Open PR against `develop`** + +```bash +gh pr create --base develop --title "feat(updater): tier 2 — manual-click update from /admin/update (#7607)" --body "$(cat <<'EOF' +## Summary + +Ships **Tier 2 (manual click)** of the four-tier auto-update design at +`docs/superpowers/specs/2026-04-25-auto-update-design.md`. Builds on PR #7601 +(Tier 1 — notify, merged 2026-05-01). + +- Admins on git installs see an **Apply update** button at `/admin/update`. 
+- Click flow: pre-flight checks → 60s drain (with T-60/-30/-10 pad broadcasts) → `git fetch / checkout / pnpm install --frozen-lockfile / pnpm run build:ui` → exit 75 for the supervisor to restart. +- 60s health-check on the next boot. On crash loop (bootCount > 2) or health-check timeout we restore the prior SHA + lockfile and exit 75 again. +- Terminal `rollback-failed` state surfaces a strong banner; admin clicks **Acknowledge** to clear after manual recovery. +- New settings under `updates.*`: `preApplyGraceMinutes`, `drainSeconds`, `rollbackHealthCheckSeconds`, `diskSpaceMinMB`, `requireSignature`, `trustedKeysPath` (all opt-in / sane defaults). +- Signature verification (`requireSignature`) is opt-in and stub-friendly: false → log warning and pass; true → `git verify-tag ` against the user keyring (or `trustedKeysPath` via `GNUPGHOME`). Etherpad's release process does not yet sign tags consistently — turning on by default would break Tier 2 for everyone, so this is documented as follow-up. + +Tier 3 (auto with grace window) and Tier 4 (autonomous within maintenance window) are out of scope for this PR. + +## Architecture + +- New atomic units under `src/node/updater/`: `lock` (PID file), `trustedKeys` (gpg via git verify-tag), `preflight` (sequenced check pipeline), `UpdateExecutor` (DI-spawn pipeline), `RollbackHandler` (boot health-timer + crash-loop guard), `SessionDrainer` (timed broadcasts + accept-flag), `updateLog` (rolling appender + tail). +- New routes in `src/node/hooks/express/updateActions.ts`: `POST /admin/update/{apply,cancel,acknowledge}`, `GET /admin/update/log` — strict admin auth. +- `RollbackHandler.checkPendingVerification` wires into boot in `src/node/updater/index.ts`; `markBootHealthy` is called from `src/node/server.ts` after state hits `RUNNING`. +- Admin UI: `UpdatePage` renders Apply/Cancel/Acknowledge per `execution.status`, polls `/admin/update/log` while in flight, surfaces lastResult and policy denial copy. 
Banner adds a terminal-state alert variant. +- Pad UI: existing shoutMessage pipeline learns to render `update.drain.t60/t30/t10` keys via `html10n.translations` (avoids the unbound `window._` bug). + +## Test plan + +- [x] `pnpm vitest run src/tests/backend-new/specs/updater` — unit suite (lock, preflight, trustedKeys, UpdateExecutor, RollbackHandler, SessionDrainer, updateLog, drainer-handshake, UpdatePolicy, index-boot, state) +- [x] `pnpm run test --grep updateActions` — mocha API tests for the four new endpoints (auth, policy, terminal-state acknowledge) +- [x] `pnpm run test --grep updater-integration` — end-to-end against a tmp git repo: happy path, install-fail rollback, build-fail rollback, crash-loop forced rollback +- [x] `pnpm run test-ui -- src/tests/frontend-new/admin-spec/update-page-actions.spec.ts` — Playwright Apply / policy denial / Acknowledge +- [x] Manual smoke: drain announcement renders the localised string in a real pad +- [x] `pnpm run ts-check` clean, `pnpm --filter admin run build` clean + +## Notes + +- Process supervisor is a hard requirement for Tier 2. Documented in `doc/admin/updates.md`. +- Tag signature verification is opt-in pending a separate "sign all releases" project. Logged as a warning when skipped. + +🤖 Generated with [Claude Code](https://claude.com/claude-code) +EOF +)" +``` + +- [ ] **Step 4: Wait for CI then check, fix anything that breaks** + +```bash +sleep 30 +gh pr checks --watch +``` + +If a check fails, pull the log, fix, push. Per memory `feedback_check_ci_after_pr`, do not move on with red CI. + +- [ ] **Step 5: Action Qodo review** + +Once Qodo posts review comments, fetch and address each per memory `feedback_qodo_pr_feedback`. + +```bash +gh pr view --comments | head -200 +``` + +--- + +## Self-review checklist (run before declaring this plan ready) + +- [ ] Every spec section under "Tier 2 — manual click", "Error handling", "Phased rollout / PR 2" has a corresponding task. 
+- [ ] Type names / function names are consistent across tasks (e.g., `executeUpdate`, `performRollback`, `checkPendingVerification`, `runPreflight`, `acquireLock`/`releaseLock`/`isHeld`, `createDrainer`, `tailLines`, `verifyReleaseTag`). +- [ ] No "TODO" / "TBD" / "similar to above" / "appropriate validation" placeholder steps. +- [ ] Every `bash` snippet runs without further parameter substitution. +- [ ] Every test step shows the actual test code, not "write a test for this". +- [ ] Every `git commit` step lists the exact files to add and a Conventional-Commits message with the project's standard `Co-Authored-By` footer. +- [ ] Tasks 14 and 17 require a manual visual check; that is documented as a hard gate (per memory `feedback_test_localized_strings`). +- [ ] Tier 3 / 4 are explicitly out of scope. From 7fd3f3ab3c0dcafe196c8ff555eea718359c371f Mon Sep 17 00:00:00 2001 From: John McLear Date: Fri, 8 May 2026 12:02:38 +0100 Subject: [PATCH 02/21] feat(updater): extend state + settings for Tier 2 manual-click Adds ExecutionStatus discriminated union, bootCount, and lastResult to UpdateState, plus the preApplyGraceMinutes/drainSeconds/diskSpaceMinMB/ requireSignature/trustedKeysPath knobs that Tier 2's executor needs. loadState backfills the new fields on Tier 1 state files so existing installs keep working. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- settings.json.docker | 8 +- settings.json.template | 17 +++- src/node/updater/state.ts | 52 +++++++++-- src/node/updater/types.ts | 51 ++++++++++ src/node/utils/Settings.ts | 16 ++++ .../backend-new/specs/updater/state.test.ts | 92 +++++++++++++++++++ 6 files changed, 225 insertions(+), 11 deletions(-) diff --git a/settings.json.docker b/settings.json.docker index 36becc015fd..1147a88ff2d 100644 --- a/settings.json.docker +++ b/settings.json.docker @@ -218,7 +218,13 @@ "installMethod": "docker", "checkIntervalHours": 6, "githubRepo": "ether/etherpad", - "requireAdminForStatus": false + "requireAdminForStatus": false, + "preApplyGraceMinutes": 0, + "drainSeconds": 60, + "rollbackHealthCheckSeconds": 60, + "diskSpaceMinMB": 500, + "requireSignature": false, + "trustedKeysPath": null }, /* diff --git a/settings.json.template b/settings.json.template index 863286addc1..edde9eb7bea 100644 --- a/settings.json.template +++ b/settings.json.template @@ -227,7 +227,22 @@ * endpoint open (the version is already public via /health). Set true to hide * full update detail from non-admins without turning the updater off. */ - "requireAdminForStatus": false + "requireAdminForStatus": false, + /* + * Tier 2+ knobs. Only meaningful at tier "manual" or higher. + * - preApplyGraceMinutes: tier 3 only — countdown before an auto-update applies. + * - drainSeconds: how long to broadcast "restart imminent" before exiting. + * - rollbackHealthCheckSeconds: window after restart for /health to come up. + * - diskSpaceMinMB: pre-flight refuses to start an update without this much free. + * - requireSignature: refuse updates whose tag isn't signed by a trusted key. + * - trustedKeysPath: override the keyring location passed to git verify-tag (GNUPGHOME). 
+ */ + "preApplyGraceMinutes": 0, + "drainSeconds": 60, + "rollbackHealthCheckSeconds": 60, + "diskSpaceMinMB": 500, + "requireSignature": false, + "trustedKeysPath": null }, /* diff --git a/src/node/updater/state.ts b/src/node/updater/state.ts index 05f97e8ab56..6c86d52257c 100644 --- a/src/node/updater/state.ts +++ b/src/node/updater/state.ts @@ -1,6 +1,6 @@ import fs from 'node:fs/promises'; import path from 'node:path'; -import {EMPTY_STATE, UpdateState} from './types'; +import {EMPTY_STATE, EXECUTION_STATUSES, UpdateState} from './types'; const isPlainObject = (v: unknown): v is Record => v !== null && typeof v === 'object' && !Array.isArray(v); @@ -8,6 +8,21 @@ const isPlainObject = (v: unknown): v is Record => const isStringOrNull = (v: unknown): v is string | null => v === null || typeof v === 'string'; +const isValidExecution = (v: unknown): boolean => { + if (!isPlainObject(v)) return false; + return typeof v.status === 'string' && (EXECUTION_STATUSES as readonly string[]).includes(v.status); +}; + +const isValidLastResult = (v: unknown): boolean => { + if (v === null) return true; + if (!isPlainObject(v)) return false; + return typeof v.targetTag === 'string' + && typeof v.fromSha === 'string' + && typeof v.outcome === 'string' + && (v.reason === null || typeof v.reason === 'string') + && typeof v.at === 'string'; +}; + const isValidLatest = (v: unknown): boolean => { if (v === null) return true; if (!isPlainObject(v)) return false; @@ -39,14 +54,23 @@ const isValidEmail = (v: unknown): boolean => { // Validate the full shape so loadState() actually delivers on its "safely // reset on malformed input" contract. Downstream code calls .trim() / semver // parsing on these subfields and would crash on a hand-edited file otherwise. -const isValid = (raw: unknown): raw is UpdateState => { +// +// Tier 2 fields (execution, bootCount, lastResult) MAY be absent on a state +// file written by a Tier 1 install — those are backfilled at load time. 
+// Present-but-malformed values still reject so a hand-edited file with +// e.g. execution.status="totally-bogus" can't poison RollbackHandler. +const isValid = (raw: unknown): raw is Partial & object => { if (!isPlainObject(raw)) return false; - return raw.schemaVersion === 1 - && isStringOrNull(raw.lastCheckAt) - && isStringOrNull(raw.lastEtag) - && isValidLatest(raw.latest) - && isValidVulnerableBelow(raw.vulnerableBelow) - && isValidEmail(raw.email); + if (raw.schemaVersion !== 1) return false; + if (!isStringOrNull(raw.lastCheckAt)) return false; + if (!isStringOrNull(raw.lastEtag)) return false; + if (!isValidLatest(raw.latest)) return false; + if (!isValidVulnerableBelow(raw.vulnerableBelow)) return false; + if (!isValidEmail(raw.email)) return false; + if (raw.execution !== undefined && !isValidExecution(raw.execution)) return false; + if (raw.bootCount !== undefined && typeof raw.bootCount !== 'number') return false; + if (raw.lastResult !== undefined && !isValidLastResult(raw.lastResult)) return false; + return true; }; /** Reads the on-disk state. Returns a fresh empty-state clone when the file is missing, malformed, or has an unknown schemaVersion. Never throws on parse errors. */ @@ -65,7 +89,17 @@ export const loadState = async (filePath: string): Promise => { return structuredClone(EMPTY_STATE); } if (!isValid(parsed)) return structuredClone(EMPTY_STATE); - return parsed; + // Backfill Tier 2 fields on a Tier 1 state file. Spread defaults first, + // parsed second so explicit values win, then explicit fallback for the + // three fields that might be undefined. + const partial = parsed as Partial; + return { + ...structuredClone(EMPTY_STATE), + ...partial, + execution: partial.execution ?? structuredClone(EMPTY_STATE.execution), + bootCount: partial.bootCount ?? 0, + lastResult: partial.lastResult ?? null, + } as UpdateState; }; /** Atomic write via tmp-then-rename. Creates parent directories as needed. 
*/ diff --git a/src/node/updater/types.ts b/src/node/updater/types.ts index d96c8e384cc..130ab02784e 100644 --- a/src/node/updater/types.ts +++ b/src/node/updater/types.ts @@ -45,6 +45,45 @@ export interface EmailSendLog { vulnerableNewReleaseTag: string | null; } +/** + * Discriminated union mirroring the state machine in + * docs/superpowers/specs/2026-04-25-auto-update-design.md (section "State machine"). + * + * `rollback-failed` is the only terminal state that disables auto/autonomous + * attempts globally until POST /admin/update/acknowledge clears it. Manual + * remains permitted because an admin clicking Apply *is* the intervention. + */ +export type ExecutionStatus = + | {status: 'idle'} + | {status: 'preflight'; targetTag: string; startedAt: string} + | {status: 'preflight-failed'; targetTag: string; reason: string; at: string} + | {status: 'draining'; targetTag: string; drainEndsAt: string; startedAt: string} + | {status: 'executing'; targetTag: string; fromSha: string; startedAt: string} + | {status: 'pending-verification'; targetTag: string; fromSha: string; deadlineAt: string} + | {status: 'verified'; targetTag: string; verifiedAt: string} + | {status: 'rolling-back'; reason: string; targetTag: string; fromSha: string; at: string} + | {status: 'rolled-back'; reason: string; targetTag: string; restoredSha: string; at: string} + | {status: 'rollback-failed'; reason: string; targetTag: string; fromSha: string; at: string}; + +/** All recognised execution statuses — used by the state validator. */ +export const EXECUTION_STATUSES = [ + 'idle', 'preflight', 'preflight-failed', 'draining', 'executing', + 'pending-verification', 'verified', 'rolling-back', 'rolled-back', 'rollback-failed', +] as const; + +export type LastUpdateResult = { + /** Tag we were updating to. */ + targetTag: string; + /** SHA we were updating from. Empty string when the run never reached executor (e.g. preflight-failed). */ + fromSha: string; + /** Outcome to surface in admin UI. 
*/ + outcome: 'verified' | 'rolled-back' | 'rollback-failed' | 'preflight-failed' | 'cancelled'; + /** Human-readable reason on non-success. */ + reason: string | null; + /** ISO timestamp when this result was finalised. */ + at: string; +} | null; + export interface UpdateState { /** Schema version of this file. Increment when fields change. */ schemaVersion: 1; @@ -58,6 +97,15 @@ export interface UpdateState { vulnerableBelow: VulnerableBelowDirective[]; /** Email send dedupe state. */ email: EmailSendLog; + /** Current in-flight execution state. Persisted so a restart mid-update reaches RollbackHandler. */ + execution: ExecutionStatus; + /** + * Boot counter that the RollbackHandler increments while a `pending-verification` + * status is live. > 2 means the new version crash-looped; force rollback regardless of timer. + */ + bootCount: number; + /** Most recent terminal outcome, surfaced in admin UI even after `execution` returns to idle. */ + lastResult: LastUpdateResult; } /** Zero-value initial state. Treat as immutable — spread before mutating: `{...EMPTY_STATE, lastCheckAt: x}`. */ @@ -72,4 +120,7 @@ export const EMPTY_STATE: UpdateState = { vulnerableAt: null, vulnerableNewReleaseTag: null, }, + execution: {status: 'idle'}, + bootCount: 0, + lastResult: null, }; diff --git a/src/node/utils/Settings.ts b/src/node/utils/Settings.ts index 3b5e9790f9c..f091577feec 100644 --- a/src/node/utils/Settings.ts +++ b/src/node/utils/Settings.ts @@ -331,6 +331,15 @@ export type SettingsType = { checkIntervalHours: number, githubRepo: string, requireAdminForStatus: boolean, + /** Tier 2+ knobs. Default 0 in PR 2; tier 3 makes preApplyGraceMinutes meaningful. */ + preApplyGraceMinutes: number, + drainSeconds: number, + rollbackHealthCheckSeconds: number, + diskSpaceMinMB: number, + /** When true, refuse updates whose tag is not signed by a trusted key. */ + requireSignature: boolean, + /** Override the OS keyring location (passed to git verify-tag via $GNUPGHOME). 
*/ + trustedKeysPath: string | null, }, adminEmail: string | null, getPublicSettings: () => Pick, @@ -515,6 +524,13 @@ const settings: SettingsType = { // Set true to require an authenticated admin session for the endpoint without // disabling the updater itself. requireAdminForStatus: false, + // Tier 2+ knobs. Only meaningful at tier "manual" or higher. + preApplyGraceMinutes: 0, + drainSeconds: 60, + rollbackHealthCheckSeconds: 60, + diskSpaceMinMB: 500, + requireSignature: false, + trustedKeysPath: null, }, /** * Contact address for admin notifications (updates, future security advisories). diff --git a/src/tests/backend-new/specs/updater/state.test.ts b/src/tests/backend-new/specs/updater/state.test.ts index 266e895fc8c..fc87aac7f66 100644 --- a/src/tests/backend-new/specs/updater/state.test.ts +++ b/src/tests/backend-new/specs/updater/state.test.ts @@ -117,3 +117,95 @@ describe('saveState', () => { expect(data.schemaVersion).toBe(1); }); }); + +describe('Tier 2 state extensions', () => { + it('EMPTY_STATE carries an idle execution block, bootCount 0, no lastResult', () => { + expect(EMPTY_STATE.execution).toEqual({status: 'idle'}); + expect(EMPTY_STATE.bootCount).toBe(0); + expect(EMPTY_STATE.lastResult).toBeNull(); + }); + + it('loadState backfills missing Tier 2 fields on a Tier 1 file', async () => { + // Hand-write a Tier 1 state file (no execution / bootCount / lastResult). + await fs.writeFile(statePath(), JSON.stringify({ + schemaVersion: 1, + lastCheckAt: '2026-05-01T00:00:00Z', + lastEtag: 'W/"abc"', + latest: null, + vulnerableBelow: [], + email: {severeAt: null, vulnerableAt: null, vulnerableNewReleaseTag: null}, + })); + const state = await loadState(statePath()); + expect(state.execution).toEqual({status: 'idle'}); + expect(state.bootCount).toBe(0); + expect(state.lastResult).toBeNull(); + // Tier 1 fields preserved. 
+ expect(state.lastCheckAt).toBe('2026-05-01T00:00:00Z'); + expect(state.lastEtag).toBe('W/"abc"'); + }); + + it('rejects a malformed execution block by resetting to EMPTY_STATE', async () => { + await fs.writeFile(statePath(), JSON.stringify({ + schemaVersion: 1, lastCheckAt: null, lastEtag: null, latest: null, + vulnerableBelow: [], + email: {severeAt: null, vulnerableAt: null, vulnerableNewReleaseTag: null}, + execution: 'not-an-object', + })); + const state = await loadState(statePath()); + expect(state).toEqual(EMPTY_STATE); + }); + + it('rejects an unknown execution status by resetting to EMPTY_STATE', async () => { + await fs.writeFile(statePath(), JSON.stringify({ + schemaVersion: 1, lastCheckAt: null, lastEtag: null, latest: null, + vulnerableBelow: [], + email: {severeAt: null, vulnerableAt: null, vulnerableNewReleaseTag: null}, + execution: {status: 'totally-bogus'}, + })); + const state = await loadState(statePath()); + expect(state).toEqual(EMPTY_STATE); + }); + + it('rejects a non-numeric bootCount by resetting to EMPTY_STATE', async () => { + await fs.writeFile(statePath(), JSON.stringify({ + schemaVersion: 1, lastCheckAt: null, lastEtag: null, latest: null, + vulnerableBelow: [], + email: {severeAt: null, vulnerableAt: null, vulnerableNewReleaseTag: null}, + execution: {status: 'idle'}, + bootCount: 'one', + })); + const state = await loadState(statePath()); + expect(state).toEqual(EMPTY_STATE); + }); + + it('round-trips a pending-verification execution', async () => { + const s = { + ...EMPTY_STATE, + execution: { + status: 'pending-verification' as const, + targetTag: 'v2.7.3', + fromSha: 'abc123', + deadlineAt: '2026-05-08T10:00:00Z', + }, + bootCount: 1, + }; + await saveState(statePath(), s); + const loaded = await loadState(statePath()); + expect(loaded.execution.status).toBe('pending-verification'); + expect(loaded.bootCount).toBe(1); + }); + + it('round-trips a non-null lastResult', async () => { + const s = { + ...EMPTY_STATE, + lastResult: 
{ + targetTag: 'v2.7.3', fromSha: 'abc', + outcome: 'verified' as const, reason: null, + at: '2026-05-08T10:00:00Z', + }, + }; + await saveState(statePath(), s); + const loaded = await loadState(statePath()); + expect(loaded.lastResult?.outcome).toBe('verified'); + }); +}); From a575664d33b85b01b92d2d763518d50bf8df2689 Mon Sep 17 00:00:00 2001 From: John McLear Date: Fri, 8 May 2026 12:03:26 +0100 Subject: [PATCH 03/21] feat(updater): PID-based update.lock with stale-pid reaping Single-flight guard for Tier 2's UpdateExecutor. Atomic O_CREAT|O_EXCL acquire; on EEXIST, sends signal 0 to the recorded PID and reaps if dead. Unparseable / partially-written lock files are treated as stale rather than fatal so a half-written lock from a SIGKILL'd parent doesn't lock the install out forever. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/node/updater/lock.ts | 78 +++++++++++++++++++ .../backend-new/specs/updater/lock.test.ts | 69 ++++++++++++++++ 2 files changed, 147 insertions(+) create mode 100644 src/node/updater/lock.ts create mode 100644 src/tests/backend-new/specs/updater/lock.test.ts diff --git a/src/node/updater/lock.ts b/src/node/updater/lock.ts new file mode 100644 index 00000000000..2dd00e5cbd2 --- /dev/null +++ b/src/node/updater/lock.ts @@ -0,0 +1,78 @@ +import fs from 'node:fs/promises'; +import path from 'node:path'; + +interface LockFile { + pid: number; + at: string; +} + +const isPidLive = (pid: number): boolean => { + try { + process.kill(pid, 0); + return true; + } catch (err: any) { + // ESRCH = no such process (stale). + // EPERM = exists but we can't signal — treat as live (some other user owns it). 
+ return err.code !== 'ESRCH'; + } +}; + +const readIfPresent = async (lockPath: string): Promise => { + let raw: string; + try { + raw = await fs.readFile(lockPath, 'utf8'); + } catch (err: any) { + if (err.code === 'ENOENT') return null; + return null; + } + let parsed: unknown; + try { parsed = JSON.parse(raw); } catch { return null; } + if (!parsed || typeof parsed !== 'object') return null; + const p = parsed as Record; + if (typeof p.pid !== 'number' || typeof p.at !== 'string') return null; + return {pid: p.pid, at: p.at}; +}; + +/** + * Atomic acquire via O_CREAT|O_EXCL. If the file already exists, the holder's + * PID is checked; when dead we reap it and retry once. Returns false on a live + * conflict — the caller is expected to surface "lock-held" to the admin. + */ +export const acquireLock = async (lockPath: string): Promise => { + await fs.mkdir(path.dirname(lockPath), {recursive: true}); + const payload = JSON.stringify({pid: process.pid, at: new Date().toISOString()}); + + const tryCreate = async (): Promise => { + try { + const fh = await fs.open(lockPath, 'wx'); + try { await fh.writeFile(payload); } finally { await fh.close(); } + return true; + } catch (err: any) { + if (err.code === 'EEXIST') return false; + throw err; + } + }; + + if (await tryCreate()) return true; + + const existing = await readIfPresent(lockPath); + if (existing && isPidLive(existing.pid)) return false; + + // Stale or unparseable — reap and retry once. A concurrent reaper may beat us, + // in which case the second tryCreate also returns false (correctly: someone + // else holds it now). + try { await fs.unlink(lockPath); } + catch (err: any) { if (err.code !== 'ENOENT') throw err; } + return tryCreate(); +}; + +export const releaseLock = async (lockPath: string): Promise => { + try { await fs.unlink(lockPath); } + catch (err: any) { if (err.code !== 'ENOENT') throw err; } +}; + +/** True iff the lock file exists *and* the recorded PID is live. Stale locks read as not-held. 
*/ +export const isHeld = async (lockPath: string): Promise => { + const f = await readIfPresent(lockPath); + return !!f && isPidLive(f.pid); +}; diff --git a/src/tests/backend-new/specs/updater/lock.test.ts b/src/tests/backend-new/specs/updater/lock.test.ts new file mode 100644 index 00000000000..adf3c61bf99 --- /dev/null +++ b/src/tests/backend-new/specs/updater/lock.test.ts @@ -0,0 +1,69 @@ +import {describe, it, expect, beforeEach, afterEach} from 'vitest'; +import fs from 'node:fs/promises'; +import path from 'node:path'; +import os from 'node:os'; +import {acquireLock, releaseLock, isHeld} from '../../../../node/updater/lock'; + +describe('update lock', () => { + let dir: string; + let lockPath: string; + + beforeEach(async () => { + dir = await fs.mkdtemp(path.join(os.tmpdir(), 'updater-lock-')); + lockPath = path.join(dir, 'update.lock'); + }); + + afterEach(async () => { + await fs.rm(dir, {recursive: true, force: true}); + }); + + it('acquires and releases', async () => { + expect(await acquireLock(lockPath)).toBe(true); + expect(await isHeld(lockPath)).toBe(true); + await releaseLock(lockPath); + expect(await isHeld(lockPath)).toBe(false); + }); + + it('rejects a second acquire while live', async () => { + expect(await acquireLock(lockPath)).toBe(true); + expect(await acquireLock(lockPath)).toBe(false); + await releaseLock(lockPath); + }); + + it('reaps a stale lock whose PID is gone', async () => { + // Pick a PID that almost certainly does not exist. process.kill(pid, 0) on + // a free PID returns ESRCH which the implementation treats as stale. 
+ await fs.writeFile(lockPath, JSON.stringify({pid: 2147483646, at: new Date().toISOString()})); + expect(await acquireLock(lockPath)).toBe(true); + await releaseLock(lockPath); + }); + + it('treats an unparseable lock file as stale', async () => { + await fs.writeFile(lockPath, 'garbage'); + expect(await acquireLock(lockPath)).toBe(true); + await releaseLock(lockPath); + }); + + it('treats a lock missing required fields as stale', async () => { + await fs.writeFile(lockPath, JSON.stringify({somethingElse: true})); + expect(await acquireLock(lockPath)).toBe(true); + await releaseLock(lockPath); + }); + + it('release is idempotent (no error when file absent)', async () => { + await releaseLock(lockPath); // file never existed + expect(await isHeld(lockPath)).toBe(false); + }); + + it('isHeld returns false for a stale lock', async () => { + await fs.writeFile(lockPath, JSON.stringify({pid: 2147483646, at: new Date().toISOString()})); + expect(await isHeld(lockPath)).toBe(false); + }); + + it('creates parent directory if missing', async () => { + const nested = path.join(dir, 'a', 'b', 'update.lock'); + expect(await acquireLock(nested)).toBe(true); + expect(await isHeld(nested)).toBe(true); + await releaseLock(nested); + }); +}); From 04185f168f7c3182df7c45037c766a93be352b31 Mon Sep 17 00:00:00 2001 From: John McLear Date: Fri, 8 May 2026 12:04:06 +0100 Subject: [PATCH 04/21] =?UTF-8?q?feat(updater):=20verifyReleaseTag=20?= =?UTF-8?q?=E2=80=94=20gpg-via-git=20stub=20for=20Tier=202=20preflight?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Default updates.requireSignature=false: log a warning and return ok with reason=signature-not-required. Set true to make preflight refuse a tag whose signature does not verify under the system keyring (or trustedKeysPath via GNUPGHOME). 
Etherpad's release process does not yet sign tags consistently; turning the check on by default would break Tier 2 for every admin and forcing a release-signing change is out of scope for this PR. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/node/updater/trustedKeys.ts | 53 ++++++++++++ .../specs/updater/trustedKeys.test.ts | 83 +++++++++++++++++++ 2 files changed, 136 insertions(+) create mode 100644 src/node/updater/trustedKeys.ts create mode 100644 src/tests/backend-new/specs/updater/trustedKeys.test.ts diff --git a/src/node/updater/trustedKeys.ts b/src/node/updater/trustedKeys.ts new file mode 100644 index 00000000000..03b9ceff772 --- /dev/null +++ b/src/node/updater/trustedKeys.ts @@ -0,0 +1,53 @@ +import {spawn as realSpawn, SpawnOptions} from 'node:child_process'; +import log4js from 'log4js'; + +const logger = log4js.getLogger('updater'); + +export type SpawnFn = (cmd: string, args: string[], opts: SpawnOptions) => { + on: (event: 'close', cb: (code: number | null) => void) => void; +}; + +export interface VerifyArgs { + tag: string; + repoDir: string; + requireSignature: boolean; + trustedKeysPath: string | null; + /** Override for tests; production callers use the default `child_process.spawn`. */ + spawnFn?: SpawnFn; +} + +export type VerifyResult = + | {ok: true; reason: 'signature-verified' | 'signature-not-required'} + | {ok: false; reason: 'signature-verification-failed'}; + +/** + * Verify a release tag's GPG signature via `git verify-tag `. + * + * With `requireSignature: false` (default) this is a documented no-op: + * Etherpad's release process does not yet sign tags consistently, and + * forcing verification on by default would break Tier 2 for everyone. + * Admins who run their own builds or who pin to signed forks set + * `updates.requireSignature: true` and import the trusted keys into the + * Etherpad user's keyring (or a dedicated keyring at + * `updates.trustedKeysPath`, which is passed to git via $GNUPGHOME). 
+ */ +export const verifyReleaseTag = async (args: VerifyArgs): Promise => { + if (!args.requireSignature) { + logger.warn( + `verifyReleaseTag: signature check skipped (updates.requireSignature=false) for ${args.tag}`, + ); + return {ok: true, reason: 'signature-not-required'}; + } + const spawnFn = args.spawnFn ?? (realSpawn as unknown as SpawnFn); + const env: NodeJS.ProcessEnv = {...process.env}; + if (args.trustedKeysPath) env.GNUPGHOME = args.trustedKeysPath; + const child = spawnFn('git', ['verify-tag', args.tag], { + cwd: args.repoDir, + env, + stdio: 'ignore', + }); + const code: number | null = await new Promise((resolve) => child.on('close', resolve)); + if (code === 0) return {ok: true, reason: 'signature-verified'}; + logger.error(`verifyReleaseTag: git verify-tag ${args.tag} exited ${code}`); + return {ok: false, reason: 'signature-verification-failed'}; +}; diff --git a/src/tests/backend-new/specs/updater/trustedKeys.test.ts b/src/tests/backend-new/specs/updater/trustedKeys.test.ts new file mode 100644 index 00000000000..56d49471fea --- /dev/null +++ b/src/tests/backend-new/specs/updater/trustedKeys.test.ts @@ -0,0 +1,83 @@ +import {describe, it, expect, vi} from 'vitest'; +import {verifyReleaseTag} from '../../../../node/updater/trustedKeys'; + +const fakeChild = (exitCode: number) => ({ + on: (e: string, cb: any) => { if (e === 'close') setImmediate(() => cb(exitCode)); }, +}); + +describe('verifyReleaseTag', () => { + it('returns ok with reason "signature-not-required" when requireSignature is false (no spawn)', async () => { + const spawnFn = vi.fn(); + const r = await verifyReleaseTag({ + tag: 'v2.7.3', + repoDir: '/tmp/x', + requireSignature: false, + trustedKeysPath: null, + spawnFn: spawnFn as any, + }); + expect(r).toEqual({ok: true, reason: 'signature-not-required'}); + expect(spawnFn).not.toHaveBeenCalled(); + }); + + it('returns ok on git verify-tag exit 0', async () => { + const spawnFn = vi.fn(() => fakeChild(0)); + const r = await 
verifyReleaseTag({ + tag: 'v2.7.3', + repoDir: '/tmp/x', + requireSignature: true, + trustedKeysPath: null, + spawnFn: spawnFn as any, + }); + expect(r).toEqual({ok: true, reason: 'signature-verified'}); + expect(spawnFn).toHaveBeenCalledWith( + 'git', + ['verify-tag', 'v2.7.3'], + expect.objectContaining({cwd: '/tmp/x'}), + ); + }); + + it('returns failure on non-zero exit', async () => { + const spawnFn = vi.fn(() => fakeChild(1)); + const r = await verifyReleaseTag({ + tag: 'v2.7.3', + repoDir: '/tmp/x', + requireSignature: true, + trustedKeysPath: null, + spawnFn: spawnFn as any, + }); + expect(r).toEqual({ok: false, reason: 'signature-verification-failed'}); + }); + + it('passes GNUPGHOME when trustedKeysPath is set', async () => { + const calls: any[] = []; + const spawnFn = vi.fn((cmd: string, args: string[], opts: any) => { + calls.push({cmd, args, env: opts.env}); + return fakeChild(0); + }); + await verifyReleaseTag({ + tag: 'v2.7.3', + repoDir: '/tmp/x', + requireSignature: true, + trustedKeysPath: '/srv/etherpad/keys', + spawnFn: spawnFn as any, + }); + expect(calls[0].env.GNUPGHOME).toBe('/srv/etherpad/keys'); + }); + + it('does not set GNUPGHOME when trustedKeysPath is null', async () => { + const calls: any[] = []; + const spawnFn = vi.fn((cmd: string, args: string[], opts: any) => { + calls.push({cmd, args, env: opts.env}); + return fakeChild(0); + }); + delete process.env.GNUPGHOME; + await verifyReleaseTag({ + tag: 'v2.7.3', + repoDir: '/tmp/x', + requireSignature: true, + trustedKeysPath: null, + spawnFn: spawnFn as any, + }); + expect(calls[0].env.GNUPGHOME).toBeUndefined(); + }); +}); From 658e85d8c41aecd0c9021e390e3bc7220dc5402e Mon Sep 17 00:00:00 2001 From: John McLear Date: Fri, 8 May 2026 12:04:43 +0100 Subject: [PATCH 05/21] feat(updater): preflight check pipeline for Tier 2 Pure orchestrator over injected probes for install-method, working tree, disk space, pnpm presence, lock state, remote tag existence and signature verification. 
Cheap-and-definitive checks run first; first failure short-circuits with a typed reason that the route layer will surface in the preflight-failed admin banner. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/node/updater/preflight.ts | 54 +++++++++++++ .../specs/updater/preflight.test.ts | 77 +++++++++++++++++++ 2 files changed, 131 insertions(+) create mode 100644 src/node/updater/preflight.ts create mode 100644 src/tests/backend-new/specs/updater/preflight.test.ts diff --git a/src/node/updater/preflight.ts b/src/node/updater/preflight.ts new file mode 100644 index 00000000000..f0403e186b6 --- /dev/null +++ b/src/node/updater/preflight.ts @@ -0,0 +1,54 @@ +import {InstallMethod} from './types'; +import type {VerifyResult} from './trustedKeys'; + +export type PreflightReason = + | 'install-method-not-writable' + | 'dirty-working-tree' + | 'low-disk-space' + | 'pnpm-not-found' + | 'lock-held' + | 'remote-tag-missing' + | 'signature-verification-failed'; + +export interface PreflightInput { + targetTag: string; + diskSpaceMinMB: number; + requireSignature: boolean; + trustedKeysPath: string | null; +} + +export interface PreflightDeps { + installMethod: Exclude; + workingTreeClean: () => Promise; + freeDiskMB: () => Promise; + pnpmOnPath: () => Promise; + lockHeld: () => Promise; + remoteHasTag: (tag: string) => Promise; + verifyTag: () => Promise; +} + +export type PreflightResult = {ok: true} | {ok: false; reason: PreflightReason}; + +const WRITABLE_METHODS: ReadonlySet> = new Set(['git']); + +/** + * Sequenced preflight: each check is fast and reads the world. Order matters — + * cheap, definitive failures (install method) run before slow ones (network + * tag lookup, gpg). The first failure short-circuits. 
+ */ +export const runPreflight = async ( + input: PreflightInput, + deps: PreflightDeps, +): Promise => { + if (!WRITABLE_METHODS.has(deps.installMethod)) { + return {ok: false, reason: 'install-method-not-writable'}; + } + if (!await deps.workingTreeClean()) return {ok: false, reason: 'dirty-working-tree'}; + if ((await deps.freeDiskMB()) < input.diskSpaceMinMB) return {ok: false, reason: 'low-disk-space'}; + if (!await deps.pnpmOnPath()) return {ok: false, reason: 'pnpm-not-found'}; + if (await deps.lockHeld()) return {ok: false, reason: 'lock-held'}; + if (!await deps.remoteHasTag(input.targetTag)) return {ok: false, reason: 'remote-tag-missing'}; + const sig = await deps.verifyTag(); + if (!sig.ok) return {ok: false, reason: 'signature-verification-failed'}; + return {ok: true}; +}; diff --git a/src/tests/backend-new/specs/updater/preflight.test.ts b/src/tests/backend-new/specs/updater/preflight.test.ts new file mode 100644 index 00000000000..6ea44e24504 --- /dev/null +++ b/src/tests/backend-new/specs/updater/preflight.test.ts @@ -0,0 +1,77 @@ +import {describe, it, expect, vi} from 'vitest'; +import {runPreflight, PreflightDeps} from '../../../../node/updater/preflight'; + +const baseDeps = (): PreflightDeps => ({ + installMethod: 'git', + workingTreeClean: vi.fn(async () => true), + freeDiskMB: vi.fn(async () => 5000), + pnpmOnPath: vi.fn(async () => true), + lockHeld: vi.fn(async () => false), + remoteHasTag: vi.fn(async () => true), + verifyTag: vi.fn(async () => ({ok: true, reason: 'signature-not-required'})), +}); + +const baseInput = { + targetTag: 'v2.7.3', + diskSpaceMinMB: 500, + requireSignature: false, + trustedKeysPath: null as string | null, +}; + +describe('runPreflight', () => { + it('passes when all checks pass', async () => { + const r = await runPreflight(baseInput, baseDeps()); + expect(r).toEqual({ok: true}); + }); + + it('rejects non-writable install methods', async () => { + const r = await runPreflight(baseInput, {...baseDeps(), 
installMethod: 'docker'}); + expect(r).toEqual({ok: false, reason: 'install-method-not-writable'}); + }); + + it('rejects npm install method too (not yet writable)', async () => { + const r = await runPreflight(baseInput, {...baseDeps(), installMethod: 'npm'}); + expect(r).toEqual({ok: false, reason: 'install-method-not-writable'}); + }); + + it('rejects a dirty working tree', async () => { + const r = await runPreflight(baseInput, {...baseDeps(), workingTreeClean: vi.fn(async () => false)}); + expect(r).toEqual({ok: false, reason: 'dirty-working-tree'}); + }); + + it('rejects insufficient disk space', async () => { + const r = await runPreflight(baseInput, {...baseDeps(), freeDiskMB: vi.fn(async () => 100)}); + expect(r).toEqual({ok: false, reason: 'low-disk-space'}); + }); + + it('rejects when pnpm is missing', async () => { + const r = await runPreflight(baseInput, {...baseDeps(), pnpmOnPath: vi.fn(async () => false)}); + expect(r).toEqual({ok: false, reason: 'pnpm-not-found'}); + }); + + it('rejects when the lock is held', async () => { + const r = await runPreflight(baseInput, {...baseDeps(), lockHeld: vi.fn(async () => true)}); + expect(r).toEqual({ok: false, reason: 'lock-held'}); + }); + + it('rejects when the remote tag is missing', async () => { + const r = await runPreflight(baseInput, {...baseDeps(), remoteHasTag: vi.fn(async () => false)}); + expect(r).toEqual({ok: false, reason: 'remote-tag-missing'}); + }); + + it('rejects when signature verification fails', async () => { + const r = await runPreflight(baseInput, { + ...baseDeps(), + verifyTag: vi.fn(async () => ({ok: false, reason: 'signature-verification-failed'})), + }); + expect(r).toEqual({ok: false, reason: 'signature-verification-failed'}); + }); + + it('cheap-check failures short-circuit before slow checks', async () => { + const deps = {...baseDeps(), installMethod: 'docker' as const, + remoteHasTag: vi.fn(async () => true)}; + const r = await runPreflight(baseInput, deps); + 
expect(r.ok).toBe(false); + expect(deps.remoteHasTag).not.toHaveBeenCalled(); + }); +}); From 90b69e9782d1e1c1fb9d24374e39db95e52ba835 Mon Sep 17 00:00:00 2001 From: John McLear Date: Fri, 8 May 2026 12:06:06 +0100 Subject: [PATCH 06/21] feat(updater): rolling update.log helpers (appendLine + tailLines) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Direct file-append + size-based rotation rather than a log4js appender — avoids re-configuring log4js on top of the user's existing logconfig. appendLine creates parents, rotates at 10MB (configurable), keeps 5 backups by default. tailLines reads the last N lines for /admin/update/log. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/node/updater/updateLog.ts | 63 +++++++++++ .../specs/updater/updateLog.test.ts | 102 ++++++++++++++++++ 2 files changed, 165 insertions(+) create mode 100644 src/node/updater/updateLog.ts create mode 100644 src/tests/backend-new/specs/updater/updateLog.test.ts diff --git a/src/node/updater/updateLog.ts b/src/node/updater/updateLog.ts new file mode 100644 index 00000000000..386942beb1a --- /dev/null +++ b/src/node/updater/updateLog.ts @@ -0,0 +1,63 @@ +import fs from 'node:fs/promises'; +import path from 'node:path'; + +const DEFAULT_MAX_BYTES = 10 * 1024 * 1024; +const DEFAULT_BACKUPS = 5; + +/** + * Rotate `` when it exceeds `maxBytes`: + * .{n-1} -> .n (oldest dropped) + * -> .1 + * No-op when the file is missing or under the limit. + */ +export const rotateIfNeeded = async ( + logPath: string, + maxBytes = DEFAULT_MAX_BYTES, + backups = DEFAULT_BACKUPS, +): Promise => { + let size = 0; + try { size = (await fs.stat(logPath)).size; } catch (err: any) { + if (err.code === 'ENOENT') return; + throw err; + } + if (size < maxBytes) return; + + // Drop the oldest. Walk from highest index down so the rename chain lands cleanly. 
+ for (let i = backups - 1; i >= 1; i--) { + const src = `${logPath}.${i}`; + const dst = `${logPath}.${i + 1}`; + try { await fs.rename(src, dst); } + catch (err: any) { if (err.code !== 'ENOENT') throw err; } + } + // Current file becomes .1. + try { await fs.rename(logPath, `${logPath}.1`); } + catch (err: any) { if (err.code !== 'ENOENT') throw err; } +}; + +/** + * Append `line` to ``, rotating first if the file is over the size cap. + * Creates parent directories as needed. The line is newline-terminated; do not + * include a trailing newline in `line`. + */ +export const appendLine = async ( + logPath: string, + line: string, + maxBytes = DEFAULT_MAX_BYTES, + backups = DEFAULT_BACKUPS, +): Promise => { + await fs.mkdir(path.dirname(logPath), {recursive: true}); + await rotateIfNeeded(logPath, maxBytes, backups); + await fs.appendFile(logPath, `${line}\n`); +}; + +/** Read the last `n` newline-separated lines from the active log file. Empty array if missing. */ +export const tailLines = async (logPath: string, n: number): Promise => { + if (n <= 0) return []; + let raw: string; + try { raw = await fs.readFile(logPath, 'utf8'); } + catch (err: any) { if (err.code === 'ENOENT') return []; throw err; } + const stripped = raw.endsWith('\n') ? 
raw.slice(0, -1) : raw; + if (stripped.length === 0) return []; + const all = stripped.split('\n'); + return all.slice(Math.max(0, all.length - n)); +}; diff --git a/src/tests/backend-new/specs/updater/updateLog.test.ts b/src/tests/backend-new/specs/updater/updateLog.test.ts new file mode 100644 index 00000000000..f177a1bddd2 --- /dev/null +++ b/src/tests/backend-new/specs/updater/updateLog.test.ts @@ -0,0 +1,102 @@ +import {describe, it, expect, beforeEach, afterEach} from 'vitest'; +import fs from 'node:fs/promises'; +import path from 'node:path'; +import os from 'node:os'; +import {tailLines} from '../../../../node/updater/updateLog'; + +describe('tailLines', () => { + let dir: string; + let logPath: string; + + beforeEach(async () => { + dir = await fs.mkdtemp(path.join(os.tmpdir(), 'updater-log-')); + logPath = path.join(dir, 'update.log'); + }); + + afterEach(async () => { + await fs.rm(dir, {recursive: true, force: true}); + }); + + it('returns [] when file is missing', async () => { + expect(await tailLines(logPath, 10)).toEqual([]); + }); + + it('returns [] for an empty file', async () => { + await fs.writeFile(logPath, ''); + expect(await tailLines(logPath, 10)).toEqual([]); + }); + + it('returns up to N lines when file is shorter', async () => { + await fs.writeFile(logPath, 'a\nb\nc\n'); + expect(await tailLines(logPath, 10)).toEqual(['a', 'b', 'c']); + }); + + it('returns the last N when file is longer', async () => { + const lines = Array.from({length: 500}, (_, i) => `line-${i}`); + await fs.writeFile(logPath, lines.join('\n') + '\n'); + expect(await tailLines(logPath, 5)).toEqual([ + 'line-495', 'line-496', 'line-497', 'line-498', 'line-499', + ]); + }); + + it('handles a final-line-without-newline', async () => { + await fs.writeFile(logPath, 'a\nb\nc'); + expect(await tailLines(logPath, 10)).toEqual(['a', 'b', 'c']); + }); + + it('handles n=0', async () => { + await fs.writeFile(logPath, 'a\nb\nc\n'); + expect(await tailLines(logPath, 
0)).toEqual([]); + }); +}); + +describe('appendLine + rotation', () => { + let dir: string; + let logPath: string; + + beforeEach(async () => { + dir = await fs.mkdtemp(path.join(os.tmpdir(), 'updater-log-')); + logPath = path.join(dir, 'update.log'); + }); + afterEach(async () => { await fs.rm(dir, {recursive: true, force: true}); }); + + it('appendLine creates parent dir and writes a newline-terminated line', async () => { + const {appendLine} = await import('../../../../node/updater/updateLog'); + const nested = path.join(dir, 'a', 'b', 'update.log'); + await appendLine(nested, 'hello world'); + expect(await fs.readFile(nested, 'utf8')).toBe('hello world\n'); + }); + + it('rotateIfNeeded shifts .1 -> .2, current -> .1 once over the size threshold', async () => { + const {rotateIfNeeded} = await import('../../../../node/updater/updateLog'); + // Force rotation by passing a tiny limit; write a line above the limit. + await fs.writeFile(logPath, 'a'.repeat(50)); + await rotateIfNeeded(logPath, 10, 3); + expect(await fs.readFile(`${logPath}.1`, 'utf8')).toBe('a'.repeat(50)); + // Original file is gone (or empty after rotation). 
+ let exists = true; + try { await fs.access(logPath); } catch { exists = false; } + expect(exists).toBe(false); + }); + + it('rotateIfNeeded preserves up to BACKUPS-1 older backups', async () => { + const {rotateIfNeeded} = await import('../../../../node/updater/updateLog'); + await fs.writeFile(logPath, 'newest'.repeat(20)); + await fs.writeFile(`${logPath}.1`, 'older-1'); + await fs.writeFile(`${logPath}.2`, 'older-2'); + await rotateIfNeeded(logPath, 10, 3); + expect(await fs.readFile(`${logPath}.1`, 'utf8')).toBe('newest'.repeat(20)); + expect(await fs.readFile(`${logPath}.2`, 'utf8')).toBe('older-1'); + expect(await fs.readFile(`${logPath}.3`, 'utf8')).toBe('older-2'); + }); + + it('rotateIfNeeded is a no-op when under the limit', async () => { + const {rotateIfNeeded} = await import('../../../../node/updater/updateLog'); + await fs.writeFile(logPath, 'small'); + await rotateIfNeeded(logPath, 10 * 1024 * 1024, 3); + expect(await fs.readFile(logPath, 'utf8')).toBe('small'); + let backupExists = true; + try { await fs.access(`${logPath}.1`); } catch { backupExists = false; } + expect(backupExists).toBe(false); + }); +}); From 3f03472c26051d19b5c5c7a515a8336e14678632 Mon Sep 17 00:00:00 2001 From: John McLear Date: Fri, 8 May 2026 12:08:10 +0100 Subject: [PATCH 07/21] feat(updater): SessionDrainer + handshake guard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drainer schedules T-60 / -30 / -10 broadcasts and resolves at T=0; isAcceptingConnections() flips off for the duration. PadMessageHandler consults the flag at the start of CLIENT_READY and disconnects new joiners with reason "updateInProgress" — existing sockets are unaffected. Drains shorter than 30s collapse the early timers to fire ASAP rather than queue past the drain end. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/node/handler/PadMessageHandler.ts | 9 ++ src/node/updater/SessionDrainer.ts | 79 +++++++++++++++++ .../specs/updater/SessionDrainer.test.ts | 85 +++++++++++++++++++ .../specs/updater/preflight.test.ts | 5 +- 4 files changed, 176 insertions(+), 2 deletions(-) create mode 100644 src/node/updater/SessionDrainer.ts create mode 100644 src/tests/backend-new/specs/updater/SessionDrainer.test.ts diff --git a/src/node/handler/PadMessageHandler.ts b/src/node/handler/PadMessageHandler.ts index 65ac9d7626d..3a77c972ba4 100644 --- a/src/node/handler/PadMessageHandler.ts +++ b/src/node/handler/PadMessageHandler.ts @@ -37,6 +37,7 @@ import settings, { sofficeAvailable } from '../utils/Settings'; import {anonymizeIp} from '../utils/anonymizeIp'; +import {isAcceptingConnections} from '../updater/SessionDrainer'; const logIp = (ip: string | null | undefined) => anonymizeIp(ip, settings.ipLogging); const securityManager = require('../db/SecurityManager'); const plugins = require('../../static/js/pluginfw/plugin_defs'); @@ -377,6 +378,14 @@ exports.handleMessage = async (socket:any, message: ClientVarMessage) => { if (!thisSession) throw new Error('message from an unknown connection'); if (message.type === 'CLIENT_READY') { + // Refuse new joiners while the updater drainer is running. Existing sockets + // are unaffected — only the initial CLIENT_READY handshake is gated. The + // pad UI will show the drain announcement separately via shoutMessage. + if (!isAcceptingConnections()) { + socket.json.send({disconnect: 'updateInProgress'}); + socket.disconnect(true); + return; + } // Prefer the HttpOnly author-token cookie over the in-message token (GDPR // PR3). 
Legacy clients (pre-PR3 browsers or API consumers) still send // `token` in the CLIENT_READY payload — honour it one more release, warn diff --git a/src/node/updater/SessionDrainer.ts b/src/node/updater/SessionDrainer.ts new file mode 100644 index 00000000000..acddc512e61 --- /dev/null +++ b/src/node/updater/SessionDrainer.ts @@ -0,0 +1,79 @@ +/** + * Coordinates the pre-restart drain: refuses new pad connections, broadcasts + * "system message" announcements at T-60 / T-30 / T-10, and resolves at T=0 + * so the executor can take over. + * + * Per docs/superpowers/specs/2026-04-25-auto-update-design.md (section + * "Active sessions"). 60s default; configurable via `updates.drainSeconds`. + */ + +let acceptingConnections = true; + +export const isAcceptingConnections = (): boolean => acceptingConnections; + +/** Test-only: reset the module-level flag between tests. */ +export const _resetForTests = (): void => { acceptingConnections = true; }; + +export type DrainBroadcastKey = + | 'update.drain.t60' + | 'update.drain.t30' + | 'update.drain.t10'; + +export interface DrainerOpts { + drainSeconds: number; + /** Called for every announcement; values carries timing data the i18n string can interpolate. 
*/ + broadcast: (i18nKey: DrainBroadcastKey, values: Record) => void; +} + +export interface Drainer { + start: () => Promise<{outcome: 'completed' | 'cancelled'}>; + cancel: () => void; +} + +export const createDrainer = ({drainSeconds, broadcast}: DrainerOpts): Drainer => { + const timers: NodeJS.Timeout[] = []; + let resolveDone: ((r: {outcome: 'completed' | 'cancelled'}) => void) | null = null; + let cancelled = false; + let started = false; + + const fire = (key: DrainBroadcastKey, secondsRemaining: number) => { + if (cancelled) return; + broadcast(key, {seconds: secondsRemaining}); + }; + + const start = (): Promise<{outcome: 'completed' | 'cancelled'}> => { + if (started) return Promise.reject(new Error('drainer already started')); + started = true; + acceptingConnections = false; + return new Promise((resolve) => { + resolveDone = resolve; + const ms = drainSeconds * 1000; + // T-60 announcement fires at start; T-30 and T-10 are scheduled at offsets. + // Drain windows shorter than 30s fire the early timers ASAP, clamped to the real time left. + fire('update.drain.t60', drainSeconds); + timers.push(setTimeout(() => fire('update.drain.t30', Math.min(30, drainSeconds)), Math.max(0, ms - 30_000))); + timers.push(setTimeout(() => fire('update.drain.t10', Math.min(10, drainSeconds)), Math.max(0, ms - 10_000))); + timers.push(setTimeout(() => { + if (cancelled) return; + // Don't restore acceptingConnections — the executor is about to exit 75 + // and the supervisor restart will reset module state. Leaving the flag + // off until exit means stragglers can't slip in between drain end and + // exit(). 
+ resolveDone?.({outcome: 'completed'}); + resolveDone = null; + }, ms)); + }); + }; + + const cancel = (): void => { + if (cancelled) return; + cancelled = true; + for (const t of timers) clearTimeout(t); + timers.length = 0; + acceptingConnections = true; + resolveDone?.({outcome: 'cancelled'}); + resolveDone = null; + }; + + return {start, cancel}; +}; diff --git a/src/tests/backend-new/specs/updater/SessionDrainer.test.ts b/src/tests/backend-new/specs/updater/SessionDrainer.test.ts new file mode 100644 index 00000000000..ee5db23c880 --- /dev/null +++ b/src/tests/backend-new/specs/updater/SessionDrainer.test.ts @@ -0,0 +1,85 @@ +import {describe, it, expect, vi, beforeEach, afterEach} from 'vitest'; +import {createDrainer, isAcceptingConnections, _resetForTests} from '../../../../node/updater/SessionDrainer'; + +describe('SessionDrainer', () => { + beforeEach(() => { vi.useFakeTimers(); _resetForTests(); }); + afterEach(() => { vi.useRealTimers(); _resetForTests(); }); + + it('emits T-60, T-30, T-10 in order and resolves at T=0', async () => { + const broadcasts: string[] = []; + const drainer = createDrainer({ + drainSeconds: 60, + broadcast: (key) => { broadcasts.push(key); }, + }); + const done = drainer.start(); + expect(broadcasts).toEqual(['update.drain.t60']); + await vi.advanceTimersByTimeAsync(30_000); + expect(broadcasts).toEqual(['update.drain.t60', 'update.drain.t30']); + await vi.advanceTimersByTimeAsync(20_000); + expect(broadcasts).toEqual(['update.drain.t60', 'update.drain.t30', 'update.drain.t10']); + await vi.advanceTimersByTimeAsync(10_000); + const r = await done; + expect(r).toEqual({outcome: 'completed'}); + }); + + it('flips isAcceptingConnections to false during drain and back on cancel', () => { + const drainer = createDrainer({drainSeconds: 60, broadcast: () => {}}); + expect(isAcceptingConnections()).toBe(true); + drainer.start(); + expect(isAcceptingConnections()).toBe(false); + drainer.cancel(); + 
expect(isAcceptingConnections()).toBe(true); + }); + + it('cancel before T=0 resolves start() promise as cancelled', async () => { + const drainer = createDrainer({drainSeconds: 60, broadcast: () => {}}); + const done = drainer.start(); + await vi.advanceTimersByTimeAsync(20_000); + drainer.cancel(); + const r = await done; + expect(r).toEqual({outcome: 'cancelled'}); + }); + + it('cancel does not fire any further broadcasts', async () => { + const broadcasts: string[] = []; + const drainer = createDrainer({ + drainSeconds: 60, + broadcast: (key) => { broadcasts.push(key); }, + }); + drainer.start(); + expect(broadcasts).toEqual(['update.drain.t60']); + drainer.cancel(); + await vi.advanceTimersByTimeAsync(60_000); + expect(broadcasts).toEqual(['update.drain.t60']); + }); + + it('passes seconds-remaining in broadcast values', async () => { + const seen: Array<{key: string; values: any}> = []; + const drainer = createDrainer({ + drainSeconds: 60, + broadcast: (key, values) => { seen.push({key, values}); }, + }); + drainer.start(); + expect(seen[0]).toEqual({key: 'update.drain.t60', values: {seconds: 60}}); + await vi.advanceTimersByTimeAsync(30_000); + expect(seen[1]).toEqual({key: 'update.drain.t30', values: {seconds: 30}}); + await vi.advanceTimersByTimeAsync(20_000); + expect(seen[2]).toEqual({key: 'update.drain.t10', values: {seconds: 10}}); + }); + + it('drain shorter than 30s skips the t30 broadcast but still emits t10 and completes', async () => { + const broadcasts: string[] = []; + const drainer = createDrainer({ + drainSeconds: 15, + broadcast: (key) => { broadcasts.push(key); }, + }); + const done = drainer.start(); + expect(broadcasts).toEqual(['update.drain.t60']); + // t30 fires at max(0, 15-30)=0 i.e. immediately on next tick. 
+ await vi.advanceTimersByTimeAsync(0); + expect(broadcasts).toContain('update.drain.t30'); + await vi.advanceTimersByTimeAsync(15_000); + await done; + expect(broadcasts.at(-1)).toBe('update.drain.t10'); + }); +}); diff --git a/src/tests/backend-new/specs/updater/preflight.test.ts b/src/tests/backend-new/specs/updater/preflight.test.ts index 6ea44e24504..5926c7864bd 100644 --- a/src/tests/backend-new/specs/updater/preflight.test.ts +++ b/src/tests/backend-new/specs/updater/preflight.test.ts @@ -1,5 +1,6 @@ import {describe, it, expect, vi} from 'vitest'; import {runPreflight, PreflightDeps} from '../../../../node/updater/preflight'; +import type {VerifyResult} from '../../../../node/updater/trustedKeys'; const baseDeps = (): PreflightDeps => ({ installMethod: 'git', @@ -8,7 +9,7 @@ const baseDeps = (): PreflightDeps => ({ pnpmOnPath: vi.fn(async () => true), lockHeld: vi.fn(async () => false), remoteHasTag: vi.fn(async () => true), - verifyTag: vi.fn(async () => ({ok: true, reason: 'signature-not-required'})), + verifyTag: vi.fn(async (): Promise<VerifyResult> => ({ok: true, reason: 'signature-not-required'})), }); const baseInput = { @@ -62,7 +63,7 @@ describe('runPreflight', () => { it('rejects when signature verification fails', async () => { const r = await runPreflight(baseInput, { ...baseDeps(), - verifyTag: vi.fn(async () => ({ok: false, reason: 'signature-verification-failed'})), + verifyTag: vi.fn(async (): Promise<VerifyResult> => ({ok: false, reason: 'signature-verification-failed'})), }); expect(r).toEqual({ok: false, reason: 'signature-verification-failed'}); }); From 88a99c056a8244c52233ead41fe1d4044b5bb9ce Mon Sep 17 00:00:00 2001 From: John McLear Date: Fri, 8 May 2026 12:10:50 +0100 Subject: [PATCH 08/21] =?UTF-8?q?feat(updater):=20UpdateExecutor=20?= =?UTF-8?q?=E2=80=94=20snapshot,=20fetch/checkout/install/build,=20exit=20?= =?UTF-8?q?75?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pure-DI orchestrator: spawnFn, copyFile, 
readSha, saveState, exit are all injected so unit tests run the full pipeline without spawning real children or mutating the real install. Streams stdout/stderr to update.log via the now-best-effort appendLine helper (swallows fs errors so the executor itself never breaks on read-only / unwritable log dirs). Failure paths transition to rolling-back and return — the route layer hands off to RollbackHandler which owns the rollback exit, so we don't double-exit and lose tail lines. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/node/updater/UpdateExecutor.ts | 152 ++++++++++++++++++ src/node/updater/updateLog.ts | 19 +++ .../specs/updater/UpdateExecutor.test.ts | 145 +++++++++++++++++ .../specs/updater/updateLog.test.ts | 10 ++ 4 files changed, 326 insertions(+) create mode 100644 src/node/updater/UpdateExecutor.ts create mode 100644 src/tests/backend-new/specs/updater/UpdateExecutor.test.ts diff --git a/src/node/updater/UpdateExecutor.ts b/src/node/updater/UpdateExecutor.ts new file mode 100644 index 00000000000..85229ed4f91 --- /dev/null +++ b/src/node/updater/UpdateExecutor.ts @@ -0,0 +1,152 @@ +import path from 'node:path'; +import log4js from 'log4js'; +import {SpawnOptions} from 'node:child_process'; +import {UpdateState} from './types'; +import {appendLine} from './updateLog'; + +const logger = log4js.getLogger('updater'); + +export type SpawnFn = (cmd: string, args: string[], opts: SpawnOptions) => { + stdout: {on: (event: 'data', cb: (chunk: Buffer) => void) => void}; + stderr: {on: (event: 'data', cb: (chunk: Buffer) => void) => void}; + on: (event: 'close', cb: (code: number | null) => void) => void; +}; + +export interface ExecutorDeps { + /** Path of the on-disk Etherpad install (the git working tree). */ + repoDir: string; + /** Where pnpm-lock.yaml + sha info gets backed up. */ + backupDir: string; + /** Injected child_process.spawn so tests can drive the pipeline deterministically. */ + spawnFn: SpawnFn; + /** Returns the current HEAD SHA. 
Production callers wrap `git rev-parse HEAD`. */ + readSha: () => Promise; + /** Plain file copy. Production callers use fs.copyFile (with mkdir-p of parent). */ + copyFile: (src: string, dst: string) => Promise; + /** Persist the in-flight UpdateState. Production callers use saveState(stateFilePath()). */ + saveState: (s: UpdateState) => Promise; + /** State as it was when Apply was clicked — preserves Tier 1 fields (latest, email, etc.). */ + initialState: UpdateState; + /** Tag to update to. */ + targetTag: string; + /** Clock injection for deterministic timestamps in tests. */ + now: () => Date; + /** process.exit injection so tests can assert exit code without actually exiting. */ + exit: (code: number) => void; +} + +export type ExecutorResult = + | {outcome: 'pending-verification'} + | {outcome: 'failed-install'; reason: string} + | {outcome: 'failed-build'; reason: string} + | {outcome: 'failed-checkout'; reason: string}; + +const runStep = ( + spawnFn: SpawnFn, + repoDir: string, + logPath: string, + cmd: string, + args: string[], +): Promise<{code: number | null; stderr: string}> => new Promise((resolve) => { + let stderr = ''; + const child = spawnFn(cmd, args, {cwd: repoDir, stdio: ['ignore', 'pipe', 'pipe']}); + const tag = `${cmd} ${args.join(' ')}`; + child.stdout.on('data', (chunk: Buffer) => { + const txt = chunk.toString().trimEnd(); + logger.info(`[${tag}] ${txt}`); + appendLine(logPath, `[${new Date().toISOString()}] ${tag} | ${txt}`); + }); + child.stderr.on('data', (chunk: Buffer) => { + const txt = chunk.toString(); + stderr += txt; + const trimmed = txt.trimEnd(); + logger.warn(`[${tag}] ${trimmed}`); + appendLine(logPath, `[${new Date().toISOString()}] ${tag} ERR | ${trimmed}`); + }); + child.on('close', (code) => resolve({code, stderr})); +}); + +/** + * Run the update pipeline. Each transition writes state before/after so a hard + * kill mid-step lands the next boot in a known state for RollbackHandler. 
+ * + * On install/build/checkout failure the executor transitions to `rolling-back`, + * persists, and returns. The route layer then runs RollbackHandler.performRollback. + * The executor does NOT call `exit` on failure paths — the rollback path owns + * that exit so we don't double-exit and lose log lines. + */ +export const executeUpdate = async (deps: ExecutorDeps): Promise => { + const fromSha = await deps.readSha(); + const logPath = path.join(deps.repoDir, 'var', 'log', 'update.log'); + + let s: UpdateState = { + ...deps.initialState, + execution: { + status: 'executing', + targetTag: deps.targetTag, + fromSha, + startedAt: deps.now().toISOString(), + }, + bootCount: 0, + }; + await deps.saveState(s); + + // Snapshot lockfile (SHA already captured above; the rollback handler reads + // execution.fromSha rather than a separate file so a successful rollback + // doesn't depend on /var staying writable past this point). + await deps.copyFile( + path.join(deps.repoDir, 'pnpm-lock.yaml'), + path.join(deps.backupDir, 'pnpm-lock.yaml'), + ); + + const fail = async ( + outcome: 'failed-install' | 'failed-build' | 'failed-checkout', + reason: string, + ): Promise => { + s = { + ...s, + execution: { + status: 'rolling-back', + reason, + targetTag: deps.targetTag, + fromSha, + at: deps.now().toISOString(), + }, + }; + await deps.saveState(s); + logger.error(`update step failed (${outcome}): ${reason}`); + appendLine(logPath, `[${deps.now().toISOString()}] FAIL ${outcome}: ${reason}`); + return {outcome, reason}; + }; + + let r = await runStep(deps.spawnFn, deps.repoDir, logPath, 'git', ['fetch', '--tags', 'origin']); + if (r.code !== 0) return fail('failed-checkout', `git fetch exit ${r.code}: ${r.stderr.trim()}`); + + r = await runStep(deps.spawnFn, deps.repoDir, logPath, 'git', ['checkout', deps.targetTag]); + if (r.code !== 0) return fail('failed-checkout', `git checkout exit ${r.code}: ${r.stderr.trim()}`); + + r = await runStep(deps.spawnFn, deps.repoDir, logPath, 
'pnpm', ['install', '--frozen-lockfile']); + if (r.code !== 0) return fail('failed-install', `pnpm install exit ${r.code}: ${r.stderr.trim()}`); + + r = await runStep(deps.spawnFn, deps.repoDir, logPath, 'pnpm', ['run', 'build:ui']); + if (r.code !== 0) return fail('failed-build', `pnpm run build:ui exit ${r.code}: ${r.stderr.trim()}`); + + // pending-verification: the next boot's RollbackHandler arms the health-check timer. + s = { + ...s, + execution: { + status: 'pending-verification', + targetTag: deps.targetTag, + fromSha, + // Real deadline is computed at next boot using rollbackHealthCheckSeconds. + // We persist a placeholder here purely so the field is present. + deadlineAt: deps.now().toISOString(), + }, + bootCount: 0, + }; + await deps.saveState(s); + logger.info(`update executed: ${fromSha} -> ${deps.targetTag}; exiting 75 for supervisor restart`); + await appendLine(logPath, `[${deps.now().toISOString()}] OK pending-verification ${fromSha} -> ${deps.targetTag}; exiting 75`); + deps.exit(75); + return {outcome: 'pending-verification'}; +}; diff --git a/src/node/updater/updateLog.ts b/src/node/updater/updateLog.ts index 386942beb1a..f0c3b58b0f6 100644 --- a/src/node/updater/updateLog.ts +++ b/src/node/updater/updateLog.ts @@ -38,12 +38,31 @@ export const rotateIfNeeded = async ( * Append `line` to ``, rotating first if the file is over the size cap. * Creates parent directories as needed. The line is newline-terminated; do not * include a trailing newline in `line`. + * + * Best-effort: swallows fs errors silently. Update logging must never break the + * update flow itself, and errors are already surfaced via log4js by callers. 
*/ export const appendLine = async ( logPath: string, line: string, maxBytes = DEFAULT_MAX_BYTES, backups = DEFAULT_BACKUPS, +): Promise => { + try { + await fs.mkdir(path.dirname(logPath), {recursive: true}); + await rotateIfNeeded(logPath, maxBytes, backups); + await fs.appendFile(logPath, `${line}\n`); + } catch { + // ignore — caller is fire-and-forget logging + } +}; + +/** Same as appendLine but throws on error — used by tests that want to assert disk failures surface. */ +export const appendLineStrict = async ( + logPath: string, + line: string, + maxBytes = DEFAULT_MAX_BYTES, + backups = DEFAULT_BACKUPS, ): Promise => { await fs.mkdir(path.dirname(logPath), {recursive: true}); await rotateIfNeeded(logPath, maxBytes, backups); diff --git a/src/tests/backend-new/specs/updater/UpdateExecutor.test.ts b/src/tests/backend-new/specs/updater/UpdateExecutor.test.ts new file mode 100644 index 00000000000..29a4374fa5c --- /dev/null +++ b/src/tests/backend-new/specs/updater/UpdateExecutor.test.ts @@ -0,0 +1,145 @@ +import {describe, it, expect, vi, beforeEach} from 'vitest'; +import {executeUpdate, ExecutorDeps} from '../../../../node/updater/UpdateExecutor'; +import {EMPTY_STATE, UpdateState} from '../../../../node/updater/types'; + +interface ScriptStep {cmd: string; exit: number; stderr?: string} + +const okSpawn = (script: ScriptStep[]) => { + let i = 0; + return vi.fn((cmd: string, args: string[]) => { + const step = script[i++]; + if (!step) throw new Error(`Unexpected spawn call: ${cmd} ${args.join(' ')}`); + const expected = step.cmd; + const actual = `${cmd} ${args.join(' ')}`; + if (expected !== actual) { + throw new Error(`Spawn order mismatch: expected "${expected}", got "${actual}"`); + } + return { + stdout: {on: () => {}}, + stderr: {on: (e: string, cb: any) => { if (e === 'data' && step.stderr) cb(Buffer.from(step.stderr)); }}, + on: (e: string, cb: any) => { if (e === 'close') setImmediate(() => cb(step.exit)); }, + }; + }); +}; + +const baseDeps = (): 
{ + deps: ExecutorDeps; + states: UpdateState[]; + copies: Array<{src: string; dst: string}>; + exitedWith: {code: number | null}; + fromShaUsed: {value: string | null}; +} => { + const states: UpdateState[] = []; + const copies: Array<{src: string; dst: string}> = []; + const exitedWith = {code: null as number | null}; + const fromShaUsed = {value: null as string | null}; + return { + deps: { + repoDir: '/srv/etherpad', + backupDir: '/srv/etherpad/var/update-backup', + spawnFn: okSpawn([ + {cmd: 'git fetch --tags origin', exit: 0}, + {cmd: 'git checkout v2.7.3', exit: 0}, + {cmd: 'pnpm install --frozen-lockfile', exit: 0}, + {cmd: 'pnpm run build:ui', exit: 0}, + ]) as any, + readSha: vi.fn(async () => { fromShaUsed.value = 'abc123'; return 'abc123'; }), + copyFile: vi.fn(async (src: string, dst: string) => { copies.push({src, dst}); }), + saveState: vi.fn(async (s: UpdateState) => { states.push(structuredClone(s)); }), + initialState: structuredClone(EMPTY_STATE), + targetTag: 'v2.7.3', + now: () => new Date('2026-05-08T10:00:00Z'), + exit: (code: number) => { exitedWith.code = code; }, + }, + states, + copies, + exitedWith, + fromShaUsed, + }; +}; + +describe('executeUpdate', () => { + it('happy path: snapshots, runs steps, persists pending-verification, exits 75', async () => { + const {deps, states, copies, exitedWith} = baseDeps(); + const r = await executeUpdate(deps); + expect(r).toEqual({outcome: 'pending-verification'}); + expect(copies).toEqual([ + {src: '/srv/etherpad/pnpm-lock.yaml', dst: '/srv/etherpad/var/update-backup/pnpm-lock.yaml'}, + ]); + expect(states.at(-1)?.execution.status).toBe('pending-verification'); + expect((states.at(-1)?.execution as any).fromSha).toBe('abc123'); + expect(states.at(-1)?.bootCount).toBe(0); + expect(exitedWith.code).toBe(75); + }); + + it('records the executing -> pending-verification transition in saveState calls', async () => { + const {deps, states} = baseDeps(); + await executeUpdate(deps); + const statuses = 
states.map((s) => s.execution.status); + expect(statuses[0]).toBe('executing'); + expect(statuses.at(-1)).toBe('pending-verification'); + }); + + it('install failure flips state to rolling-back without exiting', async () => { + const {deps, states, exitedWith} = baseDeps(); + deps.spawnFn = okSpawn([ + {cmd: 'git fetch --tags origin', exit: 0}, + {cmd: 'git checkout v2.7.3', exit: 0}, + {cmd: 'pnpm install --frozen-lockfile', exit: 1, stderr: 'resolver bork'}, + ]) as any; + const r = await executeUpdate(deps); + expect(r.outcome).toBe('failed-install'); + expect(states.at(-1)?.execution.status).toBe('rolling-back'); + expect((states.at(-1)?.execution as any).reason).toContain('pnpm install exit 1'); + expect(exitedWith.code).toBeNull(); // executor must not exit on failure paths + }); + + it('build failure flips state to rolling-back', async () => { + const {deps, states, exitedWith} = baseDeps(); + deps.spawnFn = okSpawn([ + {cmd: 'git fetch --tags origin', exit: 0}, + {cmd: 'git checkout v2.7.3', exit: 0}, + {cmd: 'pnpm install --frozen-lockfile', exit: 0}, + {cmd: 'pnpm run build:ui', exit: 2, stderr: 'tsc bork'}, + ]) as any; + const r = await executeUpdate(deps); + expect(r.outcome).toBe('failed-build'); + expect(states.at(-1)?.execution.status).toBe('rolling-back'); + expect(exitedWith.code).toBeNull(); + }); + + it('checkout failure flips state to rolling-back (no copyFile? actually copies first)', async () => { + // copyFile is called before any spawn; checkout is the second spawn so by then the + // backup lockfile is in place. This matters: rollback needs the backup to exist. 
+ const {deps, copies, states} = baseDeps(); + deps.spawnFn = okSpawn([ + {cmd: 'git fetch --tags origin', exit: 0}, + {cmd: 'git checkout v2.7.3', exit: 1, stderr: 'conflict'}, + ]) as any; + const r = await executeUpdate(deps); + expect(r.outcome).toBe('failed-checkout'); + expect(copies.length).toBe(1); // backup taken before any mutation + expect(states.at(-1)?.execution.status).toBe('rolling-back'); + }); + + it('git-fetch failure flips state to rolling-back', async () => { + const {deps, states} = baseDeps(); + deps.spawnFn = okSpawn([ + {cmd: 'git fetch --tags origin', exit: 128, stderr: 'cannot reach origin'}, + ]) as any; + const r = await executeUpdate(deps); + expect(r.outcome).toBe('failed-checkout'); + expect(states.at(-1)?.execution.status).toBe('rolling-back'); + }); + + it('captures fromSha into the rolling-back state so RollbackHandler can restore it', async () => { + const {deps, states} = baseDeps(); + deps.spawnFn = okSpawn([ + {cmd: 'git fetch --tags origin', exit: 0}, + {cmd: 'git checkout v2.7.3', exit: 0}, + {cmd: 'pnpm install --frozen-lockfile', exit: 1}, + ]) as any; + await executeUpdate(deps); + expect((states.at(-1)?.execution as any).fromSha).toBe('abc123'); + }); +}); diff --git a/src/tests/backend-new/specs/updater/updateLog.test.ts b/src/tests/backend-new/specs/updater/updateLog.test.ts index f177a1bddd2..ccb17a537ab 100644 --- a/src/tests/backend-new/specs/updater/updateLog.test.ts +++ b/src/tests/backend-new/specs/updater/updateLog.test.ts @@ -67,6 +67,16 @@ describe('appendLine + rotation', () => { expect(await fs.readFile(nested, 'utf8')).toBe('hello world\n'); }); + it('appendLine swallows errors so the caller never breaks on a read-only fs', async () => { + const {appendLine} = await import('../../../../node/updater/updateLog'); + // Make the would-be parent dir a regular file — fs.mkdir then fails with ENOTDIR + // (or EEXIST depending on platform), which the helper must swallow. 
+ const collide = path.join(dir, 'not-a-dir'); + await fs.writeFile(collide, 'oops'); + const target = path.join(collide, 'inner', 'update.log'); + await appendLine(target, 'x'); // must NOT throw + }); + it('rotateIfNeeded shifts .1 -> .2, current -> .1 once over the size threshold', async () => { const {rotateIfNeeded} = await import('../../../../node/updater/updateLog'); // Force rotation by passing a tiny limit; write a line above the limit. From 46e68f375f85a7005137b5bfbd426822d6c23c97 Mon Sep 17 00:00:00 2001 From: John McLear Date: Fri, 8 May 2026 12:12:20 +0100 Subject: [PATCH 09/21] =?UTF-8?q?feat(updater):=20RollbackHandler=20?= =?UTF-8?q?=E2=80=94=20health-check=20timer=20+=20crash-loop=20guard?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit checkPendingVerification arms a 60s timer at boot when state is pending-verification and increments bootCount; bootCount>2 forces an immediate rollback (crash-loop guard). markVerified persists the verified state and stops the timer. performRollback restores the backup lockfile, runs git checkout and pnpm install, lands on rolled-back or rollback-failed (terminal) on sub-step failure, exits 75 either way so the supervisor restart brings the new state up. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/node/updater/RollbackHandler.ts | 186 ++++++++++++++++ .../specs/updater/RollbackHandler.test.ts | 203 ++++++++++++++++++ 2 files changed, 389 insertions(+) create mode 100644 src/node/updater/RollbackHandler.ts create mode 100644 src/tests/backend-new/specs/updater/RollbackHandler.test.ts diff --git a/src/node/updater/RollbackHandler.ts b/src/node/updater/RollbackHandler.ts new file mode 100644 index 00000000000..59cb9b6d002 --- /dev/null +++ b/src/node/updater/RollbackHandler.ts @@ -0,0 +1,186 @@ +import path from 'node:path'; +import log4js from 'log4js'; +import {UpdateState} from './types'; +import type {SpawnFn} from './UpdateExecutor'; +import {appendLine} from './updateLog'; + +const logger = log4js.getLogger('updater'); + +export interface RollbackDeps { + /** Path of the on-disk Etherpad install (the git working tree). */ + repoDir: string; + /** Where pnpm-lock.yaml was backed up by the executor. */ + backupDir: string; + spawnFn: SpawnFn; + copyFile: (src: string, dst: string) => Promise; + saveState: (s: UpdateState) => Promise; + exit: (code: number) => void; + now: () => Date; + /** Health-check window after a fresh boot. Default 60s; set via updates.rollbackHealthCheckSeconds. 
*/ + rollbackHealthCheckSeconds: number; +} + +const runStep = ( + spawnFn: SpawnFn, + cwd: string, + logPath: string, + cmd: string, + args: string[], +): Promise => new Promise((resolve) => { + const child = spawnFn(cmd, args, {cwd, stdio: ['ignore', 'pipe', 'pipe']}); + const tag = `${cmd} ${args.join(' ')}`; + child.stdout.on('data', (b: Buffer) => { + const t = b.toString().trimEnd(); + logger.info(`[rollback ${tag}] ${t}`); + appendLine(logPath, `[${new Date().toISOString()}] rollback ${tag} | ${t}`); + }); + child.stderr.on('data', (b: Buffer) => { + const t = b.toString().trimEnd(); + logger.warn(`[rollback ${tag}] ${t}`); + appendLine(logPath, `[${new Date().toISOString()}] rollback ${tag} ERR | ${t}`); + }); + child.on('close', (c) => resolve(c)); +}); + +/** + * Restore the previous SHA + lockfile and exit 75 so the supervisor restarts. + * + * Lands on `rolled-back` on success, `rollback-failed` on any sub-step error. + * Both paths exit 75 — the supervisor restart is what brings the rolled-back + * (or terminal) state up where the admin UI can surface it. Rollback-failed + * disables auto/autonomous tiers globally (see UpdatePolicy) until an admin + * POSTs /admin/update/acknowledge. + */ +export const performRollback = async (state: UpdateState, deps: RollbackDeps): Promise => { + const exec = state.execution; + if (exec.status !== 'rolling-back' && exec.status !== 'pending-verification') { + throw new Error(`performRollback called from unexpected status: ${exec.status}`); + } + const fromSha = (exec as {fromSha: string}).fromSha; + const targetTag = (exec as {targetTag: string}).targetTag; + const reason = exec.status === 'rolling-back' + ? 
exec.reason + : 'health-check-failed-or-crash-loop'; + const logPath = path.join(deps.repoDir, 'var', 'log', 'update.log'); + + const failTerminal = async (subReason: string): Promise => { + const at = deps.now().toISOString(); + await deps.saveState({ + ...state, + execution: { + status: 'rollback-failed', + reason: `${reason}; rollback also failed: ${subReason}`, + targetTag, + fromSha, + at, + }, + lastResult: { + targetTag, + fromSha, + outcome: 'rollback-failed', + reason: `${reason}; rollback failed: ${subReason}`, + at, + }, + bootCount: 0, + }); + logger.error( + `rollback FAILED: ${subReason}; manual intervention required ` + + '(POST /admin/update/acknowledge after fixing)', + ); + appendLine(logPath, `[${at}] ROLLBACK_FAILED ${subReason}`); + deps.exit(75); + }; + + try { + await deps.copyFile( + path.join(deps.backupDir, 'pnpm-lock.yaml'), + path.join(deps.repoDir, 'pnpm-lock.yaml'), + ); + } catch (err) { + return failTerminal(`copy lockfile: ${(err as Error).message}`); + } + + const checkoutCode = await runStep(deps.spawnFn, deps.repoDir, logPath, 'git', ['checkout', fromSha]); + if (checkoutCode !== 0) return failTerminal(`git checkout ${fromSha} exit ${checkoutCode}`); + + const installCode = await runStep(deps.spawnFn, deps.repoDir, logPath, 'pnpm', ['install', '--frozen-lockfile']); + if (installCode !== 0) return failTerminal(`pnpm install exit ${installCode}`); + + const at = deps.now().toISOString(); + await deps.saveState({ + ...state, + execution: {status: 'rolled-back', reason, targetTag, restoredSha: fromSha, at}, + lastResult: {targetTag, fromSha, outcome: 'rolled-back', reason, at}, + bootCount: 0, + }); + logger.warn(`rolled back to ${fromSha} (reason: ${reason})`); + appendLine(logPath, `[${at}] ROLLED_BACK to ${fromSha}; reason=${reason}; exiting 75`); + deps.exit(75); +}; + +export interface CheckResult { + /** True if a health-check timer was armed and is awaiting markVerified or expiry. 
*/ + armed: boolean; + /** Cancels the timer and transitions to `verified`. No-op when armed is false. */ + markVerified: () => void; +} + +/** + * Inspect the persisted execution state at boot and react: + * - idle / verified / etc.: no-op. + * - pending-verification with bootCount > 2: force rollback (crash-loop guard). + * - pending-verification otherwise: increment bootCount, persist, arm a timer. + */ +export const checkPendingVerification = (state: UpdateState, deps: RollbackDeps): CheckResult => { + const exec = state.execution; + if (exec.status !== 'pending-verification') return {armed: false, markVerified: () => {}}; + + if (state.bootCount > 2) { + // Don't await — fire and forget so the boot sequence proceeds; the rollback + // path will exit 75 asynchronously and the supervisor restarts on the + // restored SHA. + void performRollback(state, deps); + return {armed: false, markVerified: () => {}}; + } + + const incremented: UpdateState = {...state, bootCount: state.bootCount + 1}; + void deps.saveState(incremented); + + let cleared = false; + const timer = setTimeout(() => { + if (cleared) return; + void performRollback({ + ...incremented, + execution: { + status: 'rolling-back', + reason: 'health-check-timeout', + targetTag: exec.targetTag, + fromSha: exec.fromSha, + at: deps.now().toISOString(), + }, + }, deps); + }, deps.rollbackHealthCheckSeconds * 1000); + + return { + armed: true, + markVerified: () => { + if (cleared) return; + cleared = true; + clearTimeout(timer); + const at = deps.now().toISOString(); + void deps.saveState({ + ...incremented, + execution: {status: 'verified', targetTag: exec.targetTag, verifiedAt: at}, + lastResult: { + targetTag: exec.targetTag, + fromSha: exec.fromSha, + outcome: 'verified', + reason: null, + at, + }, + bootCount: 0, + }); + logger.info(`update verified after restart: ${exec.fromSha} -> ${exec.targetTag}`); + }, + }; +}; diff --git a/src/tests/backend-new/specs/updater/RollbackHandler.test.ts 
b/src/tests/backend-new/specs/updater/RollbackHandler.test.ts new file mode 100644 index 00000000000..8a30cbcb769 --- /dev/null +++ b/src/tests/backend-new/specs/updater/RollbackHandler.test.ts @@ -0,0 +1,203 @@ +import {describe, it, expect, vi, beforeEach, afterEach} from 'vitest'; +import {checkPendingVerification, performRollback, RollbackDeps} from '../../../../node/updater/RollbackHandler'; +import {EMPTY_STATE} from '../../../../node/updater/types'; + +const okSpawn = (exit: number) => vi.fn(() => ({ + stdout: {on: () => {}}, + stderr: {on: () => {}}, + on: (e: string, cb: any) => { if (e === 'close') setImmediate(() => cb(exit)); }, +})) as any; + +const baseDeps = (): RollbackDeps => ({ + repoDir: '/srv/etherpad', + backupDir: '/srv/etherpad/var/update-backup', + spawnFn: okSpawn(0), + copyFile: vi.fn(async () => {}), + saveState: vi.fn(async () => {}), + exit: vi.fn(), + now: () => new Date('2026-05-08T10:00:00Z'), + rollbackHealthCheckSeconds: 60, +}); + +describe('checkPendingVerification', () => { + beforeEach(() => { vi.useFakeTimers(); }); + afterEach(() => { vi.useRealTimers(); }); + + it('idle state is a no-op (timer is not armed)', () => { + const r = checkPendingVerification(structuredClone(EMPTY_STATE), baseDeps()); + expect(r.armed).toBe(false); + }); + + it('pending-verification with bootCount<=2 arms a timer and increments bootCount', async () => { + const deps = baseDeps(); + const state = { + ...structuredClone(EMPTY_STATE), + execution: { + status: 'pending-verification' as const, + targetTag: 'v2.7.3', + fromSha: 'abc', + deadlineAt: '2026-05-08T10:00:00Z', + }, + bootCount: 0, + }; + const r = checkPendingVerification(state, deps); + expect(r.armed).toBe(true); + expect(deps.saveState).toHaveBeenCalledWith(expect.objectContaining({bootCount: 1})); + // markVerified clears the timer; advancing past the deadline does NOT trigger rollback. 
+ r.markVerified(); + await vi.advanceTimersByTimeAsync(60_000); + await vi.runAllTimersAsync(); + expect(deps.exit).not.toHaveBeenCalled(); + }); + + it('markVerified persists the verified state with lastResult=verified', () => { + const deps = baseDeps(); + const state = { + ...structuredClone(EMPTY_STATE), + execution: { + status: 'pending-verification' as const, + targetTag: 'v2.7.3', fromSha: 'abc', + deadlineAt: '2026-05-08T10:00:00Z', + }, + bootCount: 0, + }; + const r = checkPendingVerification(state, deps); + r.markVerified(); + const lastSave = (deps.saveState as any).mock.calls.at(-1)[0]; + expect(lastSave.execution.status).toBe('verified'); + expect(lastSave.lastResult.outcome).toBe('verified'); + expect(lastSave.bootCount).toBe(0); + }); + + it('pending-verification with bootCount>2 forces immediate rollback', async () => { + const deps = baseDeps(); + const state = { + ...structuredClone(EMPTY_STATE), + execution: { + status: 'pending-verification' as const, + targetTag: 'v2.7.3', fromSha: 'abc', + deadlineAt: '2026-05-08T10:00:00Z', + }, + bootCount: 3, + }; + const r = checkPendingVerification(state, deps); + expect(r.armed).toBe(false); + await vi.runAllTimersAsync(); + expect(deps.exit).toHaveBeenCalledWith(75); + }); + + it('timer expiry triggers rollback when markVerified is never called', async () => { + const deps = baseDeps(); + const state = { + ...structuredClone(EMPTY_STATE), + execution: { + status: 'pending-verification' as const, + targetTag: 'v2.7.3', fromSha: 'abc', + deadlineAt: '2026-05-08T10:00:00Z', + }, + bootCount: 0, + }; + const r = checkPendingVerification(state, deps); + expect(r.armed).toBe(true); + await vi.advanceTimersByTimeAsync(60_000); + await vi.runAllTimersAsync(); + expect(deps.exit).toHaveBeenCalledWith(75); + }); +}); + +describe('performRollback', () => { + it('happy path: restores lockfile, checks out fromSha, retries pnpm install, exits 75', async () => { + const deps = baseDeps(); + const state = { + 
...structuredClone(EMPTY_STATE), + execution: { + status: 'rolling-back' as const, + reason: 'install-failed', + targetTag: 'v2.7.3', fromSha: 'abc', + at: '2026-05-08T10:00:00Z', + }, + bootCount: 0, + }; + await performRollback(state, deps); + expect(deps.copyFile).toHaveBeenCalledWith( + '/srv/etherpad/var/update-backup/pnpm-lock.yaml', + '/srv/etherpad/pnpm-lock.yaml', + ); + const lastSave = (deps.saveState as any).mock.calls.at(-1)[0]; + expect(lastSave.execution.status).toBe('rolled-back'); + expect(lastSave.lastResult.outcome).toBe('rolled-back'); + expect(deps.exit).toHaveBeenCalledWith(75); + }); + + it('rollback failure (lockfile copy throws) lands on rollback-failed terminal', async () => { + const deps = baseDeps(); + deps.copyFile = vi.fn(async () => { throw new Error('EACCES'); }); + const state = { + ...structuredClone(EMPTY_STATE), + execution: { + status: 'rolling-back' as const, + reason: 'install-failed', + targetTag: 'v2.7.3', fromSha: 'abc', + at: '2026-05-08T10:00:00Z', + }, + bootCount: 0, + }; + await performRollback(state, deps); + const lastSave = (deps.saveState as any).mock.calls.at(-1)[0]; + expect(lastSave.execution.status).toBe('rollback-failed'); + expect(lastSave.lastResult.outcome).toBe('rollback-failed'); + expect(deps.exit).toHaveBeenCalledWith(75); + }); + + it('rollback failure (git checkout exits non-zero) lands on rollback-failed', async () => { + const deps = baseDeps(); + let calls = 0; + deps.spawnFn = vi.fn(() => ({ + stdout: {on: () => {}}, + stderr: {on: () => {}}, + on: (e: string, cb: any) => { if (e === 'close') setImmediate(() => cb(calls++ === 0 ? 
1 : 0)); }, + })) as any; + const state = { + ...structuredClone(EMPTY_STATE), + execution: { + status: 'rolling-back' as const, + reason: 'build-failed', + targetTag: 'v2.7.3', fromSha: 'abc', + at: '2026-05-08T10:00:00Z', + }, + bootCount: 0, + }; + await performRollback(state, deps); + const lastSave = (deps.saveState as any).mock.calls.at(-1)[0]; + expect(lastSave.execution.status).toBe('rollback-failed'); + }); + + it('rollback failure (pnpm install exits non-zero) lands on rollback-failed', async () => { + const deps = baseDeps(); + let calls = 0; + deps.spawnFn = vi.fn(() => ({ + stdout: {on: () => {}}, + stderr: {on: () => {}}, + on: (e: string, cb: any) => { if (e === 'close') setImmediate(() => cb(calls++ === 0 ? 0 : 1)); }, + })) as any; + const state = { + ...structuredClone(EMPTY_STATE), + execution: { + status: 'rolling-back' as const, + reason: 'build-failed', + targetTag: 'v2.7.3', fromSha: 'abc', + at: '2026-05-08T10:00:00Z', + }, + bootCount: 0, + }; + await performRollback(state, deps); + const lastSave = (deps.saveState as any).mock.calls.at(-1)[0]; + expect(lastSave.execution.status).toBe('rollback-failed'); + }); + + it('throws when called from an unexpected status', async () => { + const deps = baseDeps(); + const state = structuredClone(EMPTY_STATE); + await expect(performRollback(state, deps)).rejects.toThrow(); + }); +}); From f4ba409b7e8bfdf98422145189b665e103fea0e2 Mon Sep 17 00:00:00 2001 From: John McLear Date: Fri, 8 May 2026 12:14:49 +0100 Subject: [PATCH 10/21] feat(updater): wire RollbackHandler into boot + UpdatePolicy honours rollback-failed - expressCreateServer now invokes checkPendingVerification before polling starts so a previous boot's pending-verification either re-arms the health-check timer or, when bootCount has climbed past the crash-loop threshold, forces an immediate rollback. 
- server.ts calls markBootHealthy after state hits RUNNING so /health-being-up is the implicit happy-path signal that cancels the rollback timer. - /admin/update/status surfaces execution + lastResult + lockHeld so the admin UI can render the right Apply / Cancel / Acknowledge state. - UpdatePolicy gains an `executionStatus` input. While it equals 'rollback-failed', canAuto / canAutonomous are denied (reason: rollback-failed-terminal); manual stays on because clicking Apply IS the intervention the terminal state needs. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/node/hooks/express/updateStatus.ts | 15 ++++++- src/node/server.ts | 11 +++++ src/node/updater/UpdatePolicy.ts | 28 +++++++++---- src/node/updater/index.ts | 40 +++++++++++++++++++ .../specs/updater/UpdatePolicy.test.ts | 31 ++++++++++++++ 5 files changed, 117 insertions(+), 8 deletions(-) diff --git a/src/node/hooks/express/updateStatus.ts b/src/node/hooks/express/updateStatus.ts index db30cf52c1d..24ab729e96b 100644 --- a/src/node/hooks/express/updateStatus.ts +++ b/src/node/hooks/express/updateStatus.ts @@ -1,11 +1,13 @@ 'use strict'; +import path from 'node:path'; import {ArgsExpressType} from '../../types/ArgsExpressType'; import settings, {getEpVersion} from '../../utils/Settings'; import {getDetectedInstallMethod, stateFilePath} from '../../updater'; import {evaluatePolicy} from '../../updater/UpdatePolicy'; import {compareSemver, isMajorBehind, isVulnerable} from '../../updater/versionCompare'; import {loadState} from '../../updater/state'; +import {isHeld} from '../../updater/lock'; let badgeCache: {value: 'severe' | 'vulnerable' | null; at: number} = {value: null, at: 0}; @@ -77,8 +79,15 @@ export const expressCreateServer = ( const current = getEpVersion(); const installMethod = getDetectedInstallMethod(); const policy = state.latest - ? evaluatePolicy({installMethod, tier: settings.updates.tier, current, latest: state.latest.version}) + ? 
evaluatePolicy({ + installMethod, + tier: settings.updates.tier, + current, + latest: state.latest.version, + executionStatus: state.execution.status, + }) : null; + const lockHeld = await isHeld(path.join(settings.root, 'var', 'update.lock')); res.json({ currentVersion: current, latest: state.latest, @@ -87,6 +96,10 @@ export const expressCreateServer = ( tier: settings.updates.tier, policy, vulnerableBelow: state.vulnerableBelow, + // PR 2 additions: + execution: state.execution, + lastResult: state.lastResult, + lockHeld, }); })); diff --git a/src/node/server.ts b/src/node/server.ts index 2e06cf6f26a..bef6af07017 100755 --- a/src/node/server.ts +++ b/src/node/server.ts @@ -177,6 +177,17 @@ exports.start = async () => { // @ts-ignore startDoneGate.resolve(); + // Once the server is RUNNING, /health responds 200 — that is the implicit + // health signal the updater's pending-verification timer is waiting for. + // Wrapped in try/catch because it must never block startup on a bug here. + try { + // eslint-disable-next-line @typescript-eslint/no-var-requires + const updater = require('./updater'); + if (typeof updater.markBootHealthy === 'function') updater.markBootHealthy(); + } catch (err) { + logger.debug(`markBootHealthy: ${(err as Error).message}`); + } + // Return the HTTP server to make it easier to write tests. return express.server; }; diff --git a/src/node/updater/UpdatePolicy.ts b/src/node/updater/UpdatePolicy.ts index ed00229da8e..c9ace999690 100644 --- a/src/node/updater/UpdatePolicy.ts +++ b/src/node/updater/UpdatePolicy.ts @@ -10,14 +10,27 @@ export interface PolicyInput { tier: Tier; current: string; latest: string; + /** + * Optional execution-status hint. Only `rollback-failed` materially changes + * policy: while it's set, canAuto / canAutonomous are denied (an admin must + * acknowledge first). canManual stays on because clicking Apply *is* the + * intervention the terminal state requires. 
+ */ + executionStatus?: string; } /** - * Decide which update tiers are allowed under the given (installMethod, tier, current, latest). - * Pure function — no I/O. The single source of truth for "what's allowed in this environment." - * `reason` is one of: 'tier-off' | 'up-to-date' | 'install-method-not-writable' | 'ok'. + * Decide which update tiers are allowed under the given (installMethod, tier, + * current, latest, executionStatus). Pure function — no I/O. The single source + * of truth for "what's allowed in this environment." + * + * `reason` is one of: + * 'tier-off' | 'up-to-date' | 'install-method-not-writable' + * | 'rollback-failed-terminal' | 'ok'. */ -export const evaluatePolicy = ({installMethod, tier, current, latest}: PolicyInput): PolicyResult => { +export const evaluatePolicy = ({ + installMethod, tier, current, latest, executionStatus, +}: PolicyInput): PolicyResult => { if (tier === 'off') { return {canNotify: false, canManual: false, canAuto: false, canAutonomous: false, reason: 'tier-off'}; } @@ -32,11 +45,12 @@ export const evaluatePolicy = ({installMethod, tier, current, latest}: PolicyInp return {canNotify, canManual: false, canAuto: false, canAutonomous: false, reason: 'install-method-not-writable'}; } + const terminal = executionStatus === 'rollback-failed'; return { canNotify, canManual: tier === 'manual' || tier === 'auto' || tier === 'autonomous', - canAuto: tier === 'auto' || tier === 'autonomous', - canAutonomous: tier === 'autonomous', - reason: 'ok', + canAuto: !terminal && (tier === 'auto' || tier === 'autonomous'), + canAutonomous: !terminal && tier === 'autonomous', + reason: terminal ? 
'rollback-failed-terminal' : 'ok', }; }; diff --git a/src/node/updater/index.ts b/src/node/updater/index.ts index 475c0599231..22e08042de5 100644 --- a/src/node/updater/index.ts +++ b/src/node/updater/index.ts @@ -1,4 +1,6 @@ import path from 'node:path'; +import {spawn} from 'node:child_process'; +import fs from 'node:fs/promises'; import log4js from 'log4js'; import settings, {getEpVersion} from '../utils/Settings'; import {detectInstallMethod} from './InstallMethodDetector'; @@ -7,6 +9,8 @@ import {loadState, saveState} from './state'; import {isMajorBehind, isVulnerable} from './versionCompare'; import {evaluatePolicy} from './UpdatePolicy'; import {decideEmails} from './Notifier'; +import {checkPendingVerification, CheckResult, RollbackDeps} from './RollbackHandler'; +import type {SpawnFn} from './UpdateExecutor'; import {InstallMethod, UpdateState} from './types'; const logger = log4js.getLogger('updater'); @@ -16,6 +20,7 @@ let timer: NodeJS.Timeout | null = null; let initialTimer: NodeJS.Timeout | null = null; let checkInFlight = false; let inMemoryState: UpdateState | null = null; +let pendingVerification: CheckResult | null = null; export const stateFilePath = () => path.join(settings.root, 'var', 'update-state.json'); @@ -126,6 +131,21 @@ const startPolling = (): void => { initialTimer = setTimeout(() => { initialTimer = null; void performCheck(); }, 5000); }; +/** Build the dependency bundle RollbackHandler / UpdateExecutor expect. 
*/ +export const getRollbackDeps = (): RollbackDeps => ({ + repoDir: settings.root, + backupDir: path.join(settings.root, 'var', 'update-backup'), + spawnFn: spawn as unknown as SpawnFn, + copyFile: async (src: string, dst: string) => { + await fs.mkdir(path.dirname(dst), {recursive: true}); + await fs.copyFile(src, dst); + }, + saveState: (s: UpdateState) => saveState(stateFilePath(), s), + exit: (code: number) => process.exit(code), + now: () => new Date(), + rollbackHealthCheckSeconds: Number(settings.updates.rollbackHealthCheckSeconds) || 60, +}); + /** Hook entry point — called by ep.json on createServer. */ export const expressCreateServer = async (): Promise => { detectedMethod = await detectInstallMethod({ @@ -133,9 +153,29 @@ export const expressCreateServer = async (): Promise => { rootDir: settings.root, }); logger.info(`updater: install method = ${detectedMethod}, tier = ${settings.updates.tier}`); + + // Tier 2: if the previous boot left the state in pending-verification, arm + // the health-check timer (or force rollback when bootCount has climbed past + // the crash-loop threshold). This must run BEFORE polling starts so the + // rollback can fire even if the version checker is misconfigured. + const state = await getCurrentState(); + pendingVerification = checkPendingVerification(state, getRollbackDeps()); + if (settings.updates.tier !== 'off') startPolling(); }; +/** + * Called by the Etherpad runtime once the express stack is fully wired and + * /health responds — that's the implicit health signal the + * pending-verification timer is waiting for. + */ +export const markBootHealthy = (): void => { + if (pendingVerification) { + pendingVerification.markVerified(); + pendingVerification = null; + } +}; + /** Shutdown hook. 
*/ export const shutdown = async (): Promise => { if (timer) { clearInterval(timer); timer = null; } diff --git a/src/tests/backend-new/specs/updater/UpdatePolicy.test.ts b/src/tests/backend-new/specs/updater/UpdatePolicy.test.ts index 6dfd0f95451..3eb74ef01bf 100644 --- a/src/tests/backend-new/specs/updater/UpdatePolicy.test.ts +++ b/src/tests/backend-new/specs/updater/UpdatePolicy.test.ts @@ -62,3 +62,34 @@ describe('evaluatePolicy', () => { expect(r.reason).toBe('up-to-date'); }); }); + +describe('evaluatePolicy terminal-state gating', () => { + it('rollback-failed denies auto/autonomous but keeps manual on', () => { + const r = evaluatePolicy({ + ...baseInput, tier: 'autonomous', + executionStatus: 'rollback-failed', + }); + expect(r.canNotify).toBe(true); + expect(r.canManual).toBe(true); + expect(r.canAuto).toBe(false); + expect(r.canAutonomous).toBe(false); + expect(r.reason).toBe('rollback-failed-terminal'); + }); + + it('idle execution behaves identically to no-status', () => { + const r = evaluatePolicy({...baseInput, tier: 'autonomous', executionStatus: 'idle'}); + expect(r.canManual).toBe(true); + expect(r.canAuto).toBe(true); + expect(r.canAutonomous).toBe(true); + expect(r.reason).toBe('ok'); + }); + + it('preflight-failed does NOT block manual / auto (it is informational only)', () => { + const r = evaluatePolicy({ + ...baseInput, tier: 'autonomous', executionStatus: 'preflight-failed', + }); + expect(r.canManual).toBe(true); + expect(r.canAuto).toBe(true); + expect(r.canAutonomous).toBe(true); + }); +}); From 11dd991eb0c1c7762c472837b51d192af6cb8731 Mon Sep 17 00:00:00 2001 From: John McLear Date: Fri, 8 May 2026 13:03:51 +0100 Subject: [PATCH 11/21] feat(updater): apply / cancel / acknowledge / log endpoints Strict admin-only POSTs that drive Tier 2's manual-click flow: - POST /admin/update/apply: acquire lock, persist preflight, run preflight, drain $drainSeconds, executeUpdate (which exits 75 on success), or run performRollback on a failure path 
(also exits 75). - POST /admin/update/cancel: cancel a pre-execute drain/preflight, write cancelled lastResult, release lock. - POST /admin/update/acknowledge: clear terminal states (preflight-failed, rolled-back, rollback-failed) back to idle. lastResult is preserved so the admin still sees what happened. - GET /admin/update/log: tail var/log/update.log (200 lines) for the in-progress UI. Strict admin auth. Also: - socketio hook exports getIo() so the apply endpoint can broadcast the drain shoutMessage outside the regular hook surface. - ep.json registers updateActions after admin/updateStatus. - 11 mocha integration tests cover auth, policy denial, execution-busy, acknowledge-clears-terminal, log content-type. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ep.json | 7 + src/node/hooks/express/socketio.ts | 3 + src/node/hooks/express/updateActions.ts | 311 +++++++++++++++++++++++ src/tests/backend/specs/updateActions.ts | 184 ++++++++++++++ 4 files changed, 505 insertions(+) create mode 100644 src/node/hooks/express/updateActions.ts create mode 100644 src/tests/backend/specs/updateActions.ts diff --git a/src/ep.json b/src/ep.json index bf90c52d43b..08fc734fe71 100644 --- a/src/ep.json +++ b/src/ep.json @@ -116,6 +116,13 @@ "expressCreateServer": "ep_etherpad-lite/node/hooks/express/updateStatus" } }, + { + "name": "updateActions", + "post": ["ep_etherpad-lite/admin"], + "hooks": { + "expressCreateServer": "ep_etherpad-lite/node/hooks/express/updateActions" + } + }, { "name": "admin", "hooks": { diff --git a/src/node/hooks/express/socketio.ts b/src/node/hooks/express/socketio.ts index 9184eff8831..79ef892760b 100644 --- a/src/node/hooks/express/socketio.ts +++ b/src/node/hooks/express/socketio.ts @@ -14,6 +14,9 @@ const padMessageHandler = require('../../handler/PadMessageHandler'); let io:any; const logger = log4js.getLogger('socket.io'); + +/** Returns the socket.io Server once expressCreateServer has run, or null otherwise.
Used by features that need to broadcast outside the regular hook surface. */ +export const getIo = (): any => io; const sockets = new Set(); const socketsEvents = new events.EventEmitter(); diff --git a/src/node/hooks/express/updateActions.ts b/src/node/hooks/express/updateActions.ts new file mode 100644 index 00000000000..cd1c1579bca --- /dev/null +++ b/src/node/hooks/express/updateActions.ts @@ -0,0 +1,311 @@ +'use strict'; + +import path from 'node:path'; +import fs from 'node:fs/promises'; +import {spawn} from 'node:child_process'; +import log4js from 'log4js'; +import {ArgsExpressType} from '../../types/ArgsExpressType'; +import settings, {getEpVersion} from '../../utils/Settings'; +import {getDetectedInstallMethod, stateFilePath, getRollbackDeps} from '../../updater'; +import {evaluatePolicy} from '../../updater/UpdatePolicy'; +import {loadState, saveState} from '../../updater/state'; +import {acquireLock, releaseLock} from '../../updater/lock'; +import {executeUpdate, SpawnFn} from '../../updater/UpdateExecutor'; +import {createDrainer, DrainBroadcastKey, Drainer} from '../../updater/SessionDrainer'; +import {runPreflight} from '../../updater/preflight'; +import {verifyReleaseTag} from '../../updater/trustedKeys'; +import {tailLines, appendLine} from '../../updater/updateLog'; +import {performRollback} from '../../updater/RollbackHandler'; +import {UpdateState} from '../../updater/types'; +import {getIo} from './socketio'; + +const logger = log4js.getLogger('updater'); + +const lockPath = (): string => path.join(settings.root, 'var', 'update.lock'); +const logPath = (): string => path.join(settings.root, 'var', 'log', 'update.log'); +const backupDir = (): string => path.join(settings.root, 'var', 'update-backup'); + +let drainer: Drainer | null = null; + +const requireAdmin = (req: any, res: any): boolean => { + const u = req.session?.user; + if (!u) { res.status(401).send('Authentication required'); return false; } + if (!u.is_admin) { 
res.status(403).send('Forbidden'); return false; } + return true; +}; + +const wrapAsync = + (fn: (req: any, res: any, next: Function) => Promise) => + (req: any, res: any, next: Function) => Promise.resolve(fn(req, res, next)).catch((err) => next(err)); + +const broadcastShout = (key: DrainBroadcastKey, values: Record): void => { + try { + const io = getIo(); + if (!io) return; + const message = { + type: 'COLLABROOM', + data: { + type: 'shoutMessage', + payload: { + // i18n key + values are picked up by the pad-side renderer (Task 14). + message: {message: key, values, sticky: false}, + timestamp: Date.now(), + }, + }, + }; + io.sockets.emit('shout', message); + } catch (err) { + logger.warn(`broadcastShout: ${(err as Error).message}`); + } +}; + +const buildPreflightDeps = (installMethod: ReturnType) => ({ + installMethod, + workingTreeClean: () => new Promise((resolve) => { + const c = spawn('git', ['status', '--porcelain'], {cwd: settings.root}); + let out = ''; + c.stdout.on('data', (b) => { out += b.toString(); }); + c.on('close', () => resolve(out.trim().length === 0)); + c.on('error', () => resolve(false)); + }), + freeDiskMB: async (): Promise => { + try { + const s = await (fs as any).statfs?.(settings.root); + if (!s) return Number.POSITIVE_INFINITY; + return Math.floor((Number(s.bavail) * Number(s.bsize)) / (1024 * 1024)); + } catch { + // statfs unsupported on this platform — treat as "no constraint" rather than block. + return Number.POSITIVE_INFINITY; + } + }, + pnpmOnPath: () => new Promise((resolve) => { + const c = spawn('pnpm', ['--version'], {stdio: 'ignore'}); + c.on('close', (code) => resolve(code === 0)); + c.on('error', () => resolve(false)); + }), + // We just acquired the lock in the apply endpoint, so don't double-check it here. 
+ lockHeld: async () => false, + remoteHasTag: (tag: string) => new Promise((resolve) => { + const c = spawn('git', ['ls-remote', '--tags', 'origin', tag], + {cwd: settings.root, stdio: ['ignore', 'pipe', 'ignore']}); + let out = ''; + c.stdout.on('data', (b) => { out += b.toString(); }); + c.on('close', () => resolve(out.trim().length > 0)); + c.on('error', () => resolve(false)); + }), + verifyTag: () => verifyReleaseTag({ + tag: '', // overridden below — we close over targetTag + repoDir: settings.root, + requireSignature: settings.updates.requireSignature, + trustedKeysPath: settings.updates.trustedKeysPath, + }), +}); + +export const expressCreateServer = ( + _hookName: string, + {app}: ArgsExpressType, + cb: Function, +): void => { + if (settings.updates.tier === 'off') return cb(); + + app.post('/admin/update/apply', wrapAsync(async (req: any, res: any) => { + if (!requireAdmin(req, res)) return; + + const state = await loadState(stateFilePath()); + if (!state.latest) return res.status(409).json({error: 'no-known-latest'}); + + // Allowed entry statuses: idle / verified / preflight-failed / rolled-back. + // Anything else means an in-flight or terminal-needs-acknowledge state. + const allowedEntry = ['idle', 'verified', 'preflight-failed', 'rolled-back']; + if (!allowedEntry.includes(state.execution.status)) { + return res.status(409).json({error: `execution-busy:${state.execution.status}`}); + } + + const installMethod = getDetectedInstallMethod(); + const policy = evaluatePolicy({ + installMethod, + tier: settings.updates.tier, + current: getEpVersion(), + latest: state.latest.version, + executionStatus: state.execution.status, + }); + if (!policy.canManual) { + return res.status(409).json({error: 'policy-denied', reason: policy.reason}); + } + + if (!await acquireLock(lockPath())) { + return res.status(409).json({error: 'lock-held'}); + } + + const targetTag = state.latest.tag; + let cleanupLock = true; + + try { + // Persist preflight state. 
+ const startedAt = new Date().toISOString(); + const preState: UpdateState = { + ...state, + execution: {status: 'preflight', targetTag, startedAt}, + }; + await saveState(stateFilePath(), preState); + appendLine(logPath(), `[${startedAt}] PREFLIGHT target=${targetTag}`); + + const baseDeps = buildPreflightDeps(installMethod); + const pf = await runPreflight( + { + targetTag, + diskSpaceMinMB: Number(settings.updates.diskSpaceMinMB) || 500, + requireSignature: settings.updates.requireSignature, + trustedKeysPath: settings.updates.trustedKeysPath, + }, + { + ...baseDeps, + verifyTag: () => verifyReleaseTag({ + tag: targetTag, + repoDir: settings.root, + requireSignature: settings.updates.requireSignature, + trustedKeysPath: settings.updates.trustedKeysPath, + }), + }, + ); + + if (!pf.ok) { + const at = new Date().toISOString(); + await saveState(stateFilePath(), { + ...preState, + execution: {status: 'preflight-failed', targetTag, reason: pf.reason, at}, + lastResult: { + targetTag, fromSha: '', + outcome: 'preflight-failed', reason: pf.reason, at, + }, + }); + appendLine(logPath(), `[${at}] PREFLIGHT_FAILED ${pf.reason}`); + cleanupLock = true; + return res.status(409).json({error: 'preflight-failed', reason: pf.reason}); + } + + // Drain — respond 202 first so the UI starts polling /log without waiting. 
+ const drainSeconds = Number(settings.updates.drainSeconds) || 60; + drainer = createDrainer({ + drainSeconds, + broadcast: (key, values) => broadcastShout(key, values), + }); + const drainEndsAt = new Date(Date.now() + drainSeconds * 1000).toISOString(); + await saveState(stateFilePath(), { + ...preState, + execution: {status: 'draining', targetTag, drainEndsAt, startedAt: new Date().toISOString()}, + }); + appendLine(logPath(), `[${new Date().toISOString()}] DRAIN start drainSeconds=${drainSeconds}`); + + res.status(202).json({accepted: true, drainEndsAt}); + + const drainResult = await drainer.start(); + drainer = null; + if (drainResult.outcome === 'cancelled') { + // /admin/update/cancel already updated state and lastResult; just release the lock. + appendLine(logPath(), `[${new Date().toISOString()}] DRAIN cancelled by admin`); + return; + } + + // Re-load state right before the executor runs so anything the cancel + // endpoint or another concurrent handler wrote is honoured. + const fresh = await loadState(stateFilePath()); + + const r = await executeUpdate({ + repoDir: settings.root, + backupDir: backupDir(), + spawnFn: spawn as unknown as SpawnFn, + readSha: () => new Promise((resolve, reject) => { + const c = spawn('git', ['rev-parse', 'HEAD'], + {cwd: settings.root, stdio: ['ignore', 'pipe', 'ignore']}); + let out = ''; + c.stdout.on('data', (b) => { out += b.toString(); }); + c.on('close', (code) => code === 0 + ? resolve(out.trim()) + : reject(new Error(`git rev-parse exit ${code}`))); + c.on('error', reject); + }), + copyFile: async (src: string, dst: string) => { + await fs.mkdir(path.dirname(dst), {recursive: true}); + await fs.copyFile(src, dst); + }, + saveState: (s: UpdateState) => saveState(stateFilePath(), s), + initialState: fresh, + targetTag, + now: () => new Date(), + // executeUpdate calls exit on success (75) — that takes the process down, + // so anything after this is the failure path. 
+ exit: (code: number) => process.exit(code), + }); + + // Failure paths: executor returned without exiting, state is rolling-back. + if (r.outcome !== 'pending-verification') { + const after = await loadState(stateFilePath()); + if (after.execution.status === 'rolling-back') { + // performRollback will exit 75 on either success or terminal failure. + // We do not release the lock — exit takes the process down and the + // next-boot acquireLock reaps the stale PID. + cleanupLock = false; + await performRollback(after, getRollbackDeps()); + } + } + } catch (err) { + logger.error(`apply failed: ${(err as Error).stack || err}`); + appendLine(logPath(), `[${new Date().toISOString()}] APPLY_ERROR ${(err as Error).message}`); + if (!res.headersSent) res.status(500).json({error: 'internal'}); + } finally { + if (cleanupLock) { + try { await releaseLock(lockPath()); } + catch (err) { logger.warn(`releaseLock: ${(err as Error).message}`); } + } + } + })); + + app.post('/admin/update/cancel', wrapAsync(async (req: any, res: any) => { + if (!requireAdmin(req, res)) return; + const state = await loadState(stateFilePath()); + // Cancel is allowed only during pre-execute states. Once executing begins + // (filesystem mutated) we either complete or rollback — see spec section + // "Error handling" / state machine. + if (state.execution.status !== 'preflight' && state.execution.status !== 'draining') { + return res.status(409).json({error: 'not-cancellable', status: state.execution.status}); + } + if (drainer) drainer.cancel(); + const at = new Date().toISOString(); + await saveState(stateFilePath(), { + ...state, + execution: {status: 'idle'}, + lastResult: { + targetTag: (state.execution as {targetTag?: string}).targetTag ?? 
'', + fromSha: '', + outcome: 'cancelled', + reason: 'admin-cancelled', + at, + }, + }); + try { await releaseLock(lockPath()); } catch {/* noop */} + appendLine(logPath(), `[${at}] CANCEL by admin during status=${state.execution.status}`); + res.json({cancelled: true}); + })); + + app.post('/admin/update/acknowledge', wrapAsync(async (req: any, res: any) => { + if (!requireAdmin(req, res)) return; + const state = await loadState(stateFilePath()); + const terminal: ReadonlySet = new Set(['rollback-failed', 'preflight-failed', 'rolled-back']); + if (!terminal.has(state.execution.status)) { + return res.status(409).json({error: 'not-terminal', status: state.execution.status}); + } + await saveState(stateFilePath(), {...state, execution: {status: 'idle'}, bootCount: 0}); + appendLine(logPath(), `[${new Date().toISOString()}] ACKNOWLEDGE ${state.execution.status} -> idle`); + res.json({acknowledged: true}); + })); + + app.get('/admin/update/log', wrapAsync(async (req: any, res: any) => { + if (!requireAdmin(req, res)) return; + const lines = await tailLines(logPath(), 200); + res.set('Content-Type', 'text/plain; charset=utf-8'); + res.send(lines.join('\n')); + })); + + cb(); +}; diff --git a/src/tests/backend/specs/updateActions.ts b/src/tests/backend/specs/updateActions.ts new file mode 100644 index 00000000000..faeb24b874c --- /dev/null +++ b/src/tests/backend/specs/updateActions.ts @@ -0,0 +1,184 @@ +'use strict'; + +const assert = require('assert').strict; +const common = require('../common'); +const plugins = require('../../../static/js/pluginfw/plugin_defs'); +import settings from '../../../node/utils/Settings'; +import {saveState} from '../../../node/updater/state'; +import {EMPTY_STATE} from '../../../node/updater/types'; +import path from 'node:path'; + +const statePath = () => path.join(settings.root, 'var', 'update-state.json'); +const lockPath = () => path.join(settings.root, 'var', 'update.lock'); + +const authHookNames = ['preAuthorize', 'authenticate', 
'authorize']; +const failHookNames = ['preAuthzFailure', 'authnFailure', 'authzFailure', 'authFailure']; + +const installAdminAuth = () => { + for (const h of authHookNames.concat(failHookNames)) plugins.hooks[h] = []; + plugins.hooks.authenticate = [{ + hook_fn: (_n: string, ctx: any, cb: Function) => { + ctx.req.session.user = {is_admin: true}; + cb([true]); + }, + }]; + (settings as any).requireAuthentication = true; + (settings as any).requireAuthorization = false; + (settings as any).users = {admin: {password: 'admin-pw', is_admin: true}}; +}; + +describe(__filename, function () { + let agent: any; + const backups: Record = {}; + + before(async () => { agent = await common.init(); }); + + beforeEach(async () => { + backups.hooks = {}; + for (const n of authHookNames.concat(failHookNames)) backups.hooks[n] = plugins.hooks[n]; + backups.settings = {}; + for (const k of ['requireAuthentication', 'requireAuthorization', 'users']) { + backups.settings[k] = (settings as any)[k]; + } + // Seed a known "update available" state so apply has a target tag. + await saveState(statePath(), { + ...EMPTY_STATE, + latest: { + version: '99.0.0', tag: 'v99.0.0', body: 'release notes', + publishedAt: '2099-01-01T00:00:00Z', prerelease: false, + htmlUrl: 'https://example/r/v99.0.0', + }, + }); + // Ensure no stale lock from an earlier test. + try { require('node:fs').unlinkSync(lockPath()); } catch {/* noop */} + }); + + afterEach(() => { + Object.assign(plugins.hooks, backups.hooks); + Object.assign(settings, backups.settings); + }); + + describe('POST /admin/update/apply', function () { + it('rejects unauthenticated', async () => { + await agent.post('/admin/update/apply').expect(401); + }); + + it('rejects when policy denies (non-git install method)', async () => { + installAdminAuth(); + // Force the detector path: the boot detector ran with the real install + // method, but evaluatePolicy uses settings.updates.installMethod via the + // hook's getDetectedInstallMethod(). 
We can't easily flip that mid-test, + // so instead we set tier=off which also denies canManual. + const orig = settings.updates.tier; + settings.updates.tier = 'off'; + try { + await agent.post('/admin/update/apply') + .auth('admin', 'admin-pw') + .expect((r: any) => { + // tier=off removes the entire route registration, so we expect 404. + // tier !== off and policy.canManual=false would expect 409. Either is OK. + if (r.status !== 404 && r.status !== 409) { + throw new Error(`expected 404 or 409, got ${r.status}`); + } + }); + } finally { settings.updates.tier = orig; } + }); + + it('rejects when execution is already in flight (409)', async () => { + installAdminAuth(); + await saveState(statePath(), { + ...EMPTY_STATE, + latest: { + version: '99.0.0', tag: 'v99.0.0', body: '', publishedAt: '', + prerelease: false, htmlUrl: '', + }, + execution: { + status: 'executing', targetTag: 'v99.0.0', fromSha: 'x', + startedAt: '2026-05-08T00:00:00Z', + }, + }); + const r = await agent.post('/admin/update/apply') + .auth('admin', 'admin-pw') + .expect(409); + assert.match(r.body.error, /execution-busy/); + }); + }); + + describe('POST /admin/update/cancel', function () { + it('rejects unauthenticated', async () => { + await agent.post('/admin/update/cancel').expect(401); + }); + + it('returns 409 when nothing is in flight', async () => { + installAdminAuth(); + await agent.post('/admin/update/cancel').auth('admin', 'admin-pw').expect(409); + }); + }); + + describe('POST /admin/update/acknowledge', function () { + it('rejects unauthenticated', async () => { + await agent.post('/admin/update/acknowledge').expect(401); + }); + + it('clears a terminal rollback-failed state to idle', async () => { + installAdminAuth(); + await saveState(statePath(), { + ...EMPTY_STATE, + execution: { + status: 'rollback-failed', + reason: 'install-failed; rollback failed: pnpm exit 1', + targetTag: 'v99.0.0', fromSha: 'x', + at: '2026-05-08T00:00:00Z', + }, + lastResult: { + targetTag: 
'v99.0.0', fromSha: 'x', + outcome: 'rollback-failed', + reason: 'pnpm install failed', + at: '2026-05-08T00:00:00Z', + }, + }); + await agent.post('/admin/update/acknowledge') + .auth('admin', 'admin-pw').expect(200); + const status = await agent.get('/admin/update/status').expect(200); + assert.equal(status.body.execution.status, 'idle'); + // lastResult is preserved on acknowledge so the admin still sees what happened. + assert.equal(status.body.lastResult.outcome, 'rollback-failed'); + }); + + it('clears a preflight-failed state to idle', async () => { + installAdminAuth(); + await saveState(statePath(), { + ...EMPTY_STATE, + execution: { + status: 'preflight-failed', + targetTag: 'v99.0.0', + reason: 'low-disk-space', + at: '2026-05-08T00:00:00Z', + }, + }); + await agent.post('/admin/update/acknowledge') + .auth('admin', 'admin-pw').expect(200); + }); + + it('refuses to clear a non-terminal state (409)', async () => { + installAdminAuth(); + await saveState(statePath(), {...EMPTY_STATE}); + await agent.post('/admin/update/acknowledge') + .auth('admin', 'admin-pw').expect(409); + }); + }); + + describe('GET /admin/update/log', function () { + it('rejects unauthenticated', async () => { + await agent.get('/admin/update/log').expect(401); + }); + + it('returns a text body (possibly empty) for an admin', async () => { + installAdminAuth(); + const res = await agent.get('/admin/update/log') + .auth('admin', 'admin-pw').expect(200); + assert.equal(typeof res.text, 'string'); + assert.match(res.headers['content-type'], /text\/plain/); + }); + }); +}); From 719f1b4ba7c44b5f82c8019e61af6cf70a4c0d09 Mon Sep 17 00:00:00 2001 From: John McLear Date: Fri, 8 May 2026 13:05:55 +0100 Subject: [PATCH 12/21] feat(updater): admin UI Apply/Cancel/Acknowledge + live log stream UpdatePage renders the right action set based on execution.status: Apply when idle/verified and policy allows, Cancel during preflight/draining, Acknowledge on terminal preflight-failed / rolled-back / 
rollback-failed. While the executor is in flight (preflight/draining/executing/rolling-back) the page polls /admin/update/log + /admin/update/status once a second and shows the rolling tail; polling stops automatically when the run terminates. lastResult and policy denial reasons surface localised copy. Buttons disable themselves while a network round-trip is in flight to dodge double-clicks. New i18n keys live under update.page.{apply,cancel, acknowledge,log,execution,policy.*,last_result.*}, update.execution.*, update.banner.terminal.rollback-failed, and update.drain.{t60,t30,t10}. Co-Authored-By: Claude Opus 4.7 (1M context) --- admin/src/pages/UpdatePage.tsx | 132 +++++++++++++++++++++++++++------ admin/src/store/store.ts | 28 +++++++ src/locales/en.json | 28 +++++++ 3 files changed, 167 insertions(+), 21 deletions(-) diff --git a/admin/src/pages/UpdatePage.tsx b/admin/src/pages/UpdatePage.tsx index 0d669a446f3..8e9c3354884 100644 --- a/admin/src/pages/UpdatePage.tsx +++ b/admin/src/pages/UpdatePage.tsx @@ -9,37 +9,75 @@ type FetchState = | {kind: 'error', status: number} | {kind: 'ok'}; +const IN_FLIGHT_STATUSES = ['preflight', 'draining', 'executing', 'rolling-back']; + export const UpdatePage = () => { const {t} = useTranslation(); const us = useStore((s) => s.updateStatus); const setUpdateStatus = useStore((s) => s.setUpdateStatus); + const log = useStore((s) => s.updateLog); + const setLog = useStore((s) => s.setUpdateLog); // Self-fetch so the page renders an explicit state even if UpdateBanner's // best-effort fetch never landed (route returns 404 when tier=off, 401/403 // if requireAdminForStatus is set, or a transient network error). const [fetchState, setFetchState] = useState(us ? 
{kind: 'ok'} : {kind: 'loading'}); + const [actionInFlight, setActionInFlight] = useState(false); + + const refreshStatus = async () => { + try { + const r = await fetch('/admin/update/status', {credentials: 'same-origin'}); + if (r.ok) { + const data = await r.json(); + setUpdateStatus(data); + setFetchState({kind: 'ok'}); + } else if (r.status === 404) { + setFetchState({kind: 'disabled'}); + } else if (r.status === 401 || r.status === 403) { + setFetchState({kind: 'unauthorized'}); + } else { + setFetchState({kind: 'error', status: r.status}); + } + } catch { + setFetchState({kind: 'error', status: 0}); + } + }; useEffect(() => { let cancelled = false; - fetch('/admin/update/status', {credentials: 'same-origin'}) - .then(async (r) => { - if (cancelled) return; - if (r.ok) { - const data = await r.json(); - setUpdateStatus(data); - setFetchState({kind: 'ok'}); - } else if (r.status === 404) { - setFetchState({kind: 'disabled'}); - } else if (r.status === 401 || r.status === 403) { - setFetchState({kind: 'unauthorized'}); - } else { - setFetchState({kind: 'error', status: r.status}); - } - }) - .catch(() => { - if (!cancelled) setFetchState({kind: 'error', status: 0}); - }); + void refreshStatus().then(() => { if (cancelled) return; }); return () => { cancelled = true; }; - }, [setUpdateStatus]); + // eslint-disable-next-line react-hooks/exhaustive-deps + }, []); + + // Poll log + status while the executor is in flight, then stop. + const status = us?.execution?.status ?? 
'idle'; + const inFlight = IN_FLIGHT_STATUSES.includes(status); + useEffect(() => { + if (!inFlight) return; + let cancelled = false; + const tick = async () => { + if (cancelled) return; + try { + const lr = await fetch('/admin/update/log', {credentials: 'same-origin'}); + if (lr.ok) setLog(await lr.text()); + } catch {/* noop */} + await refreshStatus(); + if (!cancelled) setTimeout(tick, 1000); + }; + void tick(); + return () => { cancelled = true; }; + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [inFlight]); + + const post = async (path: string) => { + setActionInFlight(true); + try { + await fetch(path, {method: 'POST', credentials: 'same-origin'}); + await refreshStatus(); + } finally { + setActionInFlight(false); + } + }; if (fetchState.kind === 'loading') { return
{t('admin.loading', {defaultValue: 'Loading...'})}
; @@ -61,16 +99,22 @@ export const UpdatePage = () => { ); } if (fetchState.kind === 'error' || !us) { - const status = fetchState.kind === 'error' ? fetchState.status : 0; + const stat = fetchState.kind === 'error' ? fetchState.status : 0; return (

-

{t('update.page.error', {defaultValue: 'Could not load update status (status {{status}}).', status})}

+

{t('update.page.error', {defaultValue: 'Could not load update status (status {{status}}).', status: stat})}

); } const upToDate = !us.latest || us.currentVersion === us.latest.version; + const showApply = !!us.policy?.canManual + && (status === 'idle' || status === 'verified') + && !us.lockHeld + && !upToDate; + const showCancel = status === 'preflight' || status === 'draining'; + const showAcknowledge = status === 'preflight-failed' || status === 'rolled-back' || status === 'rollback-failed'; return (
@@ -86,7 +130,53 @@ export const UpdatePage = () => {
{us.installMethod}
{us.tier}
+
+
{t(`update.execution.${status}`, {defaultValue: status})}
+ + {us.lastResult && ( +

+ +

+ )} + + {us.policy && !us.policy.canManual && !upToDate && ( +

+ +

+ )} + +
+ {showApply && ( + + )} + {showCancel && ( + + )} + {showAcknowledge && ( + + )} +
+ + {inFlight && ( +
+

+
{log}
+
+ )} + {upToDate ? (

) : us.latest ? ( diff --git a/admin/src/store/store.ts b/admin/src/store/store.ts index f3748f47cd4..71c85b5036f 100644 --- a/admin/src/store/store.ts +++ b/admin/src/store/store.ts @@ -3,6 +3,26 @@ import {Socket} from "socket.io-client"; import {PadSearchResult} from "../utils/PadSearch.ts"; import {InstalledPlugin} from "../pages/Plugin.ts"; +export type Execution = + | {status: 'idle'} + | {status: 'preflight'; targetTag: string; startedAt: string} + | {status: 'preflight-failed'; targetTag: string; reason: string; at: string} + | {status: 'draining'; targetTag: string; drainEndsAt: string; startedAt: string} + | {status: 'executing'; targetTag: string; fromSha: string; startedAt: string} + | {status: 'pending-verification'; targetTag: string; fromSha: string; deadlineAt: string} + | {status: 'verified'; targetTag: string; verifiedAt: string} + | {status: 'rolling-back'; reason: string; targetTag: string; fromSha: string; at: string} + | {status: 'rolled-back'; reason: string; targetTag: string; restoredSha: string; at: string} + | {status: 'rollback-failed'; reason: string; targetTag: string; fromSha: string; at: string}; + +export type LastResult = null | { + targetTag: string; + fromSha: string; + outcome: 'verified' | 'rolled-back' | 'rollback-failed' | 'preflight-failed' | 'cancelled'; + reason: string | null; + at: string; +}; + export interface UpdateStatusPayload { currentVersion: string; latest: null | { @@ -18,6 +38,10 @@ export interface UpdateStatusPayload { tier: string; policy: null | {canNotify: boolean; canManual: boolean; canAuto: boolean; canAutonomous: boolean; reason: string}; vulnerableBelow: Array<{announcedBy: string; threshold: string}>; + // Tier 2 additions: + execution: Execution; + lastResult: LastResult; + lockHeld: boolean; } type ToastState = { @@ -45,6 +69,8 @@ type StoreState = { setInstalledPlugins: (plugins: InstalledPlugin[])=>void, updateStatus: UpdateStatusPayload | null, setUpdateStatus: (s: UpdateStatusPayload) => void, 
+ updateLog: string, + setUpdateLog: (log: string) => void, } @@ -70,4 +96,6 @@ export const useStore = create()((set) => ({ setInstalledPlugins: (plugins)=>set({installedPlugins: plugins}), updateStatus: null, setUpdateStatus: (s) => set({updateStatus: s}), + updateLog: '', + setUpdateLog: (log) => set({updateLog: log}), })); diff --git a/src/locales/en.json b/src/locales/en.json index a8602ad35e0..7b899881f0e 100644 --- a/src/locales/en.json +++ b/src/locales/en.json @@ -48,6 +48,34 @@ "update.page.up_to_date": "You are running the latest version.", "update.badge.severe": "Etherpad on this server is severely outdated. Tell your admin.", "update.badge.vulnerable": "Etherpad on this server is running a version with known security issues. Tell your admin.", + "update.page.apply": "Apply update", + "update.page.cancel": "Cancel", + "update.page.acknowledge": "Acknowledge", + "update.page.log": "Update log (last 200 lines)", + "update.page.execution": "Status", + "update.page.policy.install-method-not-writable": "Updates from the admin UI require a git install. Update via your package manager.", + "update.page.policy.rollback-failed-terminal": "A previous update failed and could not be rolled back. Press Acknowledge after the install is healthy to clear the lock.", + "update.page.policy.up-to-date": "You are running the latest version.", + "update.page.policy.tier-off": "Updates are disabled (updates.tier = \"off\").", + "update.page.last_result.verified": "Last update to {{tag}} verified.", + "update.page.last_result.rolled-back": "Last attempted update to {{tag}} rolled back: {{reason}}.", + "update.page.last_result.rollback-failed": "Last update attempt failed AND rollback failed: {{reason}}. 
Manual intervention required.", + "update.page.last_result.preflight-failed": "Last attempted update to {{tag}} failed preflight: {{reason}}.", + "update.page.last_result.cancelled": "Last attempted update to {{tag}} cancelled by admin.", + "update.execution.idle": "Idle", + "update.execution.preflight": "Pre-flight checks", + "update.execution.preflight-failed": "Pre-flight failed", + "update.execution.draining": "Draining sessions", + "update.execution.executing": "Updating...", + "update.execution.pending-verification": "Pending verification", + "update.execution.verified": "Verified", + "update.execution.rolling-back": "Rolling back", + "update.execution.rolled-back": "Rolled back", + "update.execution.rollback-failed": "Rollback failed", + "update.banner.terminal.rollback-failed": "An update attempt failed and could not be rolled back. Manual intervention required.", + "update.drain.t60": "Etherpad will restart in 60 seconds to apply an update.", + "update.drain.t30": "Etherpad will restart in 30 seconds to apply an update.", + "update.drain.t10": "Etherpad will restart in 10 seconds to apply an update.", "index.newPad": "New Pad", "index.settings": "Settings", From dd79dafa8fa67d5fd7fb359ec1cdb485fdb53496 Mon Sep 17 00:00:00 2001 From: John McLear Date: Fri, 8 May 2026 13:07:16 +0100 Subject: [PATCH 13/21] feat(updater): pad shoutMessage renders update.drain.* via html10n broadcastShout now sends {messageKey, values, sticky} so the existing pad-side shout pipeline can route through html10n.get(). The renderer gains a values pass-through to html10n.get() so update.drain.* strings can interpolate placeholders such as {{seconds}} (the shipped t60/t30/t10 strings currently hard-code the number of seconds and use no placeholder), and gives updater shouts a different gritter title (the update.banner.title localised string) so users know it's a system event rather than a generic admin message.
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/node/hooks/express/updateActions.ts | 6 ++++-- src/static/js/pad.ts | 12 +++++++++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/node/hooks/express/updateActions.ts b/src/node/hooks/express/updateActions.ts index cd1c1579bca..01dbe72f1eb 100644 --- a/src/node/hooks/express/updateActions.ts +++ b/src/node/hooks/express/updateActions.ts @@ -42,13 +42,15 @@ const broadcastShout = (key: DrainBroadcastKey, values: Record) try { const io = getIo(); if (!io) return; + // The pad-side renderer (src/static/js/pad.ts) already handles `messageKey` + // by routing through html10n.get(); we add a `values` field that the + // renderer interpolates into the localised string. const message = { type: 'COLLABROOM', data: { type: 'shoutMessage', payload: { - // i18n key + values are picked up by the pad-side renderer (Task 14). - message: {message: key, values, sticky: false}, + message: {messageKey: key, values, sticky: false}, timestamp: Date.now(), }, }, diff --git a/src/static/js/pad.ts b/src/static/js/pad.ts index 6070fb8944f..26234bfef17 100644 --- a/src/static/js/pad.ts +++ b/src/static/js/pad.ts @@ -401,13 +401,19 @@ const handshake = async () => { // gritter so the user doesn't see a confusing duplicate. if (typeof msgObj.messageKey === 'string' && msgObj.messageKey.startsWith('pad.deletionToken.')) return; - const text = msgObj.messageKey ? html10n.get(msgObj.messageKey) : msgObj.message; + // Updater drain announcements get their own title and dodge the generic + // "Admin message" framing so the user knows it's a system event. + const isUpdate = typeof msgObj.messageKey === 'string' + && msgObj.messageKey.startsWith('update.drain.'); + const text = msgObj.messageKey + ? html10n.get(msgObj.messageKey, msgObj.values || {}) + : msgObj.message; if (!text) return; const date = new Date(payload.timestamp); $.gritter.add({ - title: 'Admin message', + title: isUpdate ? 
html10n.get('update.banner.title') : 'Admin message', text: '[' + date.toLocaleTimeString() + ']: ' + text, - sticky: msgObj.sticky + sticky: !!msgObj.sticky }); } }) From db49a20b00a8667f83d4d624a18201675ca7ce44 Mon Sep 17 00:00:00 2001 From: John McLear Date: Fri, 8 May 2026 13:10:44 +0100 Subject: [PATCH 14/21] feat(updater): rollback uses git checkout -f + integration suite over tmp git repo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RollbackHandler now does a forced checkout of fromSha (git checkout -f) BEFORE overlaying the backup lockfile. Without -f, git refuses checkout when there are unstaged modifications to files it would overwrite — exactly the case after a partial executor run that mutated the working tree. With -f the partial mutation is discarded and the working tree returns to fromSha cleanly. The backup-lockfile copy is still done (belt-and-braces) but tolerates ENOENT since checkout already restored the right lockfile. The new integration suite at src/tests/backend/specs/updater-integration.ts exercises the full pipeline against a disposable git repo: happy path, install-fail rollback, build-fail rollback, crash-loop guard, and a target-sha-doesn't-exist rollback-failed terminal case. 5 mocha tests.
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/node/updater/RollbackHandler.ts | 23 +- .../backend/specs/updater-integration.ts | 265 ++++++++++++++++++ 2 files changed, 283 insertions(+), 5 deletions(-) create mode 100644 src/tests/backend/specs/updater-integration.ts diff --git a/src/node/updater/RollbackHandler.ts b/src/node/updater/RollbackHandler.ts index 59cb9b6d002..9cea0f359bf 100644 --- a/src/node/updater/RollbackHandler.ts +++ b/src/node/updater/RollbackHandler.ts @@ -91,18 +91,31 @@ export const performRollback = async (state: UpdateState, deps: RollbackDeps): P deps.exit(75); }; + // Force-checkout first so any partial mutation from the failed executor run + // (rewritten lockfile, half-installed modules) is discarded. -f overwrites + // tracked files from the target tree's index — without it, `git checkout` + // refuses when there are unstaged modifications to files it would replace. + const checkoutCode = await runStep( + deps.spawnFn, deps.repoDir, logPath, 'git', ['checkout', '-f', fromSha]); + if (checkoutCode !== 0) return failTerminal(`git checkout -f ${fromSha} exit ${checkoutCode}`); + + // Now overlay the backed-up lockfile on top. Belt-and-braces: a force + // checkout already restored the lockfile to the target SHA's version; the + // backup wins on the rare case where the running install had a hand-edited + // lockfile we want to preserve. try { await deps.copyFile( path.join(deps.backupDir, 'pnpm-lock.yaml'), path.join(deps.repoDir, 'pnpm-lock.yaml'), ); - } catch (err) { - return failTerminal(`copy lockfile: ${(err as Error).message}`); + } catch (err: any) { + // ENOENT on the backup is acceptable — the force checkout already + // restored the right lockfile from the index. 
+ if (err?.code !== 'ENOENT') { + return failTerminal(`copy lockfile: ${(err as Error).message}`); + } } - const checkoutCode = await runStep(deps.spawnFn, deps.repoDir, logPath, 'git', ['checkout', fromSha]); - if (checkoutCode !== 0) return failTerminal(`git checkout ${fromSha} exit ${checkoutCode}`); - const installCode = await runStep(deps.spawnFn, deps.repoDir, logPath, 'pnpm', ['install', '--frozen-lockfile']); if (installCode !== 0) return failTerminal(`pnpm install exit ${installCode}`); diff --git a/src/tests/backend/specs/updater-integration.ts b/src/tests/backend/specs/updater-integration.ts new file mode 100644 index 00000000000..e04a551c258 --- /dev/null +++ b/src/tests/backend/specs/updater-integration.ts @@ -0,0 +1,265 @@ +'use strict'; + +const assert = require('assert').strict; +import {execSync, spawn} from 'node:child_process'; +import fs from 'node:fs/promises'; +import os from 'node:os'; +import path from 'node:path'; +import {executeUpdate} from '../../../node/updater/UpdateExecutor'; +import {performRollback, checkPendingVerification} from '../../../node/updater/RollbackHandler'; +import {EMPTY_STATE, UpdateState} from '../../../node/updater/types'; + +const sh = (cmd: string, opts: any = {}) => + execSync(cmd, {stdio: 'pipe', ...opts}).toString().trim(); + +const buildTmpRepo = async (): Promise<{dir: string; v1Sha: string; v2Sha: string}> => { + const dir = await fs.mkdtemp(path.join(os.tmpdir(), 'updater-it-')); + sh('git init -b main', {cwd: dir}); + sh('git config user.email test@example.com', {cwd: dir}); + sh('git config user.name test', {cwd: dir}); + sh('git config commit.gpgsign false', {cwd: dir}); + sh('git config tag.gpgSign false', {cwd: dir}); + await fs.writeFile(path.join(dir, 'pnpm-lock.yaml'), 'lockfileVersion: x\n'); + sh('git add . 
&& git commit -m initial', {cwd: dir}); + sh('git tag v0.0.1', {cwd: dir}); + const v1Sha = sh('git rev-parse HEAD', {cwd: dir}); + await fs.writeFile(path.join(dir, 'pnpm-lock.yaml'), 'lockfileVersion: y\n'); + sh('git add . && git commit -m bump', {cwd: dir}); + sh('git tag v0.0.2', {cwd: dir}); + const v2Sha = sh('git rev-parse HEAD', {cwd: dir}); + // Reset to v1 — that's our "currently installed" version. + sh('git checkout v0.0.1', {cwd: dir}); + // Add a self-pointing origin so executor's git fetch works. + sh(`git remote add origin ${dir}`, {cwd: dir}); + // Pre-prime origin's tag list (git fetch from a local origin sees both). + return {dir, v1Sha, v2Sha}; +}; + +/** + * Spawn override: route every git ... call to the real binary, but stub pnpm + * to a controlled exit code. Lets tests assert "git fetch + checkout actually + * mutated the repo" without ever invoking pnpm install for real. + */ +const stubSpawn = (pnpmExits: Record) => + (cmd: string, args: string[], opts: any) => { + if (cmd === 'pnpm') { + const key = `pnpm ${args.join(' ')}`; + const exit = pnpmExits[key]; + if (exit === undefined) { + throw new Error(`Unexpected pnpm call in integration stub: ${key}`); + } + return { + stdout: {on: () => {}}, + stderr: {on: () => {}}, + on: (e: string, cb: any) => { if (e === 'close') setImmediate(() => cb(exit)); }, + }; + } + return spawn(cmd, args, opts); + }; + +describe(__filename, function () { + this.timeout(30_000); + + it('happy path: executes against tmp repo, lands on pending-verification, exits 75', async () => { + const {dir, v1Sha} = await buildTmpRepo(); + try { + const states: UpdateState[] = []; + let exitedWith: number | null = null; + const r = await executeUpdate({ + repoDir: dir, + backupDir: path.join(dir, 'var', 'update-backup'), + spawnFn: stubSpawn({ + 'pnpm install --frozen-lockfile': 0, + 'pnpm run build:ui': 0, + }) as any, + readSha: async () => sh('git rev-parse HEAD', {cwd: dir}), + copyFile: async (s, d) => { + await 
fs.mkdir(path.dirname(d), {recursive: true}); + await fs.copyFile(s, d); + }, + saveState: async (s) => { states.push(structuredClone(s)); }, + initialState: structuredClone(EMPTY_STATE), + targetTag: 'v0.0.2', + now: () => new Date(), + exit: (code) => { exitedWith = code; }, + }); + assert.equal(r.outcome, 'pending-verification'); + assert.equal(exitedWith, 75); + assert.equal(states.at(-1)!.execution.status, 'pending-verification'); + // Working tree is now on v0.0.2. + assert.equal(sh('git rev-parse HEAD', {cwd: dir}), sh('git rev-parse v0.0.2', {cwd: dir})); + // Backup has the v0.0.1-era lockfile. + const backup = await fs.readFile(path.join(dir, 'var', 'update-backup', 'pnpm-lock.yaml'), 'utf8'); + assert.match(backup, /lockfileVersion: x/); + // The fromSha recorded in state matches the v0.0.1 SHA. + assert.equal((states.at(-1)!.execution as {fromSha: string}).fromSha, v1Sha); + } finally { + await fs.rm(dir, {recursive: true, force: true}); + } + }); + + it('install failure rolls back to original SHA + lockfile', async () => { + const {dir, v1Sha} = await buildTmpRepo(); + try { + const states: UpdateState[] = []; + let exitedWith: number | null = null; + + // Phase 1: executor with failing install. + await executeUpdate({ + repoDir: dir, + backupDir: path.join(dir, 'var', 'update-backup'), + spawnFn: stubSpawn({'pnpm install --frozen-lockfile': 1}) as any, + readSha: async () => sh('git rev-parse HEAD', {cwd: dir}), + copyFile: async (s, d) => { + await fs.mkdir(path.dirname(d), {recursive: true}); + await fs.copyFile(s, d); + }, + saveState: async (s) => { states.push(structuredClone(s)); }, + initialState: structuredClone(EMPTY_STATE), + targetTag: 'v0.0.2', + now: () => new Date(), + exit: (c) => { exitedWith = c; }, + }); + assert.equal(states.at(-1)!.execution.status, 'rolling-back'); + + // Phase 2: rollback. 
+ await performRollback(states.at(-1)!, { + repoDir: dir, + backupDir: path.join(dir, 'var', 'update-backup'), + spawnFn: stubSpawn({'pnpm install --frozen-lockfile': 0}) as any, + copyFile: (s, d) => fs.copyFile(s, d), + saveState: async (s) => { states.push(structuredClone(s)); }, + exit: (c) => { exitedWith = c; }, + now: () => new Date(), + rollbackHealthCheckSeconds: 60, + }); + assert.equal(states.at(-1)!.execution.status, 'rolled-back'); + assert.equal(sh('git rev-parse HEAD', {cwd: dir}), v1Sha); + assert.equal(exitedWith, 75); + // Working tree's pnpm-lock.yaml was restored from backup. + const lock = await fs.readFile(path.join(dir, 'pnpm-lock.yaml'), 'utf8'); + assert.match(lock, /lockfileVersion: x/); + } finally { + await fs.rm(dir, {recursive: true, force: true}); + } + }); + + it('build failure rolls back to original SHA', async () => { + const {dir, v1Sha} = await buildTmpRepo(); + try { + const states: UpdateState[] = []; + + await executeUpdate({ + repoDir: dir, + backupDir: path.join(dir, 'var', 'update-backup'), + spawnFn: stubSpawn({ + 'pnpm install --frozen-lockfile': 0, + 'pnpm run build:ui': 1, + }) as any, + readSha: async () => sh('git rev-parse HEAD', {cwd: dir}), + copyFile: async (s, d) => { + await fs.mkdir(path.dirname(d), {recursive: true}); + await fs.copyFile(s, d); + }, + saveState: async (s) => { states.push(structuredClone(s)); }, + initialState: structuredClone(EMPTY_STATE), + targetTag: 'v0.0.2', + now: () => new Date(), + exit: () => {}, + }); + assert.equal(states.at(-1)!.execution.status, 'rolling-back'); + + await performRollback(states.at(-1)!, { + repoDir: dir, + backupDir: path.join(dir, 'var', 'update-backup'), + spawnFn: stubSpawn({'pnpm install --frozen-lockfile': 0}) as any, + copyFile: (s, d) => fs.copyFile(s, d), + saveState: async (s) => { states.push(structuredClone(s)); }, + exit: () => {}, + now: () => new Date(), + rollbackHealthCheckSeconds: 60, + }); + assert.equal(states.at(-1)!.execution.status, 
'rolled-back'); + assert.equal(sh('git rev-parse HEAD', {cwd: dir}), v1Sha); + } finally { + await fs.rm(dir, {recursive: true, force: true}); + } + }); + + it('crash-loop guard: bootCount=3 forces immediate rollback', async () => { + const {dir, v1Sha} = await buildTmpRepo(); + try { + // Simulate "post-update boot": working tree on v0.0.2, backup lockfile from v0.0.1 + // already in place, state is pending-verification with bootCount=3. + sh('git checkout v0.0.2', {cwd: dir}); + await fs.mkdir(path.join(dir, 'var', 'update-backup'), {recursive: true}); + // Backup the v0.0.1 lockfile content (we know v0.0.1's lockfile was 'x' from buildTmpRepo). + await fs.writeFile(path.join(dir, 'var', 'update-backup', 'pnpm-lock.yaml'), 'lockfileVersion: x\n'); + + const states: UpdateState[] = []; + let exitedWith: number | null = null; + const state: UpdateState = { + ...structuredClone(EMPTY_STATE), + execution: { + status: 'pending-verification', + targetTag: 'v0.0.2', + fromSha: v1Sha, + deadlineAt: '2026-05-08T10:00:00Z', + }, + bootCount: 3, + }; + const r = checkPendingVerification(state, { + repoDir: dir, + backupDir: path.join(dir, 'var', 'update-backup'), + spawnFn: stubSpawn({'pnpm install --frozen-lockfile': 0}) as any, + copyFile: (s, d) => fs.copyFile(s, d), + saveState: async (s) => { states.push(structuredClone(s)); }, + exit: (c) => { exitedWith = c; }, + now: () => new Date(), + rollbackHealthCheckSeconds: 60, + }); + assert.equal(r.armed, false); + // Wait for the fire-and-forget rollback to finish. 
+ await new Promise((resolve) => setTimeout(resolve, 250)); + assert.equal(states.at(-1)!.execution.status, 'rolled-back'); + assert.equal(sh('git rev-parse HEAD', {cwd: dir}), v1Sha); + assert.equal(exitedWith, 75); + } finally { + await fs.rm(dir, {recursive: true, force: true}); + } + }); + + it('rollback failure (target SHA does not exist) lands on terminal rollback-failed', async () => { + const {dir} = await buildTmpRepo(); + try { + const states: UpdateState[] = []; + let exitedWith: number | null = null; + const state: UpdateState = { + ...structuredClone(EMPTY_STATE), + execution: { + status: 'rolling-back', + reason: 'install-failed', + targetTag: 'v0.0.2', + // 40 hex chars but no such commit — git checkout -f will reject. + fromSha: '0000000000000000000000000000000000000000', + at: '2026-05-08T10:00:00Z', + }, + }; + await performRollback(state, { + repoDir: dir, + backupDir: path.join(dir, 'var', 'update-backup'), + spawnFn: stubSpawn({'pnpm install --frozen-lockfile': 0}) as any, + copyFile: (s, d) => fs.copyFile(s, d), + saveState: async (s) => { states.push(structuredClone(s)); }, + exit: (c) => { exitedWith = c; }, + now: () => new Date(), + rollbackHealthCheckSeconds: 60, + }); + assert.equal(states.at(-1)!.execution.status, 'rollback-failed'); + assert.equal(states.at(-1)!.lastResult!.outcome, 'rollback-failed'); + assert.equal(exitedWith, 75); + } finally { + await fs.rm(dir, {recursive: true, force: true}); + } + }); +}); From 6107c1e45a018beb75ba98c09814242be46a9b1d Mon Sep 17 00:00:00 2001 From: John McLear Date: Fri, 8 May 2026 13:11:36 +0100 Subject: [PATCH 15/21] test(updater): Playwright admin Apply / Cancel / Acknowledge flow Stubs /admin/update/status (and /admin/update/apply for the apply path) at the route level so we can assert UI transitions without actually running an update. Four scenarios: - Apply button POSTs and re-fetches status (>=2 status fetches total). 
- install-method-not-writable hides the button and shows localised denial copy. - rollback-failed terminal state shows the Acknowledge button and the "Manual intervention required" lastResult copy. - lockHeld=true hides Apply even when policy.canManual is on. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../admin-spec/update-page-actions.spec.ts | 108 ++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 src/tests/frontend-new/admin-spec/update-page-actions.spec.ts diff --git a/src/tests/frontend-new/admin-spec/update-page-actions.spec.ts b/src/tests/frontend-new/admin-spec/update-page-actions.spec.ts new file mode 100644 index 00000000000..e52e6539e71 --- /dev/null +++ b/src/tests/frontend-new/admin-spec/update-page-actions.spec.ts @@ -0,0 +1,108 @@ +import {expect, test} from '@playwright/test'; +import {loginToAdmin} from '../helper/adminhelper'; + +const baseStatus = { + currentVersion: '2.7.1', + latest: { + version: '2.7.2', + tag: 'v2.7.2', + body: 'release notes', + publishedAt: '2026-05-01T00:00:00Z', + prerelease: false, + htmlUrl: 'https://github.com/ether/etherpad/releases/tag/v2.7.2', + }, + lastCheckAt: '2026-05-08T00:00:00Z', + installMethod: 'git', + tier: 'manual', + policy: {canNotify: true, canManual: true, canAuto: false, canAutonomous: false, reason: 'ok'}, + vulnerableBelow: [], + execution: {status: 'idle'}, + lastResult: null, + lockHeld: false, +}; + +test.describe('admin update page actions', () => { + test.beforeEach(async ({page}) => { + await loginToAdmin(page, 'admin', 'changeme1'); + }); + + test('Apply button posts /admin/update/apply and re-fetches status', async ({page}) => { + let postedApply = false; + let statusFetches = 0; + await page.route('**/admin/update/status', async (route) => { + statusFetches += 1; + await route.fulfill({ + status: 200, + contentType: 'application/json', + body: JSON.stringify(baseStatus), + }); + }); + await page.route('**/admin/update/apply', async (route) => { + postedApply = 
true; + await route.fulfill({status: 202, contentType: 'application/json', body: JSON.stringify({accepted: true})}); + }); + + await page.goto('http://localhost:9001/admin/update'); + await expect(page.getByRole('button', {name: /apply update/i})).toBeVisible({timeout: 30000}); + + await page.getByRole('button', {name: /apply update/i}).click(); + await expect.poll(() => postedApply, {timeout: 15000}).toBe(true); + // After Apply, the page re-fetches status. Initial load = 1 fetch + Apply re-fetch >= 2. + await expect.poll(() => statusFetches, {timeout: 15000}).toBeGreaterThanOrEqual(2); + }); + + test('install-method-not-writable hides Apply and shows the policy-denial copy', async ({page}) => { + const denied = { + ...baseStatus, + installMethod: 'docker', + policy: {canNotify: true, canManual: false, canAuto: false, canAutonomous: false, reason: 'install-method-not-writable'}, + }; + await page.route('**/admin/update/status', (route) => + route.fulfill({status: 200, contentType: 'application/json', body: JSON.stringify(denied)})); + + await page.goto('http://localhost:9001/admin/update'); + // Heading rendered; no Apply button. + await expect(page.getByRole('heading', {name: /etherpad updates/i})).toBeVisible({timeout: 30000}); + await expect(page.getByRole('button', {name: /apply update/i})).toHaveCount(0); + // Localised denial copy. 
+ await expect(page.getByText(/Updates from the admin UI require a git install/i)).toBeVisible(); + }); + + test('rollback-failed terminal state shows Acknowledge and lastResult copy', async ({page}) => { + const terminal = { + ...baseStatus, + execution: { + status: 'rollback-failed', + reason: 'pnpm install failed; rollback failed: pnpm exit 1', + targetTag: 'v2.7.2', + fromSha: 'abc', + at: '2026-05-08T00:00:00Z', + }, + lastResult: { + targetTag: 'v2.7.2', + fromSha: 'abc', + outcome: 'rollback-failed', + reason: 'pnpm install failed', + at: '2026-05-08T00:00:00Z', + }, + policy: {canNotify: true, canManual: true, canAuto: false, canAutonomous: false, reason: 'rollback-failed-terminal'}, + }; + await page.route('**/admin/update/status', (route) => + route.fulfill({status: 200, contentType: 'application/json', body: JSON.stringify(terminal)})); + + await page.goto('http://localhost:9001/admin/update'); + await expect(page.getByRole('button', {name: /acknowledge/i})).toBeVisible({timeout: 30000}); + // lastResult copy uses i18n update.page.last_result.rollback-failed. 
+ await expect(page.getByText(/Manual intervention required/i)).toBeVisible(); + }); + + test('lockHeld true hides the Apply button even when policy.canManual is on', async ({page}) => { + const locked = {...baseStatus, lockHeld: true}; + await page.route('**/admin/update/status', (route) => + route.fulfill({status: 200, contentType: 'application/json', body: JSON.stringify(locked)})); + + await page.goto('http://localhost:9001/admin/update'); + await expect(page.getByRole('heading', {name: /etherpad updates/i})).toBeVisible({timeout: 30000}); + await expect(page.getByRole('button', {name: /apply update/i})).toHaveCount(0); + }); +}); From a28f939e5641cd9ec52ff22d0bedcd2b736d3cdd Mon Sep 17 00:00:00 2001 From: John McLear Date: Fri, 8 May 2026 13:12:09 +0100 Subject: [PATCH 16/21] feat(updater): admin banner shows rollback-failed terminal alert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When execution.status === 'rollback-failed' the banner switches to a role=alert with the strong update.banner.terminal.rollback-failed copy and overrides the regular "update available" framing — an admin who left the system in this state needs to fix it before any other admin work matters. Other terminal states (preflight-failed, rolled-back) are informational and surface on the page itself, not the banner. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- admin/src/components/UpdateBanner.tsx | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/admin/src/components/UpdateBanner.tsx b/admin/src/components/UpdateBanner.tsx index 36f1faddc29..e69e89c2625 100644 --- a/admin/src/components/UpdateBanner.tsx +++ b/admin/src/components/UpdateBanner.tsx @@ -17,7 +17,21 @@ export const UpdateBanner = () => { return () => { cancelled = true; }; }, [setUpdateStatus]); - if (!updateStatus || !updateStatus.latest) return null; + if (!updateStatus) return null; + + // Terminal rollback-failed wins over the regular "update available" banner — + // an admin who left the system in this state needs to fix it before any + // other admin work matters. + if (updateStatus.execution?.status === 'rollback-failed') { + return ( +
+ {' '} + {t('update.banner.cta')} +
+ ); + } + + if (!updateStatus.latest) return null; if (updateStatus.currentVersion === updateStatus.latest.version) return null; return ( From a43f70708bc3e366e718ef9443e831bc62146457 Mon Sep 17 00:00:00 2001 From: John McLear Date: Fri, 8 May 2026 13:14:59 +0100 Subject: [PATCH 17/21] docs(updater): Tier 2 admin docs + manual smoke runbook + CHANGELOG doc/admin/updates.md gains a full Tier 2 section: prerequisites (git install + process supervisor with sample systemd unit), Apply flow with timings, every failure mode and the resulting state, the four endpoints, and the signature-verification opt-in. Settings table picks up the new updates.* knobs. docs/superpowers/specs/2026-04-25-auto-update-runbook.md is the manual smoke runbook the design spec calls for: disposable VM, systemd unit, every observable transition (happy path, install/ build-fail rollback, crash-loop guard, rollback-failed terminal, cancel during drain) plus a sign-off checklist for the release cut. CHANGELOG Unreleased section explains the supervisor requirement and points readers at the runbook. Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 12 ++ doc/admin/updates.md | 76 ++++++- .../specs/2026-04-25-auto-update-runbook.md | 202 ++++++++++++++++++ 3 files changed, 287 insertions(+), 3 deletions(-) create mode 100644 docs/superpowers/specs/2026-04-25-auto-update-runbook.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 8541a34b129..346603d950d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,15 @@ +# Unreleased + +### Notable enhancements + +- **Self-update subsystem — Tier 2 (manual click).** + - Admins on a git install can click "Apply update" at `/admin/update`. Etherpad runs a 60s session drain (with T-60 / T-30 / T-10 broadcasts to every pad), `git fetch / checkout / pnpm install --frozen-lockfile / pnpm run build:ui`, and exits with code 75 so a process supervisor restarts it on the new version. 
The next boot runs a 60s health check; if `/health` doesn't come up the previous SHA + lockfile are restored automatically. + - Crash-loop guard: if the new version reboots more than twice without the health check completing, RollbackHandler forces a rollback regardless of the timer. + - Terminal `rollback-failed` state surfaces a strong banner; the admin clicks Acknowledge once they've manually recovered to clear the lock and re-allow Tier 2 attempts. + - New settings under `updates.*`: `preApplyGraceMinutes`, `drainSeconds`, `rollbackHealthCheckSeconds`, `diskSpaceMinMB`, `requireSignature`, `trustedKeysPath`. Tag signature verification is opt-in (default `false`) — see `doc/admin/updates.md` for the keyring setup. + - **A process supervisor (systemd / pm2 / docker `--restart=unless-stopped`) is required to apply updates.** Without one, exit 75 leaves the instance down. + - Tiers 3 (auto with grace window) and 4 (autonomous in maintenance window) remain designed but unimplemented and will land in subsequent releases. + # 2.7.3 ### Breaking changes diff --git a/doc/admin/updates.md b/doc/admin/updates.md index 852912de3d9..053bbaeb8c4 100644 --- a/doc/admin/updates.md +++ b/doc/admin/updates.md @@ -1,8 +1,11 @@ # Etherpad updates -Etherpad ships with a built-in update subsystem. **Tier 1 (notify)** is enabled by default: a banner appears in the admin UI when a new release is available, and pad users see a discreet badge if the running version is severely outdated or flagged as vulnerable. No automatic execution happens at this tier — admins are simply informed. +Etherpad ships with a built-in update subsystem. -Tiers 2 (manual click), 3 (auto with grace window), and 4 (autonomous in maintenance window) are designed but not yet implemented. They will land in subsequent releases. +- **Tier 1 (notify)** — default. 
A banner appears in the admin UI when a new release is available, and pad users see a discreet badge if the running version is severely outdated or flagged as vulnerable. No execution. +- **Tier 2 (manual click)** — admins on a git install can click "Apply update" at `/admin/update`. Etherpad drains active sessions, runs `git fetch / checkout / pnpm install / pnpm run build:ui`, and exits with code 75 so a process supervisor restarts it on the new version. Auto-rolls back on failure. +- **Tier 3 (auto with grace window)** — designed, not yet implemented. +- **Tier 4 (autonomous in maintenance window)** — designed, not yet implemented. ## Settings @@ -17,7 +20,14 @@ In `settings.json`: "installMethod": "auto", "checkIntervalHours": 6, "githubRepo": "ether/etherpad", - "requireAdminForStatus": false + "requireAdminForStatus": false, + // Tier 2+ knobs (only meaningful at tier "manual" or higher): + "preApplyGraceMinutes": 0, + "drainSeconds": 60, + "rollbackHealthCheckSeconds": 60, + "diskSpaceMinMB": 500, + "requireSignature": false, + "trustedKeysPath": null }, "adminEmail": null } @@ -32,6 +42,12 @@ In `settings.json`: | `updates.checkIntervalHours` | `6` | How often to poll GitHub Releases. | | `updates.githubRepo` | `"ether/etherpad"` | Override for forks. | | `updates.requireAdminForStatus` | `false` | Lock the `/admin/update/status` endpoint to authenticated admin sessions. Default `false` matches existing Etherpad behavior — `/health` already exposes `releaseId` publicly, and changelog data comes from a public GitHub release. Set `true` to hide the full update payload from non-admins without disabling the updater (`tier: "off"` is the heavier opt-out that removes the endpoints entirely). | +| `updates.preApplyGraceMinutes` | `0` | **Tier 3 only.** Wait this many minutes between detecting a new release and starting the drain so the admin can cancel. Has no effect at tier `"manual"`. 
| +| `updates.drainSeconds` | `60` | How long to broadcast "restart imminent" announcements to active pads before exiting. T-60 / T-30 / T-10 broadcasts fire automatically at the matching offsets within this window. | +| `updates.rollbackHealthCheckSeconds` | `60` | After a fresh boot post-update, give `/health` this long to come up. If it doesn't, RollbackHandler restores the previous SHA. | +| `updates.diskSpaceMinMB` | `500` | Pre-flight refuses to start an update unless the install volume has at least this many MB free. | +| `updates.requireSignature` | `false` | When `true`, refuse updates whose tag is not signed by a trusted key. Verification is done via `git verify-tag <tag>` against the user's GPG keyring. Default `false` because Etherpad's release process does not yet sign tags consistently — turning the check on by default would block every Tier 2 update. Set `true` if you run your own builds or have imported a fork's keys. | +| `updates.trustedKeysPath` | `null` | Override the keyring location passed to `git verify-tag` via the `$GNUPGHOME` env var. Useful when the trusted keys live in a dedicated keyring outside the Etherpad user's home. Only meaningful when `requireSignature: true`. | | `adminEmail` | `null` | Top-level. Contact for admin notifications. Setting it enables the email nudges below. | ## What "outdated" means @@ -81,3 +97,57 @@ The version check sends no telemetry. Etherpad fetches the public GitHub Release Set the value explicitly if the heuristics get it wrong (e.g., a docker container that bind-mounts a writable git checkout). In PR 1 (notify only) the install method does not change behavior — every install method gets the banner. From PR 2 onward the install method gates whether the manual-click and automatic tiers can run; only `"git"` is initially supported for write tiers. + +## Tier 2 — manual click + +Tier 2 is opt-in.
To enable: set `updates.tier: "manual"` and ensure your install was deployed via git (not docker / npm / managed package). + +### Process supervisor is required + +Etherpad applies an update by **exiting with code 75** so a process supervisor restarts it. Without a supervisor the instance simply exits and stays down. Common supervisor setups: + +- **systemd:** add `Restart=on-failure` + `RestartSec=5` to your unit file. +- **pm2:** the default behaviour restarts on exit. +- **docker:** add `--restart=unless-stopped` (Tier 2 itself is not supported on docker installs anyway, but if you wrap your own image around a git checkout this applies). + +### What clicking "Apply update" does + +1. **Lock acquire** — `var/update.lock` (PID-based, stale locks reaped automatically). +2. **Pre-flight checks** — install method writable, working tree clean, free disk ≥ `diskSpaceMinMB`, `pnpm` on `PATH`, target tag exists at the configured remote, signature verifies (if `requireSignature: true`). On failure, state goes to `preflight-failed` with a typed reason; the admin sees a banner and clicks **Acknowledge** to clear it. No filesystem mutation has happened — nothing to roll back. +3. **Drain** — `drainSeconds` window during which T-60 / T-30 / T-10 announcements broadcast to every connected pad and new socket connections are refused. Click **Cancel** during this window to abort cleanly. +4. **Execute** — `git fetch --tags origin`, `git checkout <tag>`, `pnpm install --frozen-lockfile`, `pnpm run build:ui`. Output streams to `var/log/update.log` (rotated 10 MB × 5). +5. **Exit 75** — the supervisor restarts on the new version. +6. **Health check** — RollbackHandler arms a `rollbackHealthCheckSeconds` timer at boot. When `/health` responds 200 (i.e., Etherpad reaches the `RUNNING` state) the timer cancels and the state lands on `verified`.
+ +### Failure modes + +| What went wrong | Resulting state | Admin action | +| --- | --- | --- | +| Pre-flight check fails | `preflight-failed` | Click **Acknowledge** after fixing the underlying issue (free up disk, clean working tree, etc.). | +| `git fetch` / `git checkout` fails mid-flow | `rolled-back` | Informational. The working tree is back where it started; click **Acknowledge** to clear. | +| `pnpm install` or `pnpm run build:ui` fails | `rolled-back` | Same as above. The lockfile and SHA are restored. | +| `/health` doesn't come up within `rollbackHealthCheckSeconds` | `rolled-back` | Same — RollbackHandler restores the previous SHA + lockfile and exits 75 again. | +| The new version crashes at boot more than twice (`bootCount > 2`) | `rolled-back` | Crash-loop guard kicks in regardless of the health-check timer. | +| Rollback itself fails (e.g., `pnpm install` errors restoring old lockfile) | `rollback-failed` | **Manual intervention required.** The admin banner switches to a strong red alert. Restore the install by hand, then click **Acknowledge** to clear the lock and re-allow Tier 2 attempts. | + +### Endpoints + +All Tier 2 endpoints require an authenticated admin session (`is_admin: true`) regardless of `requireAdminForStatus`. + +- `POST /admin/update/apply` — start an apply. Returns `202 {accepted, drainEndsAt}` once the drain begins. Body unused. +- `POST /admin/update/cancel` — cancel during pre-flight or drain. Returns `409` once the executor has begun mutating the filesystem (state machine guarantees we either complete or roll back from there). +- `POST /admin/update/acknowledge` — clear a terminal `preflight-failed` / `rolled-back` / `rollback-failed` state back to `idle`. +- `GET /admin/update/log` — tail the last 200 lines of `var/log/update.log`. Plain text. Used by the in-progress UI. + +### Signature verification + +Default off. 
Etherpad releases are not yet consistently signed; turning verification on by default would block every Tier 2 update. To enable: + +```jsonc +"updates": { + "requireSignature": true, + "trustedKeysPath": "/srv/etherpad/keys" // optional — defaults to the OS user keyring +} +``` + +The check shells out to `git verify-tag <tag>`. The keyring at `trustedKeysPath` is passed to git via `GNUPGHOME`. If `trustedKeysPath` is `null` (default), the OS user's default keyring is used. diff --git a/docs/superpowers/specs/2026-04-25-auto-update-runbook.md b/docs/superpowers/specs/2026-04-25-auto-update-runbook.md new file mode 100644 index 00000000000..84b14ff3772 --- /dev/null +++ b/docs/superpowers/specs/2026-04-25-auto-update-runbook.md @@ -0,0 +1,202 @@ +# Etherpad Auto-Update — Manual Smoke Runbook + +**Status:** required gate before each tier ships, per `2026-04-25-auto-update-design.md` § "Phased rollout". +**Audience:** the engineer cutting a release that includes new updater code. +**Time budget:** ~30–40 minutes for the full sweep against a disposable VM. + +This runbook exercises the failure paths that unit and integration tests cannot reach: a real process supervisor, a real `pnpm install` run, real session drain broadcasts to a real pad client. Run it on a throw-away VM you don't mind nuking. + +## 0. Provision a disposable VM + +Anything Linux works; the example below uses Debian/Ubuntu under systemd. + +```bash +# On the VM +sudo adduser --system --group --home /srv/etherpad --shell /bin/bash etherpad +sudo apt update && sudo apt install -y git nodejs ca-certificates +# Etherpad's pnpm comes from corepack — Node 22+ ships it. +sudo -u etherpad bash -c ' + cd /srv/etherpad + git clone https://github.com/ether/etherpad.git current + cd current + corepack enable && corepack prepare pnpm@latest-9 --activate + pnpm install + pnpm run build:ui +' +``` + +## 1.
Install Etherpad as a systemd service + +`/etc/systemd/system/etherpad.service`: + +```ini +[Unit] +Description=Etherpad +After=network.target + +[Service] +Type=simple +User=etherpad +WorkingDirectory=/srv/etherpad/current +ExecStart=/usr/bin/pnpm run dev +Restart=on-failure +RestartSec=5 +RestartForceExitStatus=75 +# Exit 75 is the updater's "restart me" signal, so always restart on it. Do not list 75 in SuccessExitStatus=: Restart=on-failure skips clean exits. + +[Install] +WantedBy=multi-user.target +``` + +Then: + +```bash +sudo systemctl daemon-reload +sudo systemctl enable --now etherpad +journalctl -u etherpad -f & # tail the log in another terminal +``` + +## 2. Configure for Tier 2 + +Edit `/srv/etherpad/current/settings.json` and set: + +```jsonc +{ + "updates": { + "tier": "manual", + "checkIntervalHours": 1, + "drainSeconds": 30, // shorten the wait during smoke testing + "rollbackHealthCheckSeconds": 30 + } +} +``` + +`sudo systemctl restart etherpad`. Visit `http://<vm-ip>:9001/admin/update` and log in as the admin user from `settings.json`. + +## 3. Force "an update is available" + +The simplest way: `git checkout` to a commit *before* a tagged release. + +```bash +sudo -u etherpad bash -c 'cd /srv/etherpad/current && git checkout v2.7.2' +sudo systemctl restart etherpad +``` + +Trigger an immediate version check (or wait an hour): + +```bash +curl -fsSL http://localhost:9001/admin/update/status | jq . +# Expect: latest.version newer than currentVersion, policy.canManual=true +``` + +The admin UI banner should now read **"Update available"**, and `/admin/update` should show an **"Apply update"** button. + +## 4. Happy path: apply, drain, restart, verify + +1. Open a pad in another browser tab (`http://<vm-ip>:9001/p/test`). +2. Click **Apply update** on `/admin/update`. +3. **Within 30 seconds** confirm: + - The pad shows a gritter notification "Etherpad will restart in 30 seconds…" (i18n string from `update.drain.t30`), then `update.drain.t10`. + - The page polls `/admin/update/log`; the `<pre>` block fills with `git fetch / checkout / pnpm install / pnpm run build:ui` output.
+4. systemd journal shows `update executed: <fromSha> -> <targetTag>; exiting 75 for supervisor restart`.
+5. systemd restarts the unit (~5s under `RestartSec`).
+6. Reload `/admin/update`. State should be **`verified`** with `lastResult.outcome: "verified"`.
+
+**Sign-off:** every observable transition matches the state machine in the design spec § "State machine". If any step lingers or the page shows a different status, capture `var/log/update.log` and stop.
+
+## 5. Rollback path: install failure
+
+Force a rollback by giving pnpm something it can't resolve.
+
+```bash
+# As etherpad user, in /srv/etherpad/current:
+git checkout v2.7.2
+echo 'lockfileVersion: this-is-not-real-content' >> pnpm-lock.yaml
+sudo systemctl restart etherpad
+```
+
+Visit `/admin/update` and click Apply.
+
+Expected:
+
+- Drain announcement on the pad as before.
+- Log shows `pnpm install --frozen-lockfile` exiting non-zero.
+- State goes through `rolling-back` → `rolled-back`.
+- After supervisor restart, `/admin/update` shows the **rolled-back** banner with `lastResult.reason` describing the install failure.
+- `git rev-parse HEAD` matches the pre-update SHA.
+- Click **Acknowledge** to clear the lastResult banner.
+
+## 6. Rollback path: build failure
+
+```bash
+git checkout v2.7.2
+# Break the build by introducing a syntax error:
+echo 'this is not valid TypeScript' >> src/static/js/pad.ts
+sudo systemctl restart etherpad   # confirm the broken tree still serves; we want apply to fail at build:ui, not at boot
+```
+
+Apply, observe `pnpm run build:ui` exit non-zero in the log, observe `rolling-back` → `rolled-back`. Working tree restored.
+
+Revert the syntax error before continuing.
+
+## 7. Crash-loop guard
+
+Force the new version to crash at boot more than twice. Easiest:
+
+```bash
+# As etherpad user:
+git checkout v2.7.2
+# Apply to v2.7.3, but during the apply window introduce a startup error:
+# (Edit src/node/server.ts in the v2.7.3 tag's worktree to throw immediately.)
+```
+
+Click Apply. The new boot crashes; systemd restarts; RollbackHandler increments `bootCount`. After three crashes, `bootCount > 2` triggers a forced rollback regardless of the health-check timer.
+
+Observe state lands on `rolled-back` with `reason: "health-check-failed-or-crash-loop"`. Working tree on the original SHA.
+
+## 8. Rollback-failed terminal state
+
+Hardest to set up; force `pnpm install` to fail on the rollback path too.
+
+```bash
+# Trigger a normal install-failed rollback (step 5), but BEFORE it runs the
+# rollback step, corrupt the backup lockfile:
+echo garbage > /srv/etherpad/current/var/update-backup/pnpm-lock.yaml
+# … or remove the etherpad user's permission to the install dir mid-flow.
+```
+
+Expected:
+
+- State lands on **`rollback-failed`**.
+- `/admin/update` shows the strong red banner (role=alert) with the
+  `update.banner.terminal.rollback-failed` copy.
+- `policy.canManual` stays true; `policy.canAuto` is false (terminal-blocked).
+- Manually fix the install (restore the lockfile, fix permissions), then
+  click **Acknowledge**. State returns to `idle` and Apply re-enables.
+
+## 9. Cancel during drain
+
+Click Apply. Within 30s, click Cancel.
+
+Expected:
+
+- Drain timers stop firing immediately.
+- State returns to `idle`.
+- `lastResult.outcome: "cancelled"`.
+- `var/update.lock` is gone.
+- No exit; systemd doesn't restart.
+
+## 10. Sign-off checklist
+
+Tick every line before approving the release that introduces this code:
+
+- [ ] Happy path lands on `verified` with the working tree on the new tag.
+- [ ] Install-fail and build-fail rollbacks restore the previous SHA.
+- [ ] Crash-loop guard forces rollback at `bootCount > 2`.
+- [ ] `rollback-failed` shows the strong banner and Acknowledge clears it.
+- [ ] Cancel during drain leaves no lock, returns to `idle`.
+- [ ] Pad client renders the localised drain announcement (NOT the literal i18n key).
+- [ ] systemd journal shows no unhandled rejections, no orphaned processes.
+- [ ] `var/log/update.log` is rotated when it crosses 10 MB (force this by writing >10 MB into the file and triggering an Apply).
+
+If any line is unticked, do not ship the release.

From 2b041ccdac1ccf2e935ef16fb1c851ec12fb87df Mon Sep 17 00:00:00 2001
From: John McLear 
Date: Fri, 8 May 2026 13:24:31 +0100
Subject: [PATCH 18/21] docs(updater): note docker-friendly update flows as
 follow-up work

Tier 2 refuses Apply on installMethod=docker because in-container
mutation doesn't survive a container restart. Adds a future-work note
covering the two reasonable paths for an in-product docker Apply
button (instructions-only vs deploy-webhook) and explicitly rules out
mounting /var/run/docker.sock as a footgun. Watchtower gets a pointer
for admins who want fully autonomous docker updates today.

Co-Authored-By: Claude Opus 4.7 (1M context) 
---
 doc/admin/updates.md | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/doc/admin/updates.md b/doc/admin/updates.md
index 053bbaeb8c4..ddafa889d26 100644
--- a/doc/admin/updates.md
+++ b/doc/admin/updates.md
@@ -151,3 +151,14 @@ Default off. Etherpad releases are not yet consistently signed; turning verifica
 ```
 
 The check shells out to `git verify-tag <tag>`. The keyring at `trustedKeysPath` is passed to git via `GNUPGHOME`. If `trustedKeysPath` is `null` (default), the OS user's default keyring is used.
+
+### Docker-friendly update flows (future work)
+
+Tier 2 deliberately refuses to apply on `installMethod: "docker"` because in-container `git fetch / pnpm install / build:ui` doesn't survive a container restart — the orchestrator brings the container back up on the same image tag and the work is lost. Docker installs stay on Tier 1 (banner + version status) for now.
+
+The right way to give docker admins an in-product Apply button is to delegate to the orchestrator rather than mutate the container. Two patterns to consider in a follow-up PR:
+
+- **Instructions-only.** When the page detects `installMethod: docker` *and* a newer release exists, swap the policy-denial copy for actionable instructions (`docker pull etherpad/etherpad:<tag>` for plain docker; `docker compose pull && docker compose up -d` for compose). Cheap, no new attack surface.
+- **Deploy webhook.** New setting `updates.dockerWebhook`. When set, the Apply button on a docker install POSTs to the configured URL and trusts the orchestrator (Render / Railway / Fly / Portainer / Coolify / GitHub Actions — they all expose redeploy webhooks) to do the actual pull-and-recreate.
+
+Direct Docker-socket access (mounting `/var/run/docker.sock` into the container) is **out of scope** — anyone who compromises the Etherpad process could use that socket to gain root on the host. Admins who want fully autonomous docker updates should run [Watchtower](https://containrrr.dev/watchtower/) alongside Etherpad rather than bake equivalent privilege into Etherpad itself.

From 41d4a422acdd60c1f51ba44bc00f0616b7487bfe Mon Sep 17 00:00:00 2001
From: John McLear 
Date: Fri, 8 May 2026 13:34:49 +0100
Subject: [PATCH 19/21] fix(updater): address Qodo review (1-6) + Playwright
 strict-mode CI fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. Tier 2 endpoints now gate on tier in {manual, auto, autonomous} —
   notify and off return 404 to match the prior PR-1 behaviour. Gate is
   evaluated per-request via app.use middleware so a settings.json reload
   takes effect without a full restart, and so integration tests can flip
   the tier dynamically. Adds a regression test that exercises 404 at
   tier=notify across all four endpoints.

2. cancel/apply race fixed: /admin/update/cancel no longer releases the
   lock — apply's finally block owns it for the request's lifetime. Apply
   now reloads state after preflight and aborts with 409 cancelled-during-
   preflight if execution.status is no longer 'preflight' for the same
   targetTag. Prevents a second apply from sneaking in while the first is
   still running its slow checks, and prevents the post-cancel apply from
   continuing into drain/execute.

3. SessionDrainer now restores acceptingConnections=true at drain
   completion (not just on cancel). The lock + persisted execution.status
   prevent a fresh apply from racing in — the in-memory flag was redundant
   safety that turned into a wedge if the executor threw post-drain. Adds
   a unit test asserting the flag is restored after natural drain end.

4. PadMessageHandler drain guard switched from socket.json.send (a
   socket.io v2/v3 API that may not exist on v4) to socket.emit('message',
   ...) for consistency with the other disconnect paths in the file.

5. Spawn 'error' handlers added to runStep helpers in UpdateExecutor and
   RollbackHandler, plus the gpg verify-tag spawn in trustedKeys. Without
   them, a missing/unexecutable binary leaves the promise hanging forever
   and the update flow stuck in-flight. SpawnFn type extended to allow
   on('error', ...) listeners cleanly. Spawn errors now resolve with code
   1 + the error message in stderr, so the existing failure-detection
   branches fire normally.

6. executeUpdate body wrapped in try/catch. An exception from readSha,
   saveState, copyFile, or any step now lands in a rolling-back persist +
   returns failed-checkout, so the route's post-executor rollback path
   picks it up. State can no longer wedge at 'executing'. The catch's
   inner saveState is itself try/wrapped so a write-after-write failure
   doesn't crash the route either.

CI: Playwright update-page-actions strict-mode violation fixed. Both the
banner and the lastResult <p>
contain "Manual intervention required"; selector now scopes to p.last-result-rollback-failed for the lastResult assertion specifically. 129 vitest unit tests + 23 mocha integration tests passing; ts-check clean. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/node/handler/PadMessageHandler.ts | 5 +- src/node/hooks/express/updateActions.ts | 42 +++- src/node/updater/RollbackHandler.ts | 15 +- src/node/updater/SessionDrainer.ts | 10 +- src/node/updater/UpdateExecutor.ts | 192 ++++++++++++------ src/node/updater/trustedKeys.ts | 17 +- .../specs/updater/SessionDrainer.test.ts | 10 + src/tests/backend/specs/updateActions.ts | 44 ++-- .../admin-spec/update-page-actions.spec.ts | 5 +- 9 files changed, 246 insertions(+), 94 deletions(-) diff --git a/src/node/handler/PadMessageHandler.ts b/src/node/handler/PadMessageHandler.ts index 3a77c972ba4..22f80ea8dc6 100644 --- a/src/node/handler/PadMessageHandler.ts +++ b/src/node/handler/PadMessageHandler.ts @@ -381,8 +381,11 @@ exports.handleMessage = async (socket:any, message: ClientVarMessage) => { // Refuse new joiners while the updater drainer is running. Existing sockets // are unaffected — only the initial CLIENT_READY handshake is gated. The // pad UI will show the drain announcement separately via shoutMessage. + // Use socket.emit('message', ...) for consistency with the other disconnect + // paths in this file (see line ~221, 569). socket.json.send is a socket.io + // v2/v3-era API that may not exist on v4 Socket objects. 
if (!isAcceptingConnections()) { - socket.json.send({disconnect: 'updateInProgress'}); + socket.emit('message', {disconnect: 'updateInProgress'}); socket.disconnect(true); return; } diff --git a/src/node/hooks/express/updateActions.ts b/src/node/hooks/express/updateActions.ts index 01dbe72f1eb..8e321e14add 100644 --- a/src/node/hooks/express/updateActions.ts +++ b/src/node/hooks/express/updateActions.ts @@ -103,12 +103,34 @@ const buildPreflightDeps = (installMethod: ReturnType = new Set(['manual', 'auto', 'autonomous']); +const tierAllowsActions = (): boolean => TIER2_TIERS.has(settings.updates.tier); + export const expressCreateServer = ( _hookName: string, {app}: ArgsExpressType, cb: Function, ): void => { - if (settings.updates.tier === 'off') return cb(); + // Always register the routes; gate at request time so a runtime tier change + // takes effect on the next request rather than requiring a restart. + // The early 404 below preserves Qodo #1's "disabled path matches prior + // behaviour (no Tier 2 endpoints existed before this PR)" requirement. + const tierGate = (req: any, res: any, next: Function) => { + if (!tierAllowsActions()) return res.status(404).send('Not found'); + next(); + }; + app.use(['/admin/update/apply', '/admin/update/cancel', '/admin/update/acknowledge', '/admin/update/log'], tierGate); app.post('/admin/update/apply', wrapAsync(async (req: any, res: any) => { if (!requireAdmin(req, res)) return; @@ -186,6 +208,19 @@ export const expressCreateServer = ( return res.status(409).json({error: 'preflight-failed', reason: pf.reason}); } + // Re-check state after preflight: /admin/update/cancel may have flipped + // execution back to 'idle' while we were running the slow checks. The + // cancel handler intentionally leaves the lock alone (we own it) and + // signals via state instead, so a stale apply can detect cancellation + // here before mutating the filesystem. 
+ const afterPreflight = await loadState(stateFilePath()); + if (afterPreflight.execution.status !== 'preflight' + || (afterPreflight.execution as {targetTag?: string}).targetTag !== targetTag) { + appendLine(logPath(), + `[${new Date().toISOString()}] APPLY aborted post-preflight (state=${afterPreflight.execution.status})`); + return res.status(409).json({error: 'cancelled-during-preflight'}); + } + // Drain — respond 202 first so the UI starts polling /log without waiting. const drainSeconds = Number(settings.updates.drainSeconds) || 60; drainer = createDrainer({ @@ -285,7 +320,10 @@ export const expressCreateServer = ( at, }, }); - try { await releaseLock(lockPath()); } catch {/* noop */} + // Intentionally do NOT release the lock here. The apply handler owns the + // lock for its lifetime and releases it in its finally block; releasing + // here would let a second apply slip in while the first is still mid- + // preflight, racing for the same on-disk state. appendLine(logPath(), `[${at}] CANCEL by admin during status=${state.execution.status}`); res.json({cancelled: true}); })); diff --git a/src/node/updater/RollbackHandler.ts b/src/node/updater/RollbackHandler.ts index 9cea0f359bf..7e24e95e382 100644 --- a/src/node/updater/RollbackHandler.ts +++ b/src/node/updater/RollbackHandler.ts @@ -27,6 +27,12 @@ const runStep = ( cmd: string, args: string[], ): Promise => new Promise((resolve) => { + let settled = false; + const settle = (c: number | null) => { + if (settled) return; + settled = true; + resolve(c); + }; const child = spawnFn(cmd, args, {cwd, stdio: ['ignore', 'pipe', 'pipe']}); const tag = `${cmd} ${args.join(' ')}`; child.stdout.on('data', (b: Buffer) => { @@ -39,7 +45,14 @@ const runStep = ( logger.warn(`[rollback ${tag}] ${t}`); appendLine(logPath, `[${new Date().toISOString()}] rollback ${tag} ERR | ${t}`); }); - child.on('close', (c) => resolve(c)); + // Spawn failures (binary missing, permissions) — without this listener the + // promise hangs 
forever and the rollback path never lands on terminal state. + child.on('error', (err: Error) => { + logger.error(`[rollback ${tag}] spawn error: ${err.message}`); + appendLine(logPath, `[${new Date().toISOString()}] rollback ${tag} SPAWN_ERR | ${err.message}`); + settle(1); + }); + child.on('close', (c) => settle(c)); }); /** diff --git a/src/node/updater/SessionDrainer.ts b/src/node/updater/SessionDrainer.ts index acddc512e61..4f266f8cb77 100644 --- a/src/node/updater/SessionDrainer.ts +++ b/src/node/updater/SessionDrainer.ts @@ -55,10 +55,12 @@ export const createDrainer = ({drainSeconds, broadcast}: DrainerOpts): Drainer = timers.push(setTimeout(() => fire('update.drain.t10', 10), Math.max(0, ms - 10_000))); timers.push(setTimeout(() => { if (cancelled) return; - // Don't restore acceptingConnections — the executor is about to exit 75 - // and the supervisor restart will reset module state. Leaving the flag - // off until exit means stragglers can't slip in between drain end and - // exit(). + // Restore the gate as soon as the drain window closes. The executor + // takes over from here and the supervisor restart wipes module state + // anyway; if the executor throws and the process keeps running, we + // want join handshakes to recover rather than stay wedged. + // The lock + state.execution.status guarantee no fresh apply can race. 
+ acceptingConnections = true; resolveDone?.({outcome: 'completed'}); resolveDone = null; }, ms)); diff --git a/src/node/updater/UpdateExecutor.ts b/src/node/updater/UpdateExecutor.ts index 85229ed4f91..a73b6e402e0 100644 --- a/src/node/updater/UpdateExecutor.ts +++ b/src/node/updater/UpdateExecutor.ts @@ -6,11 +6,16 @@ import {appendLine} from './updateLog'; const logger = log4js.getLogger('updater'); -export type SpawnFn = (cmd: string, args: string[], opts: SpawnOptions) => { +export interface SpawnedChild { stdout: {on: (event: 'data', cb: (chunk: Buffer) => void) => void}; stderr: {on: (event: 'data', cb: (chunk: Buffer) => void) => void}; - on: (event: 'close', cb: (code: number | null) => void) => void; -}; + on: { + (event: 'close', cb: (code: number | null) => void): void; + (event: 'error', cb: (err: Error) => void): void; + }; +} + +export type SpawnFn = (cmd: string, args: string[], opts: SpawnOptions) => SpawnedChild; export interface ExecutorDeps { /** Path of the on-disk Etherpad install (the git working tree). */ @@ -49,6 +54,12 @@ const runStep = ( args: string[], ): Promise<{code: number | null; stderr: string}> => new Promise((resolve) => { let stderr = ''; + let settled = false; + const settle = (v: {code: number | null; stderr: string}) => { + if (settled) return; + settled = true; + resolve(v); + }; const child = spawnFn(cmd, args, {cwd: repoDir, stdio: ['ignore', 'pipe', 'pipe']}); const tag = `${cmd} ${args.join(' ')}`; child.stdout.on('data', (chunk: Buffer) => { @@ -63,7 +74,16 @@ const runStep = ( logger.warn(`[${tag}] ${trimmed}`); appendLine(logPath, `[${new Date().toISOString()}] ${tag} ERR | ${trimmed}`); }); - child.on('close', (code) => resolve({code, stderr})); + // Spawn failures (binary missing, permissions) emit 'error' and never close. + // Without this listener the promise hangs forever and leaves state in-flight. 
+ // Treat as exit code 1 with the error message in stderr so the caller's + // failure-detection branch fires normally. + child.on('error', (err: Error) => { + logger.error(`[${tag}] spawn error: ${err.message}`); + appendLine(logPath, `[${new Date().toISOString()}] ${tag} SPAWN_ERR | ${err.message}`); + settle({code: 1, stderr: stderr + err.message}); + }); + child.on('close', (code) => settle({code, stderr})); }); /** @@ -74,79 +94,117 @@ const runStep = ( * persists, and returns. The route layer then runs RollbackHandler.performRollback. * The executor does NOT call `exit` on failure paths — the rollback path owns * that exit so we don't double-exit and lose log lines. + * + * On a thrown exception (e.g., copyFile EACCES, saveState ENOSPC) the executor + * also transitions to rolling-back with `failed-checkout` so the route's post- + * executor rollback path picks it up. The state must never get stuck at + * `executing` — if it does, no further updates can start until an admin + * acknowledges. */ export const executeUpdate = async (deps: ExecutorDeps): Promise => { - const fromSha = await deps.readSha(); const logPath = path.join(deps.repoDir, 'var', 'log', 'update.log'); + let fromSha = ''; - let s: UpdateState = { - ...deps.initialState, - execution: { - status: 'executing', - targetTag: deps.targetTag, - fromSha, - startedAt: deps.now().toISOString(), - }, - bootCount: 0, - }; - await deps.saveState(s); - - // Snapshot lockfile (SHA already captured above; the rollback handler reads - // execution.fromSha rather than a separate file so a successful rollback - // doesn't depend on /var staying writable past this point). 
- await deps.copyFile( - path.join(deps.repoDir, 'pnpm-lock.yaml'), - path.join(deps.backupDir, 'pnpm-lock.yaml'), - ); - - const fail = async ( - outcome: 'failed-install' | 'failed-build' | 'failed-checkout', - reason: string, - ): Promise => { + // Wrap the whole body so any throw — readSha, saveState, copyFile, even an + // unexpected synchronous error in a step — lands us at rolling-back rather + // than leaving execution stuck at 'executing' forever. + try { + fromSha = await deps.readSha(); + + let s: UpdateState = { + ...deps.initialState, + execution: { + status: 'executing', + targetTag: deps.targetTag, + fromSha, + startedAt: deps.now().toISOString(), + }, + bootCount: 0, + }; + await deps.saveState(s); + + // Snapshot lockfile (SHA already captured above; the rollback handler reads + // execution.fromSha rather than a separate file so a successful rollback + // doesn't depend on /var staying writable past this point). + await deps.copyFile( + path.join(deps.repoDir, 'pnpm-lock.yaml'), + path.join(deps.backupDir, 'pnpm-lock.yaml'), + ); + + const fail = async ( + outcome: 'failed-install' | 'failed-build' | 'failed-checkout', + reason: string, + ): Promise => { + s = { + ...s, + execution: { + status: 'rolling-back', + reason, + targetTag: deps.targetTag, + fromSha, + at: deps.now().toISOString(), + }, + }; + await deps.saveState(s); + logger.error(`update step failed (${outcome}): ${reason}`); + appendLine(logPath, `[${deps.now().toISOString()}] FAIL ${outcome}: ${reason}`); + return {outcome, reason}; + }; + + let r = await runStep(deps.spawnFn, deps.repoDir, logPath, 'git', ['fetch', '--tags', 'origin']); + if (r.code !== 0) return fail('failed-checkout', `git fetch exit ${r.code}: ${r.stderr.trim()}`); + + r = await runStep(deps.spawnFn, deps.repoDir, logPath, 'git', ['checkout', deps.targetTag]); + if (r.code !== 0) return fail('failed-checkout', `git checkout exit ${r.code}: ${r.stderr.trim()}`); + + r = await runStep(deps.spawnFn, deps.repoDir, 
logPath, 'pnpm', ['install', '--frozen-lockfile']); + if (r.code !== 0) return fail('failed-install', `pnpm install exit ${r.code}: ${r.stderr.trim()}`); + + r = await runStep(deps.spawnFn, deps.repoDir, logPath, 'pnpm', ['run', 'build:ui']); + if (r.code !== 0) return fail('failed-build', `pnpm run build:ui exit ${r.code}: ${r.stderr.trim()}`); + + // pending-verification: the next boot's RollbackHandler arms the health-check timer. s = { ...s, execution: { - status: 'rolling-back', - reason, + status: 'pending-verification', targetTag: deps.targetTag, fromSha, - at: deps.now().toISOString(), + // Real deadline is computed at next boot using rollbackHealthCheckSeconds. + // We persist a placeholder here purely so the field is present. + deadlineAt: deps.now().toISOString(), }, + bootCount: 0, }; await deps.saveState(s); - logger.error(`update step failed (${outcome}): ${reason}`); - appendLine(logPath, `[${deps.now().toISOString()}] FAIL ${outcome}: ${reason}`); - return {outcome, reason}; - }; - - let r = await runStep(deps.spawnFn, deps.repoDir, logPath, 'git', ['fetch', '--tags', 'origin']); - if (r.code !== 0) return fail('failed-checkout', `git fetch exit ${r.code}: ${r.stderr.trim()}`); - - r = await runStep(deps.spawnFn, deps.repoDir, logPath, 'git', ['checkout', deps.targetTag]); - if (r.code !== 0) return fail('failed-checkout', `git checkout exit ${r.code}: ${r.stderr.trim()}`); - - r = await runStep(deps.spawnFn, deps.repoDir, logPath, 'pnpm', ['install', '--frozen-lockfile']); - if (r.code !== 0) return fail('failed-install', `pnpm install exit ${r.code}: ${r.stderr.trim()}`); - - r = await runStep(deps.spawnFn, deps.repoDir, logPath, 'pnpm', ['run', 'build:ui']); - if (r.code !== 0) return fail('failed-build', `pnpm run build:ui exit ${r.code}: ${r.stderr.trim()}`); - - // pending-verification: the next boot's RollbackHandler arms the health-check timer. 
- s = { - ...s, - execution: { - status: 'pending-verification', - targetTag: deps.targetTag, - fromSha, - // Real deadline is computed at next boot using rollbackHealthCheckSeconds. - // We persist a placeholder here purely so the field is present. - deadlineAt: deps.now().toISOString(), - }, - bootCount: 0, - }; - await deps.saveState(s); - logger.info(`update executed: ${fromSha} -> ${deps.targetTag}; exiting 75 for supervisor restart`); - void appendLine(logPath, `[${deps.now().toISOString()}] OK pending-verification ${fromSha} -> ${deps.targetTag}; exiting 75`); - deps.exit(75); - return {outcome: 'pending-verification'}; + logger.info(`update executed: ${fromSha} -> ${deps.targetTag}; exiting 75 for supervisor restart`); + void appendLine(logPath, `[${deps.now().toISOString()}] OK pending-verification ${fromSha} -> ${deps.targetTag}; exiting 75`); + deps.exit(75); + return {outcome: 'pending-verification'}; + } catch (err) { + // Unexpected throw — fs ENOSPC, EACCES on the backup dir, network blip + // surfaced through readSha, etc. Persist rolling-back so the route's + // post-executor rollback path runs and the state never wedges at 'executing'. + const reason = `executor exception: ${(err as Error).message}`; + logger.error(reason); + void appendLine(logPath, `[${deps.now().toISOString()}] EXECUTOR_THROW ${reason}`); + try { + await deps.saveState({ + ...deps.initialState, + execution: { + status: 'rolling-back', + reason, + targetTag: deps.targetTag, + fromSha, + at: deps.now().toISOString(), + }, + bootCount: 0, + }); + } catch (saveErr) { + // Even saveState threw. Best-effort log, rethrow original — the route's + // catch will surface it. State on disk is whatever last successfully wrote. 
+ logger.error(`could not persist rolling-back: ${(saveErr as Error).message}`); + } + return {outcome: 'failed-checkout', reason}; + } }; diff --git a/src/node/updater/trustedKeys.ts b/src/node/updater/trustedKeys.ts index 03b9ceff772..737e32b0180 100644 --- a/src/node/updater/trustedKeys.ts +++ b/src/node/updater/trustedKeys.ts @@ -4,7 +4,10 @@ import log4js from 'log4js'; const logger = log4js.getLogger('updater'); export type SpawnFn = (cmd: string, args: string[], opts: SpawnOptions) => { - on: (event: 'close', cb: (code: number | null) => void) => void; + on: { + (event: 'close', cb: (code: number | null) => void): void; + (event: 'error', cb: (err: Error) => void): void; + }; }; export interface VerifyArgs { @@ -46,7 +49,17 @@ export const verifyReleaseTag = async (args: VerifyArgs): Promise env, stdio: 'ignore', }); - const code: number | null = await new Promise((resolve) => child.on('close', resolve)); + // Listen for both 'close' and 'error' so a missing/unexecutable git binary + // surfaces as verification-failure rather than a hung promise. 
+ const code: number | null = await new Promise((resolve) => { + let settled = false; + const settle = (c: number | null) => { if (settled) return; settled = true; resolve(c); }; + child.on('close', settle); + child.on('error', (err: Error) => { + logger.error(`verifyReleaseTag: git verify-tag spawn error: ${err.message}`); + settle(1); + }); + }); if (code === 0) return {ok: true, reason: 'signature-verified'}; logger.error(`verifyReleaseTag: git verify-tag ${args.tag} exited ${code}`); return {ok: false, reason: 'signature-verification-failed'}; diff --git a/src/tests/backend-new/specs/updater/SessionDrainer.test.ts b/src/tests/backend-new/specs/updater/SessionDrainer.test.ts index ee5db23c880..c870711a4e0 100644 --- a/src/tests/backend-new/specs/updater/SessionDrainer.test.ts +++ b/src/tests/backend-new/specs/updater/SessionDrainer.test.ts @@ -31,6 +31,16 @@ describe('SessionDrainer', () => { expect(isAcceptingConnections()).toBe(true); }); + it('restores isAcceptingConnections to true on drain completion', async () => { + const drainer = createDrainer({drainSeconds: 60, broadcast: () => {}}); + const done = drainer.start(); + expect(isAcceptingConnections()).toBe(false); + await vi.advanceTimersByTimeAsync(60_000); + await done; + // Restored at completion so a downstream throw doesn't wedge join handshakes. 
+ expect(isAcceptingConnections()).toBe(true); + }); + it('cancel before T=0 resolves start() promise as cancelled', async () => { const drainer = createDrainer({drainSeconds: 60, broadcast: () => {}}); const done = drainer.start(); diff --git a/src/tests/backend/specs/updateActions.ts b/src/tests/backend/specs/updateActions.ts index faeb24b874c..6e06577cb7b 100644 --- a/src/tests/backend/specs/updateActions.ts +++ b/src/tests/backend/specs/updateActions.ts @@ -30,8 +30,18 @@ const installAdminAuth = () => { describe(__filename, function () { let agent: any; const backups: Record = {}; + // Bump tier to 'manual' so the action endpoints are mounted by the hook. + // (At default tier 'notify' they 404 — that's the gate Qodo #1 introduced.) + const originalTier = settings.updates.tier; - before(async () => { agent = await common.init(); }); + before(async () => { + settings.updates.tier = 'manual'; + agent = await common.init(); + }); + + after(() => { + settings.updates.tier = originalTier; + }); beforeEach(async () => { backups.hooks = {}; @@ -63,24 +73,26 @@ describe(__filename, function () { await agent.post('/admin/update/apply').expect(401); }); - it('rejects when policy denies (non-git install method)', async () => { + it('returns 409 with no-known-latest when state has no latest release', async () => { installAdminAuth(); - // Force the detector path: the boot detector ran with the real install - // method, but evaluatePolicy uses settings.updates.installMethod via the - // hook's getDetectedInstallMethod(). We can't easily flip that mid-test, - // so instead we set tier=off which also denies canManual. + // Replace seeded "update available" with empty state. 
+ await saveState(statePath(), {...EMPTY_STATE}); + const r = await agent.post('/admin/update/apply') + .auth('admin', 'admin-pw') + .expect(409); + assert.equal(r.body.error, 'no-known-latest'); + }); + + it('returns 404 when tier is "notify" (action endpoints disabled)', async () => { + // Regression for the Tier 2 gate (Qodo #1): disabled tiers must 404 to + // match prior PR-1 behaviour, not 401/403/409. const orig = settings.updates.tier; - settings.updates.tier = 'off'; + settings.updates.tier = 'notify'; try { - await agent.post('/admin/update/apply') - .auth('admin', 'admin-pw') - .expect((r: any) => { - // tier=off removes the entire route registration, so we expect 404. - // tier !== off and policy.canManual=false would expect 409. Either is OK. - if (r.status !== 404 && r.status !== 409) { - throw new Error(`expected 404 or 409, got ${r.status}`); - } - }); + await agent.post('/admin/update/apply').expect(404); + await agent.post('/admin/update/cancel').expect(404); + await agent.post('/admin/update/acknowledge').expect(404); + await agent.get('/admin/update/log').expect(404); } finally { settings.updates.tier = orig; } }); diff --git a/src/tests/frontend-new/admin-spec/update-page-actions.spec.ts b/src/tests/frontend-new/admin-spec/update-page-actions.spec.ts index e52e6539e71..bdca6df7e45 100644 --- a/src/tests/frontend-new/admin-spec/update-page-actions.spec.ts +++ b/src/tests/frontend-new/admin-spec/update-page-actions.spec.ts @@ -93,7 +93,10 @@ test.describe('admin update page actions', () => { await page.goto('http://localhost:9001/admin/update'); await expect(page.getByRole('button', {name: /acknowledge/i})).toBeVisible({timeout: 30000}); // lastResult copy uses i18n update.page.last_result.rollback-failed. - await expect(page.getByText(/Manual intervention required/i)).toBeVisible(); + // Both the banner and the lastResult paragraph contain "Manual intervention + // required" — scope to the lastResult

so we get exactly one match. + await expect(page.locator('p.last-result-rollback-failed')).toBeVisible(); + await expect(page.locator('p.last-result-rollback-failed')).toContainText(/Manual intervention required/i); }); test('lockHeld true hides the Apply button even when policy.canManual is on', async ({page}) => { From a32256b632a72b3887fd72778fe6c2b45c802f5d Mon Sep 17 00:00:00 2001 From: John McLear Date: Fri, 8 May 2026 13:46:01 +0100 Subject: [PATCH 20/21] fix(updater): address Qodo #7 (status leak) + #8 (short-drain values) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit #7. /admin/update/status now redacts diagnostic strings for unauth callers even when requireAdminForStatus is left at its default (false). Status enum + outcome enum are kept (the admin banner / pad-side badge need them to render the right UI) but execution.reason / execution.fromSha / execution.targetTag and the same fields on lastResult are stripped. Authed admin sessions still get the full payload — they're looking at their own server's diagnostics. Two new mocha tests cover both paths: "redacts execution.reason / lastResult.reason for unauth callers" and "returns full diagnostic payload to authed admin sessions". #8. SessionDrainer no longer schedules T-30 / T-10 broadcasts when the configured drainSeconds can't honour them. Previously, with drainSeconds < 30 the T-30 timer fired at zero remaining but the broadcast still claimed "30 seconds" — misleading. Now T-30 only schedules when drainSeconds > 30 and T-10 only when > 10. Admins picking a short drain get fewer announcements but each carries an accurate countdown. The opening announcement now reports the configured drain length rather than a hardcoded 60. Two updated unit tests: drainSeconds=15 (skips T-30, still fires T-10) and drainSeconds=5 (skips both). 131 vitest unit + 26 mocha integration tests passing; ts-check clean. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/node/hooks/express/updateStatus.ts | 36 +++++++++- src/node/updater/SessionDrainer.ts | 18 +++-- .../specs/updater/SessionDrainer.test.ts | 33 ++++++--- src/tests/backend/specs/updateStatus.ts | 72 +++++++++++++++++++ 4 files changed, 144 insertions(+), 15 deletions(-) diff --git a/src/node/hooks/express/updateStatus.ts b/src/node/hooks/express/updateStatus.ts index 24ab729e96b..69d63d889f3 100644 --- a/src/node/hooks/express/updateStatus.ts +++ b/src/node/hooks/express/updateStatus.ts @@ -39,6 +39,23 @@ const wrapAsync = (fn: (req: any, res: any, next: Function) => Promise) Promise.resolve(fn(req, res, next)).catch((err) => next(err)); }; +/** + * Strip diagnostic strings (reason, fromSha, targetTag, build/install paths) + * from execution before exposing to unauthenticated callers. Status enum is + * preserved so the admin banner / pad-side badge can still render the right UI. + */ +const sanitizeExecution = (e: any): any => { + if (!e || typeof e !== 'object' || typeof e.status !== 'string') return {status: 'idle'}; + return {status: e.status}; +}; + +const sanitizeLastResult = (r: any): any => { + if (r === null) return null; + if (!r || typeof r !== 'object' || typeof r.outcome !== 'string') return null; + // outcome enum + at timestamp are non-sensitive. reason / fromSha / targetTag are dropped. + return {outcome: r.outcome, at: typeof r.at === 'string' ? r.at : null}; +}; + export const expressCreateServer = ( _hookName: string, {app}: ArgsExpressType, @@ -70,6 +87,7 @@ export const expressCreateServer = ( // release. Admins who want the endpoint gated to authenticated admin sessions — // without disabling the updater entirely — set updates.requireAdminForStatus=true. 
app.get('/admin/update/status', wrapAsync(async (req, res) => { + const isAdmin = !!req.session?.user?.is_admin; if (settings.updates.requireAdminForStatus) { const user = req.session?.user; if (!user) return res.status(401).send('Authentication required'); @@ -88,6 +106,20 @@ export const expressCreateServer = ( }) : null; const lockHeld = await isHeld(path.join(settings.root, 'var', 'update.lock')); + + // The Tier 2 fields (execution, lastResult) carry diagnostic strings + // built from git/pnpm stderr — environment-specific paths, error + // messages, etc. Endpoint defaults to unauthenticated; only authed + // admin sessions see the full diagnostic payload. Everyone else sees + // just the status enum + outcome enum so the pad-side / public banners + // can still render correctly without leaking operational detail. + const execution = isAdmin + ? state.execution + : sanitizeExecution(state.execution); + const lastResult = isAdmin + ? state.lastResult + : sanitizeLastResult(state.lastResult); + res.json({ currentVersion: current, latest: state.latest, @@ -97,8 +129,8 @@ export const expressCreateServer = ( policy, vulnerableBelow: state.vulnerableBelow, // PR 2 additions: - execution: state.execution, - lastResult: state.lastResult, + execution, + lastResult, lockHeld, }); })); diff --git a/src/node/updater/SessionDrainer.ts b/src/node/updater/SessionDrainer.ts index 4f266f8cb77..d9df727b2c5 100644 --- a/src/node/updater/SessionDrainer.ts +++ b/src/node/updater/SessionDrainer.ts @@ -48,11 +48,21 @@ export const createDrainer = ({drainSeconds, broadcast}: DrainerOpts): Drainer = return new Promise((resolve) => { resolveDone = resolve; const ms = drainSeconds * 1000; - // T-60 announcement fires at start; T-30 and T-10 are scheduled at offsets. - // Drain windows shorter than 30s collapse the early timers to "fire ASAP". + // The opening announcement reports the actual drain length rather than a + // hardcoded 60, so a configured drainSeconds of e.g. 
30 says "30 seconds". + // i18n key is still update.drain.t60 — that's the "start of drain" key in + // the locale file; the {{seconds}} placeholder carries the real value. fire('update.drain.t60', drainSeconds); - timers.push(setTimeout(() => fire('update.drain.t30', 30), Math.max(0, ms - 30_000))); - timers.push(setTimeout(() => fire('update.drain.t10', 10), Math.max(0, ms - 10_000))); + // Only schedule T-30 / T-10 when the configured window can actually + // honour them. Firing a "30 seconds" message at zero remaining (because + // ms - 30_000 < 0) is misleading; admins picking a short drainSeconds + // get fewer announcements but each carries an accurate countdown. + if (drainSeconds > 30) { + timers.push(setTimeout(() => fire('update.drain.t30', 30), ms - 30_000)); + } + if (drainSeconds > 10) { + timers.push(setTimeout(() => fire('update.drain.t10', 10), ms - 10_000)); + } timers.push(setTimeout(() => { if (cancelled) return; // Restore the gate as soon as the drain window closes. 
The executor diff --git a/src/tests/backend-new/specs/updater/SessionDrainer.test.ts b/src/tests/backend-new/specs/updater/SessionDrainer.test.ts index c870711a4e0..8d005035ad4 100644 --- a/src/tests/backend-new/specs/updater/SessionDrainer.test.ts +++ b/src/tests/backend-new/specs/updater/SessionDrainer.test.ts @@ -77,19 +77,34 @@ describe('SessionDrainer', () => { expect(seen[2]).toEqual({key: 'update.drain.t10', values: {seconds: 10}}); }); - it('drain shorter than 30s skips the t30 broadcast but still emits t10 and completes', async () => { - const broadcasts: string[] = []; + it('drainSeconds=15 skips t30 (window too short) but still fires t10', async () => { + const seen: Array<{key: string; values: any}> = []; const drainer = createDrainer({ drainSeconds: 15, - broadcast: (key) => { broadcasts.push(key); }, + broadcast: (key, values) => { seen.push({key, values}); }, }); const done = drainer.start(); - expect(broadcasts).toEqual(['update.drain.t60']); - // t30 fires at max(0, 15-30)=0 i.e. immediately on next tick. - await vi.advanceTimersByTimeAsync(0); - expect(broadcasts).toContain('update.drain.t30'); - await vi.advanceTimersByTimeAsync(15_000); + // Opening announcement reports the configured drain length, not a fixed 60. + expect(seen).toEqual([{key: 'update.drain.t60', values: {seconds: 15}}]); + // t30 is suppressed because reporting "30 seconds" would be wrong. + await vi.advanceTimersByTimeAsync(5_000); + expect(seen.map((s) => s.key)).not.toContain('update.drain.t30'); + // t10 fires when 10 seconds remain (= 5s from start of a 15s drain). 
+ expect(seen.map((s) => s.key)).toContain('update.drain.t10'); + await vi.advanceTimersByTimeAsync(10_000); + await done; + }); + + it('drainSeconds=5 skips both t30 and t10', async () => { + const seen: string[] = []; + const drainer = createDrainer({ + drainSeconds: 5, + broadcast: (key) => { seen.push(key); }, + }); + const done = drainer.start(); + expect(seen).toEqual(['update.drain.t60']); + await vi.advanceTimersByTimeAsync(5_000); await done; - expect(broadcasts.at(-1)).toBe('update.drain.t10'); + expect(seen).toEqual(['update.drain.t60']); // only the opening announcement }); }); diff --git a/src/tests/backend/specs/updateStatus.ts b/src/tests/backend/specs/updateStatus.ts index 942f5a255c6..e8fb02fa03e 100644 --- a/src/tests/backend/specs/updateStatus.ts +++ b/src/tests/backend/specs/updateStatus.ts @@ -88,6 +88,40 @@ describe(__filename, function () { assert.ok(Array.isArray(res.body.vulnerableBelow)); }); + it('redacts execution.reason / lastResult.reason for unauth callers', async function () { + // Seed state with diagnostic strings that would leak environment details. + await saveState(statePath(), { + ...EMPTY_STATE, + execution: { + status: 'rollback-failed', + reason: 'pnpm install exit 1: ENOSPC at /srv/etherpad/v2.7.3', + targetTag: 'v2.7.3', + fromSha: 'abc123def456', + at: '2026-05-08T00:00:00Z', + }, + lastResult: { + targetTag: 'v2.7.3', + fromSha: 'abc123def456', + outcome: 'rollback-failed', + reason: 'pnpm install failed: ENOSPC at /srv/etherpad/v2.7.3', + at: '2026-05-08T00:00:00Z', + }, + }); + const res = await agent.get('/admin/update/status').expect(200); + // Status enum + outcome enum are kept (UI needs them). + assert.equal(res.body.execution.status, 'rollback-failed'); + assert.equal(res.body.lastResult.outcome, 'rollback-failed'); + // Diagnostic fields are stripped for unauth callers. 
+ assert.equal(res.body.execution.reason, undefined); + assert.equal(res.body.execution.fromSha, undefined); + assert.equal(res.body.execution.targetTag, undefined); + assert.equal(res.body.lastResult.reason, undefined); + assert.equal(res.body.lastResult.fromSha, undefined); + assert.equal(res.body.lastResult.targetTag, undefined); + // Non-sensitive fields preserved on lastResult. + assert.equal(res.body.lastResult.at, '2026-05-08T00:00:00Z'); + }); + describe('when updates.requireAdminForStatus = true', function () { const restore: Record = {}; beforeEach(function () { @@ -140,5 +174,43 @@ describe(__filename, function () { .expect(200); }); }); + + describe('admin auth (without requireAdminForStatus)', function () { + // requireAdminForStatus=false (default) keeps the endpoint open for the + // pad-side / banner usage, but admin callers should still see full + // diagnostic detail (execution.reason, fromSha, etc.). + it('returns full diagnostic payload to authed admin sessions', async function () { + for (const hookName of authHookNames.concat(failHookNames)) plugins.hooks[hookName] = []; + plugins.hooks.authenticate = [{ + hook_fn: (_hookName: string, ctx: any, cb: Function) => { + ctx.req.session.user = {is_admin: true}; + cb([true]); + }, + }]; + (settings as any).requireAuthentication = true; + (settings as any).requireAuthorization = false; + (settings as any).users = {admin: {password: 'admin-password', is_admin: true}}; + await saveState(statePath(), { + ...EMPTY_STATE, + execution: { + status: 'rollback-failed', + reason: 'pnpm install exit 1', + targetTag: 'v2.7.3', fromSha: 'abc', + at: '2026-05-08T00:00:00Z', + }, + lastResult: { + targetTag: 'v2.7.3', fromSha: 'abc', + outcome: 'rollback-failed', reason: 'pnpm install failed', + at: '2026-05-08T00:00:00Z', + }, + }); + const res = await agent.get('/admin/update/status') + .auth('admin', 'admin-password').expect(200); + // Admin sees the full diagnostic detail (it's their own server). 
+ assert.equal(res.body.execution.reason, 'pnpm install exit 1'); + assert.equal(res.body.execution.fromSha, 'abc'); + assert.equal(res.body.lastResult.reason, 'pnpm install failed'); + }); + }); }); }); From 9340d279ff9da2333ace63fa95581f87ccaa6ecb Mon Sep 17 00:00:00 2001 From: John McLear Date: Fri, 8 May 2026 13:58:58 +0100 Subject: [PATCH 21/21] =?UTF-8?q?fix(updater):=20address=20Qodo=20follow-u?= =?UTF-8?q?p=20=E2=80=94=20tag=20injection,=20rollback=20rejections,=20sta?= =?UTF-8?q?te=20validation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Qodo posted three new concerns after the first fix push. 1. Git tag option injection (security). The release tag from GitHub's tag_name flowed into `git checkout` / `git verify-tag` as a positional arg. A tag starting with '-' would be parsed as an option and could bypass signature verification or change checkout semantics. Mitigated in three layers: - New refSafety helper (isValidTag / assertValidTag / refsTagsForm) enforces a strict subset of git's check-ref-format spec: rejects leading '-' or '.', whitespace, control chars, and ~ ^ : ? * [ \\ and the '..' sequence. - VersionChecker validates tag_name before persisting to state, so a malformed value from a misconfigured githubRepo never lands on disk. - UpdateExecutor calls assertValidTag and uses the refs/tags/ form for git checkout. trustedKeys also validates and adds '--' to git verify-tag for an end-of-options marker. updateActions does an up-front isValidTag check on state.latest.tag so a corrupt state file gets a clean 409 instead of a 500. 2. Unhandled rollback rejections. checkPendingVerification was firing `void deps.saveState(...)` and `void performRollback(...)` without .catch(), so an fs error during boot's rollback path would bubble out as an unhandled rejection. 
Both callsites now go through fireSaveState / fireRollback helpers that catch and log; rollback rejections fall through to a best-effort terminal-state write + exit 75 so the supervisor can re-try the next boot with bootCount++. 3. Execution state under-validated. isValidExecution previously checked only that `status` was a known enum value, so a hand-edited state file with `{execution: {status: 'pending-verification'}}` (missing fromSha / targetTag / deadlineAt) would pass validation and reach RollbackHandler with undefined refs. The validator now consults a per-status required-fields map mirroring the ExecutionStatus union in types.ts and rejects empty strings as well as missing fields. Same tightening applied to lastResult.outcome (must be in the allowed enum, not just any string). Six new unit tests cover hand-edited corruption. 145 vitest + 26 mocha tests green; ts-check clean. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/node/hooks/express/updateActions.ts | 9 +++ src/node/updater/RollbackHandler.ts | 48 ++++++++++-- src/node/updater/UpdateExecutor.ts | 11 ++- src/node/updater/VersionChecker.ts | 10 +++ src/node/updater/refSafety.ts | 43 +++++++++++ src/node/updater/state.ts | 33 ++++++++- src/node/updater/trustedKeys.ts | 11 ++- .../specs/updater/UpdateExecutor.test.ts | 10 +-- .../specs/updater/refSafety.test.ts | 63 ++++++++++++++++ .../backend-new/specs/updater/state.test.ts | 73 +++++++++++++++++++ .../specs/updater/trustedKeys.test.ts | 17 ++++- 11 files changed, 312 insertions(+), 16 deletions(-) create mode 100644 src/node/updater/refSafety.ts create mode 100644 src/tests/backend-new/specs/updater/refSafety.test.ts diff --git a/src/node/hooks/express/updateActions.ts b/src/node/hooks/express/updateActions.ts index 8e321e14add..a951dd26b2e 100644 --- a/src/node/hooks/express/updateActions.ts +++ b/src/node/hooks/express/updateActions.ts @@ -17,6 +17,7 @@ import {verifyReleaseTag} from '../../updater/trustedKeys'; import {tailLines, appendLine} 
from '../../updater/updateLog'; import {performRollback} from '../../updater/RollbackHandler'; import {UpdateState} from '../../updater/types'; +import {isValidTag} from '../../updater/refSafety'; import {getIo} from './socketio'; const logger = log4js.getLogger('updater'); @@ -138,6 +139,14 @@ export const expressCreateServer = ( const state = await loadState(stateFilePath()); if (!state.latest) return res.status(409).json({error: 'no-known-latest'}); + // Defence in depth: VersionChecker validates tag_name before persisting, + // but a hand-edited update-state.json could still surface an unsafe tag + // here. Reject up-front rather than throw later when the executor calls + // assertValidTag, so the admin sees a clear 409 instead of a 500. + if (!isValidTag(state.latest.tag)) { + return res.status(409).json({error: 'invalid-tag-in-state'}); + } + // Allowed entry statuses: idle / verified / preflight-failed / rolled-back. // Anything else means an in-flight or terminal-needs-acknowledge state. const allowedEntry = ['idle', 'verified', 'preflight-failed', 'rolled-back']; diff --git a/src/node/updater/RollbackHandler.ts b/src/node/updater/RollbackHandler.ts index 7e24e95e382..e90e8b7fd15 100644 --- a/src/node/updater/RollbackHandler.ts +++ b/src/node/updater/RollbackHandler.ts @@ -161,21 +161,55 @@ export const checkPendingVerification = (state: UpdateState, deps: RollbackDeps) const exec = state.execution; if (exec.status !== 'pending-verification') return {armed: false, markVerified: () => {}}; + // Fire-and-forget helpers that swallow rejections cleanly. We intentionally + // don't propagate — the boot sequence must proceed even if the rollback + // path can't write its terminal state. Worst case: the supervisor restart + // brings the same boot back up and the bootCount-based crash-loop guard + // catches it on the next attempt. 
+ const fireRollback = (s: UpdateState) => { + void performRollback(s, deps).catch((err) => { + logger.error(`performRollback unhandled rejection: ${(err as Error).message}`); + // Best-effort: try to land on rollback-failed terminal state and exit + // 75 anyway. If saveState also rejects, log and exit so the supervisor + // restart at least re-runs checkPendingVerification with bootCount++. + const fb = { + ...s, + execution: { + status: 'rollback-failed' as const, + reason: `unhandled rollback rejection: ${(err as Error).message}`, + targetTag: (s.execution as {targetTag?: string}).targetTag ?? '', + fromSha: (s.execution as {fromSha?: string}).fromSha ?? '', + at: deps.now().toISOString(), + }, + bootCount: 0, + }; + void deps.saveState(fb).catch((saveErr) => { + logger.error(`fallback saveState rejected: ${(saveErr as Error).message}`); + }).finally(() => deps.exit(75)); + }); + }; + + const fireSaveState = (s: UpdateState, ctx: string) => { + void deps.saveState(s).catch((err) => { + logger.warn(`saveState (${ctx}) rejected: ${(err as Error).message}`); + }); + }; + if (state.bootCount > 2) { // Don't await — fire and forget so the boot sequence proceeds; the rollback // path will exit 75 asynchronously and the supervisor restarts on the - // restored SHA. - void performRollback(state, deps); + // restored SHA. Rejections caught + best-effort terminal-state write. 
+ fireRollback(state); return {armed: false, markVerified: () => {}}; } const incremented: UpdateState = {...state, bootCount: state.bootCount + 1}; - void deps.saveState(incremented); + fireSaveState(incremented, 'bootCount-increment'); let cleared = false; const timer = setTimeout(() => { if (cleared) return; - void performRollback({ + fireRollback({ ...incremented, execution: { status: 'rolling-back', @@ -184,7 +218,7 @@ export const checkPendingVerification = (state: UpdateState, deps: RollbackDeps) fromSha: exec.fromSha, at: deps.now().toISOString(), }, - }, deps); + }); }, deps.rollbackHealthCheckSeconds * 1000); return { @@ -194,7 +228,7 @@ export const checkPendingVerification = (state: UpdateState, deps: RollbackDeps) cleared = true; clearTimeout(timer); const at = deps.now().toISOString(); - void deps.saveState({ + fireSaveState({ ...incremented, execution: {status: 'verified', targetTag: exec.targetTag, verifiedAt: at}, lastResult: { @@ -205,7 +239,7 @@ export const checkPendingVerification = (state: UpdateState, deps: RollbackDeps) at, }, bootCount: 0, - }); + }, 'mark-verified'); logger.info(`update verified after restart: ${exec.fromSha} -> ${exec.targetTag}`); }, }; diff --git a/src/node/updater/UpdateExecutor.ts b/src/node/updater/UpdateExecutor.ts index a73b6e402e0..07881065e46 100644 --- a/src/node/updater/UpdateExecutor.ts +++ b/src/node/updater/UpdateExecutor.ts @@ -3,6 +3,7 @@ import log4js from 'log4js'; import {SpawnOptions} from 'node:child_process'; import {UpdateState} from './types'; import {appendLine} from './updateLog'; +import {assertValidTag, refsTagsForm} from './refSafety'; const logger = log4js.getLogger('updater'); @@ -109,6 +110,10 @@ export const executeUpdate = async (deps: ExecutorDeps): Promise // unexpected synchronous error in a step — lands us at rolling-back rather // than leaving execution stuck at 'executing' forever. try { + // Reject unsafe release-tag strings (option injection guard). 
+ // Tag is sourced from GitHub's tag_name and persisted into update-state.json; + // a tag starting with '-' would otherwise be parsed by git as an option flag. + const safeTag = assertValidTag(deps.targetTag); fromSha = await deps.readSha(); let s: UpdateState = { @@ -154,7 +159,11 @@ export const executeUpdate = async (deps: ExecutorDeps): Promise let r = await runStep(deps.spawnFn, deps.repoDir, logPath, 'git', ['fetch', '--tags', 'origin']); if (r.code !== 0) return fail('failed-checkout', `git fetch exit ${r.code}: ${r.stderr.trim()}`); - r = await runStep(deps.spawnFn, deps.repoDir, logPath, 'git', ['checkout', deps.targetTag]); + // Use the refs/tags/ form so even an unforeseen edge-case in the tag + // string can't be parsed as a git option. assertValidTag above already + // rules out leading '-' / whitespace / shell metacharacters. + r = await runStep( + deps.spawnFn, deps.repoDir, logPath, 'git', ['checkout', refsTagsForm(safeTag)]); if (r.code !== 0) return fail('failed-checkout', `git checkout exit ${r.code}: ${r.stderr.trim()}`); r = await runStep(deps.spawnFn, deps.repoDir, logPath, 'pnpm', ['install', '--frozen-lockfile']); diff --git a/src/node/updater/VersionChecker.ts b/src/node/updater/VersionChecker.ts index 8dc1f8d5f5d..ff4b0f34a52 100644 --- a/src/node/updater/VersionChecker.ts +++ b/src/node/updater/VersionChecker.ts @@ -1,5 +1,6 @@ import {ReleaseInfo, VulnerableBelowDirective} from './types'; import {parseVulnerableBelow} from './versionCompare'; +import {isValidTag} from './refSafety'; export interface FetchResult { status: number; @@ -49,6 +50,15 @@ export const checkLatestRelease = async ( return {kind: 'error', status: 200}; } + // Reject any tag that would be unsafe to hand to git later. Validating at + // the persistence boundary (rather than only at the executor) means a + // malformed tag_name from a misconfigured fork-as-github-repo never lands + // in update-state.json. 
Treated as a fetch error so the polling loop will + // try again next interval. + if (!isValidTag(j.tag_name)) { + return {kind: 'error', status: 200}; + } + const tag = j.tag_name; const version = tag.replace(/^v/, ''); const body: string = typeof j.body === 'string' ? j.body : ''; diff --git a/src/node/updater/refSafety.ts b/src/node/updater/refSafety.ts new file mode 100644 index 00000000000..e837a628ad8 --- /dev/null +++ b/src/node/updater/refSafety.ts @@ -0,0 +1,43 @@ +/** + * Safety helpers for any release-tag string Etherpad's updater hands to git. + * + * The release tag originates from GitHub's `releases/latest` API (`tag_name`) + * and is then persisted into `var/update-state.json`. A tag that starts with + * `-` would be parsed by git as an option flag rather than a positional ref — + * `git checkout -fast-forward` and similar tricks could bypass signature + * verification or change checkout semantics. A tag with shell metacharacters + * is less of an issue under `child_process.spawn` (no shell), but we reject + * those too because git's own ref-name rules forbid them and a malformed tag + * has nowhere reasonable to be honoured anyway. + * + * Rules (a subset of git's check-ref-format spec — strict on purpose): + * - Non-empty. + * - Length <= 200. + * - May not start with `-` (option injection) or `.` (git rejects). + * - May not contain whitespace, NUL, or any of: ~ ^ : ? * [ \\ + * - May not contain `..` (git's own rule). + * + * Callers should also use the `refs/tags/` form when invoking git so + * that even an unforeseen edge-case can't be parsed as an option, and pass + * `--` as an end-of-options marker on commands that accept it. 
+ */ + +const FORBIDDEN_CHARS = /[\s\x00~^:?*\[\\]/; + +export const isValidTag = (tag: unknown): tag is string => { + if (typeof tag !== 'string') return false; + if (tag.length === 0 || tag.length > 200) return false; + if (tag.startsWith('-') || tag.startsWith('.')) return false; + if (FORBIDDEN_CHARS.test(tag)) return false; + if (tag.includes('..')) return false; + return true; +}; + +/** Throwing form for hot paths where invalid input is a programmer/data error. */ +export const assertValidTag = (tag: unknown): string => { + if (!isValidTag(tag)) throw new Error(`unsafe release tag: ${JSON.stringify(tag)}`); + return tag as string; +}; + +/** Wrap a validated tag in the `refs/tags/` form for git invocations. */ +export const refsTagsForm = (tag: string): string => `refs/tags/${tag}`; diff --git a/src/node/updater/state.ts b/src/node/updater/state.ts index 6c86d52257c..be425321681 100644 --- a/src/node/updater/state.ts +++ b/src/node/updater/state.ts @@ -8,17 +8,48 @@ const isPlainObject = (v: unknown): v is Record => const isStringOrNull = (v: unknown): v is string | null => v === null || typeof v === 'string'; +// Per-status field requirements that mirror the ExecutionStatus union in types.ts. +// Persisted-state corruption (a hand-edited file or a future schema bump that +// missed a migration) must never reach RollbackHandler with `undefined` refs — +// loadState resets to EMPTY_STATE when any required field is missing. 
+const EXEC_REQUIRED_FIELDS: Record = { + 'idle': [], + 'preflight': ['targetTag', 'startedAt'], + 'preflight-failed': ['targetTag', 'reason', 'at'], + 'draining': ['targetTag', 'drainEndsAt', 'startedAt'], + 'executing': ['targetTag', 'fromSha', 'startedAt'], + 'pending-verification': ['targetTag', 'fromSha', 'deadlineAt'], + 'verified': ['targetTag', 'verifiedAt'], + 'rolling-back': ['reason', 'targetTag', 'fromSha', 'at'], + 'rolled-back': ['reason', 'targetTag', 'restoredSha', 'at'], + 'rollback-failed': ['reason', 'targetTag', 'fromSha', 'at'], +}; + const isValidExecution = (v: unknown): boolean => { if (!isPlainObject(v)) return false; - return typeof v.status === 'string' && (EXECUTION_STATUSES as readonly string[]).includes(v.status); + if (typeof v.status !== 'string') return false; + if (!(EXECUTION_STATUSES as readonly string[]).includes(v.status)) return false; + const required = EXEC_REQUIRED_FIELDS[v.status]; + if (!required) return false; // unknown status — fail closed + for (const field of required) { + if (typeof (v as Record)[field] !== 'string') return false; + if (((v as Record)[field] as string).length === 0) return false; + } + return true; }; +// Outcomes that LastUpdateResult.outcome must match. 
+const VALID_OUTCOMES: ReadonlySet = new Set([ + 'verified', 'rolled-back', 'rollback-failed', 'preflight-failed', 'cancelled', +]); + const isValidLastResult = (v: unknown): boolean => { if (v === null) return true; if (!isPlainObject(v)) return false; return typeof v.targetTag === 'string' && typeof v.fromSha === 'string' && typeof v.outcome === 'string' + && VALID_OUTCOMES.has(v.outcome) && (v.reason === null || typeof v.reason === 'string') && typeof v.at === 'string'; }; diff --git a/src/node/updater/trustedKeys.ts b/src/node/updater/trustedKeys.ts index 737e32b0180..2d50a95c977 100644 --- a/src/node/updater/trustedKeys.ts +++ b/src/node/updater/trustedKeys.ts @@ -1,5 +1,6 @@ import {spawn as realSpawn, SpawnOptions} from 'node:child_process'; import log4js from 'log4js'; +import {isValidTag} from './refSafety'; const logger = log4js.getLogger('updater'); @@ -41,10 +42,18 @@ export const verifyReleaseTag = async (args: VerifyArgs): Promise ); return {ok: true, reason: 'signature-not-required'}; } + // Reject unsafe tag strings before they ever reach git. A tag starting with + // '-' could otherwise be parsed as a git option, bypassing verification. + if (!isValidTag(args.tag)) { + logger.error(`verifyReleaseTag: refused unsafe tag ${JSON.stringify(args.tag)}`); + return {ok: false, reason: 'signature-verification-failed'}; + } const spawnFn = args.spawnFn ?? (realSpawn as unknown as SpawnFn); const env: NodeJS.ProcessEnv = {...process.env}; if (args.trustedKeysPath) env.GNUPGHOME = args.trustedKeysPath; - const child = spawnFn('git', ['verify-tag', args.tag], { + // -- terminates options so even a future tag-validation regression can't + // smuggle a flag past git verify-tag. 
+ const child = spawnFn('git', ['verify-tag', '--', args.tag], { cwd: args.repoDir, env, stdio: 'ignore', diff --git a/src/tests/backend-new/specs/updater/UpdateExecutor.test.ts b/src/tests/backend-new/specs/updater/UpdateExecutor.test.ts index 29a4374fa5c..18fa118fc1b 100644 --- a/src/tests/backend-new/specs/updater/UpdateExecutor.test.ts +++ b/src/tests/backend-new/specs/updater/UpdateExecutor.test.ts @@ -39,7 +39,7 @@ const baseDeps = (): { backupDir: '/srv/etherpad/var/update-backup', spawnFn: okSpawn([ {cmd: 'git fetch --tags origin', exit: 0}, - {cmd: 'git checkout v2.7.3', exit: 0}, + {cmd: 'git checkout refs/tags/v2.7.3', exit: 0}, {cmd: 'pnpm install --frozen-lockfile', exit: 0}, {cmd: 'pnpm run build:ui', exit: 0}, ]) as any, @@ -84,7 +84,7 @@ describe('executeUpdate', () => { const {deps, states, exitedWith} = baseDeps(); deps.spawnFn = okSpawn([ {cmd: 'git fetch --tags origin', exit: 0}, - {cmd: 'git checkout v2.7.3', exit: 0}, + {cmd: 'git checkout refs/tags/v2.7.3', exit: 0}, {cmd: 'pnpm install --frozen-lockfile', exit: 1, stderr: 'resolver bork'}, ]) as any; const r = await executeUpdate(deps); @@ -98,7 +98,7 @@ describe('executeUpdate', () => { const {deps, states, exitedWith} = baseDeps(); deps.spawnFn = okSpawn([ {cmd: 'git fetch --tags origin', exit: 0}, - {cmd: 'git checkout v2.7.3', exit: 0}, + {cmd: 'git checkout refs/tags/v2.7.3', exit: 0}, {cmd: 'pnpm install --frozen-lockfile', exit: 0}, {cmd: 'pnpm run build:ui', exit: 2, stderr: 'tsc bork'}, ]) as any; @@ -114,7 +114,7 @@ describe('executeUpdate', () => { const {deps, copies, states} = baseDeps(); deps.spawnFn = okSpawn([ {cmd: 'git fetch --tags origin', exit: 0}, - {cmd: 'git checkout v2.7.3', exit: 1, stderr: 'conflict'}, + {cmd: 'git checkout refs/tags/v2.7.3', exit: 1, stderr: 'conflict'}, ]) as any; const r = await executeUpdate(deps); expect(r.outcome).toBe('failed-checkout'); @@ -136,7 +136,7 @@ describe('executeUpdate', () => { const {deps, states} = baseDeps(); deps.spawnFn = 
okSpawn([ {cmd: 'git fetch --tags origin', exit: 0}, - {cmd: 'git checkout v2.7.3', exit: 0}, + {cmd: 'git checkout refs/tags/v2.7.3', exit: 0}, {cmd: 'pnpm install --frozen-lockfile', exit: 1}, ]) as any; await executeUpdate(deps); diff --git a/src/tests/backend-new/specs/updater/refSafety.test.ts b/src/tests/backend-new/specs/updater/refSafety.test.ts new file mode 100644 index 00000000000..2f0032ba185 --- /dev/null +++ b/src/tests/backend-new/specs/updater/refSafety.test.ts @@ -0,0 +1,63 @@ +import {describe, it, expect} from 'vitest'; +import {isValidTag, assertValidTag, refsTagsForm} from '../../../../node/updater/refSafety'; + +describe('isValidTag', () => { + it('accepts plain semver tags', () => { + expect(isValidTag('v2.7.3')).toBe(true); + expect(isValidTag('2.7.3')).toBe(true); + expect(isValidTag('2.7.3-rc.1')).toBe(true); + }); + + it('rejects tags starting with -', () => { + expect(isValidTag('-rf')).toBe(false); + expect(isValidTag('-fast-forward')).toBe(false); + expect(isValidTag('-no-verify')).toBe(false); + }); + + it('rejects tags starting with .', () => { + expect(isValidTag('.git')).toBe(false); + }); + + it('rejects empty / non-string / overlong', () => { + expect(isValidTag('')).toBe(false); + expect(isValidTag(null)).toBe(false); + expect(isValidTag(undefined)).toBe(false); + expect(isValidTag(42)).toBe(false); + expect(isValidTag('v' + 'a'.repeat(300))).toBe(false); + }); + + it('rejects whitespace and control characters', () => { + expect(isValidTag('v2.7.3 -rf')).toBe(false); + expect(isValidTag('v2.7.3\nrm -rf')).toBe(false); + expect(isValidTag('v2.7.3\trf')).toBe(false); + expect(isValidTag('v2.7.3\x00rf')).toBe(false); + }); + + it('rejects git ref-format violations', () => { + expect(isValidTag('v2.7..3')).toBe(false); // .. forbidden + expect(isValidTag('v2~7~3')).toBe(false); // ~ forbidden + expect(isValidTag('v2:7:3')).toBe(false); // : forbidden + expect(isValidTag('v2.7.3?')).toBe(false); // ? 
forbidden + expect(isValidTag('v2.7.3*')).toBe(false); // * forbidden + expect(isValidTag('v[7]')).toBe(false); // [ forbidden + expect(isValidTag('v\\7')).toBe(false); // \ forbidden + expect(isValidTag('v^7')).toBe(false); // ^ forbidden + }); +}); + +describe('assertValidTag', () => { + it('returns the tag when valid', () => { + expect(assertValidTag('v2.7.3')).toBe('v2.7.3'); + }); + + it('throws on invalid input', () => { + expect(() => assertValidTag('-rf')).toThrow(/unsafe release tag/); + expect(() => assertValidTag(null)).toThrow(/unsafe release tag/); + }); +}); + +describe('refsTagsForm', () => { + it('wraps the tag in refs/tags/', () => { + expect(refsTagsForm('v2.7.3')).toBe('refs/tags/v2.7.3'); + }); +}); diff --git a/src/tests/backend-new/specs/updater/state.test.ts b/src/tests/backend-new/specs/updater/state.test.ts index fc87aac7f66..b230319c2aa 100644 --- a/src/tests/backend-new/specs/updater/state.test.ts +++ b/src/tests/backend-new/specs/updater/state.test.ts @@ -166,6 +166,79 @@ describe('Tier 2 state extensions', () => { expect(state).toEqual(EMPTY_STATE); }); + it('rejects pending-verification missing fromSha (could break rollback)', async () => { + // Regression for Qodo: hand-edited state with a recognised status but + // missing required fields would reach RollbackHandler with undefined refs. + // Validator must require per-status fields, not just status enum membership. 
+ await fs.writeFile(statePath(), JSON.stringify({ + schemaVersion: 1, lastCheckAt: null, lastEtag: null, latest: null, + vulnerableBelow: [], + email: {severeAt: null, vulnerableAt: null, vulnerableNewReleaseTag: null}, + execution: {status: 'pending-verification', targetTag: 'v2.7.3', deadlineAt: '2026-05-08T00:00:00Z'}, + // fromSha intentionally missing + })); + const state = await loadState(statePath()); + expect(state).toEqual(EMPTY_STATE); + }); + + it('rejects rolling-back missing reason / targetTag', async () => { + await fs.writeFile(statePath(), JSON.stringify({ + schemaVersion: 1, lastCheckAt: null, lastEtag: null, latest: null, + vulnerableBelow: [], + email: {severeAt: null, vulnerableAt: null, vulnerableNewReleaseTag: null}, + execution: {status: 'rolling-back', fromSha: 'abc', at: '2026-05-08T00:00:00Z'}, + // reason and targetTag missing + })); + const state = await loadState(statePath()); + expect(state).toEqual(EMPTY_STATE); + }); + + it('rejects empty-string fields for required keys', async () => { + await fs.writeFile(statePath(), JSON.stringify({ + schemaVersion: 1, lastCheckAt: null, lastEtag: null, latest: null, + vulnerableBelow: [], + email: {severeAt: null, vulnerableAt: null, vulnerableNewReleaseTag: null}, + execution: {status: 'executing', targetTag: '', fromSha: 'abc', startedAt: '2026-05-08T00:00:00Z'}, + })); + const state = await loadState(statePath()); + expect(state).toEqual(EMPTY_STATE); + }); + + it('accepts a fully-formed pending-verification', async () => { + const valid = { + schemaVersion: 1, lastCheckAt: null, lastEtag: null, latest: null, + vulnerableBelow: [], + email: {severeAt: null, vulnerableAt: null, vulnerableNewReleaseTag: null}, + execution: { + status: 'pending-verification', + targetTag: 'v2.7.3', + fromSha: 'abc123', + deadlineAt: '2026-05-08T00:00:00Z', + }, + bootCount: 1, + lastResult: null, + }; + await fs.writeFile(statePath(), JSON.stringify(valid)); + const state = await loadState(statePath()); + 
expect(state.execution.status).toBe('pending-verification'); + }); + + it('rejects lastResult with an unrecognised outcome', async () => { + await fs.writeFile(statePath(), JSON.stringify({ + schemaVersion: 1, lastCheckAt: null, lastEtag: null, latest: null, + vulnerableBelow: [], + email: {severeAt: null, vulnerableAt: null, vulnerableNewReleaseTag: null}, + execution: {status: 'idle'}, + lastResult: { + targetTag: 'v2.7.3', fromSha: 'abc', + outcome: 'totally-made-up', + reason: null, at: '2026-05-08T00:00:00Z', + }, + })); + const state = await loadState(statePath()); + expect(state).toEqual(EMPTY_STATE); + }); + it('rejects a non-numeric bootCount by resetting to EMPTY_STATE', async () => { await fs.writeFile(statePath(), JSON.stringify({ schemaVersion: 1, lastCheckAt: null, lastEtag: null, latest: null, diff --git a/src/tests/backend-new/specs/updater/trustedKeys.test.ts b/src/tests/backend-new/specs/updater/trustedKeys.test.ts index 56d49471fea..fc92e24af7d 100644 --- a/src/tests/backend-new/specs/updater/trustedKeys.test.ts +++ b/src/tests/backend-new/specs/updater/trustedKeys.test.ts @@ -31,7 +31,9 @@ describe('verifyReleaseTag', () => { expect(r).toEqual({ok: true, reason: 'signature-verified'}); expect(spawnFn).toHaveBeenCalledWith( 'git', - ['verify-tag', 'v2.7.3'], + // -- terminates options so a future tag-validation regression can't + // smuggle a flag past git verify-tag. 
+ ['verify-tag', '--', 'v2.7.3'], expect.objectContaining({cwd: '/tmp/x'}), ); }); @@ -64,6 +66,19 @@ describe('verifyReleaseTag', () => { expect(calls[0].env.GNUPGHOME).toBe('/srv/etherpad/keys'); }); + it('refuses unsafe tags (option-injection guard) before spawning git', async () => { + const spawnFn = vi.fn(); + const r = await verifyReleaseTag({ + tag: '-no-verify', + repoDir: '/tmp/x', + requireSignature: true, + trustedKeysPath: null, + spawnFn: spawnFn as any, + }); + expect(r).toEqual({ok: false, reason: 'signature-verification-failed'}); + expect(spawnFn).not.toHaveBeenCalled(); + }); + it('does not set GNUPGHOME when trustedKeysPath is null', async () => { const calls: any[] = []; const spawnFn = vi.fn((cmd: string, args: string[], opts: any) => {