From 5deaa5a8c5e899f1cf873c88bc7c30cdd138d4fa Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Wed, 27 May 2026 18:22:20 +0800
Subject: [PATCH 01/41] refactor(platform,cli): uniform org-first config layout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Collapse three divergent layout shapes (per-domain, @<org>-prefixed,
default-at-root) into one uniform rule: <root>/<org>/<domain>/...
applies to every org including `default`, in the live data tree,
the builtin catalog, the repo `examples/`, and the operator's host
workspace. `default` is just another org — the canonical template,
same shape.

Repo + builtin:
- git mv examples/<domain>/ → examples/default/<domain>/ (retention
  collapses to single examples/default/retention.json; branding moves
  under examples/default/branding/)
- services/convex/Dockerfile: one COPY examples/default/ → /app/builtin/default/
- Sweep load-bearing path strings in tests, GitHub raw URL, retention
  error messages, and docs (en/fr/de)

Convex resolvers (6 domains): single TALE_CONFIG_DIR root;
resolveXxxDir(org) = join(root, org, '<domain>') for every org. Drop
per-domain env overrides (AGENTS_DIR, WORKFLOWS_DIR, PROVIDERS_DIR,
INTEGRATIONS_DIR, SKILLS_DIR) — platform entrypoint now unconditionally
removes them from the Convex deployment env on every boot.

scaffold.ts:
- Default org is scaffold-able (no early-return); source from
  <catalog>/default/<domain> with realpath-based copy-onto-self guard
- New `override` arg with per-domain semantics (flat: per-file overwrite,
  bundle: rm-replace per bundle, tree: per-file recursive, retention:
  single-file copy); always preserves *.secrets.json + .history/
- cleanupOrgFilesystem: lstat symlink-hijack defense, two-phase
  rename-then-delete, dropped force:true on rm to surface real errors,
  removes one <root>/<org>/ subtree instead of per-domain loop
- New reseed_all_orgs.ts internal action — cursor-loop pagination,
  sorted slugs, per-org try/catch, structured per-org return shape

Platform consumers (previously hardcoding the old layout):
- server.ts + vite-plugins/serve-branding-images.ts: branding-images
  path → default/branding/images (would have 404'd post-rewrite)
- lib/config-watcher.ts: parseConfigChange rewritten for
  <org>/<domain>/<rest> shape (SSE invalidation would have silently
  dropped events for non-default orgs otherwise)
- config_store/store.ts: orgFirst option; retention flipped to
  <org>/retention.json with per-org-dir list() enumeration

Bash entrypoint (services/convex/docker-entrypoint.sh):
- mkdir creates only convex/ + default/ (legacy per-domain dirs gone)
- All run_seed loops retargeted source /app/builtin/default/<domain>
  → dest /app/data/default/<domain>; new branding seed loop closes a
  long-standing gap
- atomic_cp helper (cp tmp + mv) for crash safety
- Marker name carries -orgfirst layout token so downgrade re-seeds
  cleanly into legacy paths

CLI (tools/cli/):
- Delete the entire lib/upgrade/ auto-migration framework + four
  importers (deploy/start/update/init); -y/--yes on `tale start`
  kept as hidden no-op + warn-once for one-release back-compat
- tale init: scaffolds default/<domain>/... (was flat); recursive
  gitignore globs (**/.history/, **/*.secrets.json); OpenRouter
  secret lands at default/providers/openrouter.secrets.json
- tale deploy --override: rewrites to 1:1 host→/app/data/ push with
  allowlist filter (org-slug regex + reserved-domain-name denylist
  to detect legacy flat layout). Naive blocklist would have shipped
  .env / .git/ / .tale/ into /app/data/.
- New tale deploy --override-all (implies --all): runs server-side
  reseed via docker exec -i <platform> bash -s into the proven
  scripts/2026-03-28-migrate-convex-data.sh:120-131 pattern; TTY-gated
  confirm; non-zero exit on any per-org failure.
- New tale migrate config-layout [--dry-run] [--cleanup-old]: cp
  (not mv) so old paths stay readable for rollback; baked-in
  script.sh piped to docker exec -i <convex> bash -s; --cleanup-old
  sha-verifies before unlinking
- exec.ts: stdin support for the bash-via-stdin pattern

Tests: rewrote scaffold.test.ts (28 cases incl. override per-domain
semantics, symlink defenses, copy-onto-self guard, retention single
file, cleanupOrgFilesystem symlink hijack); rewrote skills/file_utils.test.ts
and branding/queries.test.ts for org-first; retention.test.ts uses
new example path. embedded-files.ts regenerated. bun run check passes:
36/36 tasks, 70927 tests, zero lint warnings.

Operator runbook (2 commands, zero downtime):
1. tale migrate config-layout — copies providers/*.secrets.json to
   new paths; old paths preserved
2. tale deploy --override-all -y — implies --all; recreates convex
   with new entrypoint, then triggers reseed-all-orgs action
3. (Optional) tale migrate config-layout --cleanup-old — sha-verifies
   new == old, unlinks olds (rollback insurance until then)

Behavior change — per-domain env overrides (AGENTS_DIR etc.) are no
longer honored. Operators with custom paths must set TALE_CONFIG_DIR
to a root and use the <org>/<domain>/ subtree.
---
 .dockerignore                                 |   2 +-
 docs/de/develop/integrations.md               |   2 +-
 docs/de/platform/integrations/overview.md     |   4 +-
 docs/de/platform/models.md                    |   4 +-
 .../de/self-hosted/configuration/providers.md |   2 +-
 docs/en/develop/integrations.md               |   2 +-
 docs/en/platform/integrations/overview.md     |   4 +-
 docs/en/platform/models.md                    |   4 +-
 .../en/self-hosted/configuration/providers.md |   2 +-
 docs/fr/develop/integrations.md               |   2 +-
 docs/fr/platform/integrations/overview.md     |   4 +-
 docs/fr/platform/models.md                    |   4 +-
 .../fr/self-hosted/configuration/providers.md |   2 +-
 examples/{ => default}/agents/chat-agent.json |   0
 .../{ => default}/agents/crm-assistant.json   |   0
 .../{ => default}/agents/image-creator.json   |   0
 .../agents/integration-assistant.json         |   0
 examples/{ => default}/agents/researcher.json |   0
 examples/{ => default}/agents/translator.json |   0
 .../agents/workflow-assistant.json            |   0
 examples/{ => default}/branding/branding.json |   0
 .../integrations/ai-image/config.json         |   0
 .../integrations/ai-image/connector.ts        |   0
 .../integrations/ai-image/icon.svg            |   0
 .../integrations/circuly/config.json          |   0
 .../integrations/circuly/connector.ts         |   0
 .../integrations/circuly/icon.svg             |   0
 .../integrations/discord/config.json          |   0
 .../integrations/discord/connector.ts         |   0
 .../integrations/discord/icon.svg             |   0
 .../integrations/github/config.json           |   0
 .../integrations/github/connector.ts          |   0
 .../integrations/github/icon.svg              |   0
 .../integrations/gmail/config.json            |   0
 .../integrations/gmail/connector.ts           |   0
 .../{ => default}/integrations/gmail/icon.svg |   0
 .../integrations/google_drive/config.json     |   0
 .../integrations/google_drive/connector.ts    |   0
 .../integrations/google_drive/icon.svg        |   0
 .../integrations/outlook/config.json          |   0
 .../integrations/outlook/connector.ts         |   0
 .../integrations/outlook/icon.svg             |   0
 .../integrations/protel/config.json           |   0
 .../integrations/protel/icon.svg              |   0
 .../integrations/shopify/config.json          |   0
 .../integrations/shopify/connector.ts         |   0
 .../integrations/shopify/icon.svg             |   0
 .../integrations/slack/config.json            |   0
 .../integrations/slack/connector.ts           |   0
 .../{ => default}/integrations/slack/icon.svg |   0
 .../integrations/tavily/config.json           |   0
 .../integrations/tavily/connector.ts          |   0
 .../integrations/tavily/icon.svg              |   0
 .../integrations/teams/config.json            |   0
 .../integrations/teams/connector.ts           |   0
 .../{ => default}/integrations/teams/icon.svg |   0
 .../integrations/twilio/config.json           |   0
 .../integrations/twilio/connector.ts          |   0
 .../integrations/twilio/icon.svg              |   0
 examples/{ => default}/providers/openai.json  |   0
 .../{ => default}/providers/openrouter.json   |   0
 .../providers/vercel-gateway.json             |   0
 .../default.json => default/retention.json}   |   0
 .../{ => default}/skills/pptx/LICENSE.txt     |   0
 examples/{ => default}/skills/pptx/SKILL.md   |   0
 examples/{ => default}/skills/pptx/editing.md |   0
 .../{ => default}/skills/pptx/pptxgenjs.md    |   0
 .../skills/pptx/scripts/__init__.py           |   0
 .../skills/pptx/scripts/add_slide.py          |   0
 .../skills/pptx/scripts/clean.py              |   0
 .../pptx/scripts/office/helpers/__init__.py   |   0
 .../pptx/scripts/office/helpers/merge_runs.py |   0
 .../office/helpers/simplify_redlines.py       |   0
 .../skills/pptx/scripts/office/pack.py        |   0
 .../schemas/ISO-IEC29500-4_2016/dml-chart.xsd |   0
 .../ISO-IEC29500-4_2016/dml-chartDrawing.xsd  |   0
 .../ISO-IEC29500-4_2016/dml-diagram.xsd       |   0
 .../ISO-IEC29500-4_2016/dml-lockedCanvas.xsd  |   0
 .../schemas/ISO-IEC29500-4_2016/dml-main.xsd  |   0
 .../ISO-IEC29500-4_2016/dml-picture.xsd       |   0
 .../dml-spreadsheetDrawing.xsd                |   0
 .../dml-wordprocessingDrawing.xsd             |   0
 .../schemas/ISO-IEC29500-4_2016/pml.xsd       |   0
 .../shared-additionalCharacteristics.xsd      |   0
 .../shared-bibliography.xsd                   |   0
 .../shared-commonSimpleTypes.xsd              |   0
 .../shared-customXmlDataProperties.xsd        |   0
 .../shared-customXmlSchemaProperties.xsd      |   0
 .../shared-documentPropertiesCustom.xsd       |   0
 .../shared-documentPropertiesExtended.xsd     |   0
 .../shared-documentPropertiesVariantTypes.xsd |   0
 .../ISO-IEC29500-4_2016/shared-math.xsd       |   0
 .../shared-relationshipReference.xsd          |   0
 .../schemas/ISO-IEC29500-4_2016/sml.xsd       |   0
 .../schemas/ISO-IEC29500-4_2016/vml-main.xsd  |   0
 .../ISO-IEC29500-4_2016/vml-officeDrawing.xsd |   0
 .../vml-presentationDrawing.xsd               |   0
 .../vml-spreadsheetDrawing.xsd                |   0
 .../vml-wordprocessingDrawing.xsd             |   0
 .../schemas/ISO-IEC29500-4_2016/wml.xsd       |   0
 .../schemas/ISO-IEC29500-4_2016/xml.xsd       |   0
 .../ecma/fouth-edition/opc-contentTypes.xsd   |   0
 .../ecma/fouth-edition/opc-coreProperties.xsd |   0
 .../schemas/ecma/fouth-edition/opc-digSig.xsd |   0
 .../ecma/fouth-edition/opc-relationships.xsd  |   0
 .../pptx/scripts/office/schemas/mce/mc.xsd    |   0
 .../office/schemas/microsoft/wml-2010.xsd     |   0
 .../office/schemas/microsoft/wml-2012.xsd     |   0
 .../office/schemas/microsoft/wml-2018.xsd     |   0
 .../office/schemas/microsoft/wml-cex-2018.xsd |   0
 .../office/schemas/microsoft/wml-cid-2016.xsd |   0
 .../microsoft/wml-sdtdatahash-2020.xsd        |   0
 .../schemas/microsoft/wml-symex-2015.xsd      |   0
 .../skills/pptx/scripts/office/soffice.py     |   0
 .../skills/pptx/scripts/office/unpack.py      |   0
 .../skills/pptx/scripts/office/validate.py    |   0
 .../scripts/office/validators/__init__.py     |   0
 .../pptx/scripts/office/validators/base.py    |   0
 .../pptx/scripts/office/validators/docx.py    |   0
 .../pptx/scripts/office/validators/pptx.py    |   0
 .../scripts/office/validators/redlining.py    |   0
 .../skills/pptx/scripts/thumbnail.py          |   0
 .../workflows/circuly/sync-customers.json     |   0
 .../workflows/circuly/sync-products.json      |   0
 .../workflows/circuly/sync-subscriptions.json |   0
 .../general/conversation-auto-archive.json    |   0
 .../workflows/general/conversation-sync.json  |   0
 .../general/customer-status-assessment.json   |   0
 .../workflows/general/document-rag-sync.json  |   0
 .../product-relationship-analysis.json        |   0
 .../workflows/gmail/email-sync.json           |   0
 .../workflows/google_drive/sync.json          |   0
 .../workflows/onedrive/sync.json              |   0
 .../workflows/outlook/email-sync.json         |   0
 .../workflows/shopify/sync-customers.json     |   0
 .../workflows/shopify/sync-products.json      |   0
 services/convex/Dockerfile                    |  12 +-
 services/convex/docker-entrypoint.sh          | 143 +++--
 .../constants/integration-templates.ts        |   2 +-
 services/platform/convex/_generated/api.d.ts  |   2 +
 services/platform/convex/agents/file_utils.ts |  16 +-
 .../platform/convex/branding/file_actions.ts  |  10 +-
 .../platform/convex/branding/file_utils.ts    |  18 +-
 .../platform/convex/branding/queries.test.ts  |  24 +-
 .../convex/governance/retention_actions.ts    |   4 +-
 .../governance/retention_bounds_proposal.ts   |   2 +-
 .../convex/governance/retention_floors.ts     |   2 +-
 .../convex/integrations/file_utils.ts         |  23 +-
 .../convex/lib/config_store/actions.ts        |   3 +
 .../platform/convex/lib/config_store/store.ts |  92 ++-
 .../gmail_draft_filtering.test.ts             |   2 +-
 .../outlook_draft_filtering.test.ts           |   2 +-
 .../convex/organizations/reseed_all_orgs.ts   | 106 ++++
 .../convex/organizations/scaffold.test.ts     | 433 ++++++++------
 .../platform/convex/organizations/scaffold.ts | 530 ++++++++++++------
 .../platform/convex/providers/file_utils.ts   |  14 +-
 .../platform/convex/skills/file_actions.ts    |   6 +-
 .../platform/convex/skills/file_utils.test.ts |  65 ++-
 services/platform/convex/skills/file_utils.ts |  29 +-
 .../platform/convex/workflows/file_utils.ts   |  22 +-
 services/platform/docker-entrypoint.sh        |  41 +-
 services/platform/env.sh                      |   9 +-
 services/platform/lib/config-watcher.ts       |  56 +-
 .../platform/lib/shared/schemas/governance.ts |   2 +-
 .../lib/shared/schemas/retention.test.ts      |   8 +-
 .../utils/example-agents-normalized.test.ts   |  20 +-
 services/platform/server.ts                   |   5 +-
 .../vite-plugins/serve-branding-images.ts     |   6 +-
 tools/cli/src/commands/deploy/index.ts        |  22 +-
 tools/cli/src/commands/migrate.ts             |  41 ++
 tools/cli/src/commands/start/index.ts         |  71 ++-
 tools/cli/src/index.ts                        |   2 +
 tools/cli/src/lib/actions/deploy.ts           | 302 +++++-----
 tools/cli/src/lib/actions/init.ts             | 110 ++--
 .../src/lib/actions/migrate-config-layout.ts  | 104 ++++
 tools/cli/src/lib/actions/reseed-all-orgs.ts  | 116 ++++
 tools/cli/src/lib/actions/start.ts            |  66 +--
 tools/cli/src/lib/actions/update.ts           |  21 +-
 tools/cli/src/lib/docker/exec.ts              |  37 +-
 .../src/lib/migrate-config-layout/script.sh   | 173 ++++++
 tools/cli/src/lib/project/fetch-reference.ts  |   8 +-
 .../migrations/adopt-convex-stateful.ts       |  67 ---
 .../migrations/namespace-caddy-config.ts      |  77 ---
 .../upgrade/migrations/namespace-volumes.ts   | 171 ------
 .../lib/upgrade/migrations/split-convex.ts    | 154 -----
 tools/cli/src/lib/upgrade/registry.ts         |  23 -
 tools/cli/src/lib/upgrade/runner.test.ts      | 230 --------
 tools/cli/src/lib/upgrade/runner.ts           | 199 -------
 tools/cli/src/lib/upgrade/state.ts            |  94 ----
 tools/cli/src/lib/upgrade/types.ts            |  71 ---
 tools/cli/src/lib/upgrade/volume-helpers.ts   | 333 -----------
 191 files changed, 1907 insertions(+), 2332 deletions(-)
 rename examples/{ => default}/agents/chat-agent.json (100%)
 rename examples/{ => default}/agents/crm-assistant.json (100%)
 rename examples/{ => default}/agents/image-creator.json (100%)
 rename examples/{ => default}/agents/integration-assistant.json (100%)
 rename examples/{ => default}/agents/researcher.json (100%)
 rename examples/{ => default}/agents/translator.json (100%)
 rename examples/{ => default}/agents/workflow-assistant.json (100%)
 rename examples/{ => default}/branding/branding.json (100%)
 rename examples/{ => default}/integrations/ai-image/config.json (100%)
 rename examples/{ => default}/integrations/ai-image/connector.ts (100%)
 rename examples/{ => default}/integrations/ai-image/icon.svg (100%)
 rename examples/{ => default}/integrations/circuly/config.json (100%)
 rename examples/{ => default}/integrations/circuly/connector.ts (100%)
 rename examples/{ => default}/integrations/circuly/icon.svg (100%)
 rename examples/{ => default}/integrations/discord/config.json (100%)
 rename examples/{ => default}/integrations/discord/connector.ts (100%)
 rename examples/{ => default}/integrations/discord/icon.svg (100%)
 rename examples/{ => default}/integrations/github/config.json (100%)
 rename examples/{ => default}/integrations/github/connector.ts (100%)
 rename examples/{ => default}/integrations/github/icon.svg (100%)
 rename examples/{ => default}/integrations/gmail/config.json (100%)
 rename examples/{ => default}/integrations/gmail/connector.ts (100%)
 rename examples/{ => default}/integrations/gmail/icon.svg (100%)
 rename examples/{ => default}/integrations/google_drive/config.json (100%)
 rename examples/{ => default}/integrations/google_drive/connector.ts (100%)
 rename examples/{ => default}/integrations/google_drive/icon.svg (100%)
 rename examples/{ => default}/integrations/outlook/config.json (100%)
 rename examples/{ => default}/integrations/outlook/connector.ts (100%)
 rename examples/{ => default}/integrations/outlook/icon.svg (100%)
 rename examples/{ => default}/integrations/protel/config.json (100%)
 rename examples/{ => default}/integrations/protel/icon.svg (100%)
 rename examples/{ => default}/integrations/shopify/config.json (100%)
 rename examples/{ => default}/integrations/shopify/connector.ts (100%)
 rename examples/{ => default}/integrations/shopify/icon.svg (100%)
 rename examples/{ => default}/integrations/slack/config.json (100%)
 rename examples/{ => default}/integrations/slack/connector.ts (100%)
 rename examples/{ => default}/integrations/slack/icon.svg (100%)
 rename examples/{ => default}/integrations/tavily/config.json (100%)
 rename examples/{ => default}/integrations/tavily/connector.ts (100%)
 rename examples/{ => default}/integrations/tavily/icon.svg (100%)
 rename examples/{ => default}/integrations/teams/config.json (100%)
 rename examples/{ => default}/integrations/teams/connector.ts (100%)
 rename examples/{ => default}/integrations/teams/icon.svg (100%)
 rename examples/{ => default}/integrations/twilio/config.json (100%)
 rename examples/{ => default}/integrations/twilio/connector.ts (100%)
 rename examples/{ => default}/integrations/twilio/icon.svg (100%)
 rename examples/{ => default}/providers/openai.json (100%)
 rename examples/{ => default}/providers/openrouter.json (100%)
 rename examples/{ => default}/providers/vercel-gateway.json (100%)
 rename examples/{retention/default.json => default/retention.json} (100%)
 rename examples/{ => default}/skills/pptx/LICENSE.txt (100%)
 rename examples/{ => default}/skills/pptx/SKILL.md (100%)
 rename examples/{ => default}/skills/pptx/editing.md (100%)
 rename examples/{ => default}/skills/pptx/pptxgenjs.md (100%)
 rename examples/{ => default}/skills/pptx/scripts/__init__.py (100%)
 rename examples/{ => default}/skills/pptx/scripts/add_slide.py (100%)
 rename examples/{ => default}/skills/pptx/scripts/clean.py (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/helpers/__init__.py (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/helpers/merge_runs.py (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/helpers/simplify_redlines.py (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/pack.py (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/mce/mc.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/microsoft/wml-2010.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/microsoft/wml-2012.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/microsoft/wml-2018.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/microsoft/wml-cex-2018.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/microsoft/wml-cid-2016.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/schemas/microsoft/wml-symex-2015.xsd (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/soffice.py (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/unpack.py (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/validate.py (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/validators/__init__.py (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/validators/base.py (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/validators/docx.py (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/validators/pptx.py (100%)
 rename examples/{ => default}/skills/pptx/scripts/office/validators/redlining.py (100%)
 rename examples/{ => default}/skills/pptx/scripts/thumbnail.py (100%)
 rename examples/{ => default}/workflows/circuly/sync-customers.json (100%)
 rename examples/{ => default}/workflows/circuly/sync-products.json (100%)
 rename examples/{ => default}/workflows/circuly/sync-subscriptions.json (100%)
 rename examples/{ => default}/workflows/general/conversation-auto-archive.json (100%)
 rename examples/{ => default}/workflows/general/conversation-sync.json (100%)
 rename examples/{ => default}/workflows/general/customer-status-assessment.json (100%)
 rename examples/{ => default}/workflows/general/document-rag-sync.json (100%)
 rename examples/{ => default}/workflows/general/product-relationship-analysis.json (100%)
 rename examples/{ => default}/workflows/gmail/email-sync.json (100%)
 rename examples/{ => default}/workflows/google_drive/sync.json (100%)
 rename examples/{ => default}/workflows/onedrive/sync.json (100%)
 rename examples/{ => default}/workflows/outlook/email-sync.json (100%)
 rename examples/{ => default}/workflows/shopify/sync-customers.json (100%)
 rename examples/{ => default}/workflows/shopify/sync-products.json (100%)
 create mode 100644 services/platform/convex/organizations/reseed_all_orgs.ts
 create mode 100644 tools/cli/src/commands/migrate.ts
 create mode 100644 tools/cli/src/lib/actions/migrate-config-layout.ts
 create mode 100644 tools/cli/src/lib/actions/reseed-all-orgs.ts
 create mode 100644 tools/cli/src/lib/migrate-config-layout/script.sh
 delete mode 100644 tools/cli/src/lib/upgrade/migrations/adopt-convex-stateful.ts
 delete mode 100644 tools/cli/src/lib/upgrade/migrations/namespace-caddy-config.ts
 delete mode 100644 tools/cli/src/lib/upgrade/migrations/namespace-volumes.ts
 delete mode 100644 tools/cli/src/lib/upgrade/migrations/split-convex.ts
 delete mode 100644 tools/cli/src/lib/upgrade/registry.ts
 delete mode 100644 tools/cli/src/lib/upgrade/runner.test.ts
 delete mode 100644 tools/cli/src/lib/upgrade/runner.ts
 delete mode 100644 tools/cli/src/lib/upgrade/state.ts
 delete mode 100644 tools/cli/src/lib/upgrade/types.ts
 delete mode 100644 tools/cli/src/lib/upgrade/volume-helpers.ts
diff --git a/.dockerignore b/.dockerignore
index 6b2f4d3fec..4f2099eb36 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -22,7 +22,7 @@ services/platform/.env*.local
 # Provider secrets
 # ============================================================================
 # *.secrets.json carries credentials in either form (SOPS-encrypted or
-# plaintext). The convex image ships seeds from examples/providers/, but
+# plaintext). The convex image ships seeds from examples/default/providers/, but
 # the secrets siblings should never bake into a layer — the entrypoint
 # already filters them at seed time, but exclude them from the build
 # context entirely so they cannot leak via image inspection.
diff --git a/docs/de/develop/integrations.md b/docs/de/develop/integrations.md
index e55bd8a932..fbc213ece1 100644
--- a/docs/de/develop/integrations.md
+++ b/docs/de/develop/integrations.md
@@ -49,7 +49,7 @@ Die Operation taucht auf Agents als Tool-Familie auf, sobald die Org Credentials
 | MCP-Server    | Die Brücke muss ein langlebiger Prozess sein — lokale Dateien, eine eigene CLI, ein System, das aus dem Netz von Tale unerreichbar ist. |
 | Connector-TS  | Das REST-Manifest deckt 80 % der API ab, aber eine Operation braucht Response-Formung, die das Manifest nicht deklarieren kann.         |
 
-Die ausgelieferten Integrations unter [Platform > Integrations](/de/platform/integrations/overview) sind der Katalog der REST-Manifeste, die Tale ausliefert — lies ihre Configs in `examples/integrations/` für die Muster, die du kopierst.
+Die ausgelieferten Integrations unter [Platform > Integrations](/de/platform/integrations/overview) sind der Katalog der REST-Manifeste, die Tale ausliefert — lies ihre Configs in `examples/default/integrations/` für die Muster, die du kopierst.
 
 ## SQL-Adapter
 
diff --git a/docs/de/platform/integrations/overview.md b/docs/de/platform/integrations/overview.md
index 6a7c3ea899..2b56f09d82 100644
--- a/docs/de/platform/integrations/overview.md
+++ b/docs/de/platform/integrations/overview.md
@@ -5,7 +5,7 @@ description: Drittsysteme, aus denen Tale liest und in die es schreibt — Kommu
 
 Integrationen sind die Brücken zwischen Tale und dem Rest deines Stacks. Agents rufen sie als Tools auf, Workflows triggern sie an Schritten, und die Dokumenten-Pipeline zieht Dateien aus ihnen. Jede Integration ist eine einzige JSON-Konfiguration plus eine Credential, die die Org einmal speichert; einmal verbunden, kann alles in Tale sie ohne erneute Authentifizierung nutzen. Diese Übersicht benennt die ausgelieferten Integrationen, gruppiert danach, was sie tun.
 
-Die Form einer Integration ist über jeden Eintrag unten gleich — eine OpenAI-kompatible REST-Oberfläche oder ein OAuth2-Tanz, mit in einer JSON-Konfiguration unter `examples/integrations/` deklarierten Operationen. Benutzerdefinierte Integrationen folgen derselben Form; eine Code-Änderung brauchst du nicht, um eine hinzuzufügen.
+Die Form einer Integration ist über jeden Eintrag unten gleich — eine OpenAI-kompatible REST-Oberfläche oder ein OAuth2-Tanz, mit in einer JSON-Konfiguration unter `examples/default/integrations/` deklarierten Operationen. Benutzerdefinierte Integrationen folgen derselben Form; eine Code-Änderung brauchst du nicht, um eine hinzuzufügen.
 
 ## Wie Integrationen sich von MCP unterscheiden
 
@@ -65,7 +65,7 @@ Microsoft 365 deckt auch Identität ab. Sie unter **Einstellungen > Integratione
 
 ## Eine eigene Integration hinzufügen
 
-Eigene Integrationen folgen derselben JSON-Form wie die oben. Leg eine Konfiguration in `TALE_CONFIG_DIR/integrations/<slug>/config.json` ab, die die Operationen, die Auth-Methode und die erlaubten Hosts deklariert; die Integration erscheint in **Einstellungen > Integrationen**, damit User sie verbinden können. Die Form und die Validierungsregeln leben neben den ausgelieferten Konfigurationen in `examples/integrations/`.
+Eigene Integrationen folgen derselben JSON-Form wie die oben. Leg eine Konfiguration in `TALE_CONFIG_DIR/integrations/<slug>/config.json` ab, die die Operationen, die Auth-Methode und die erlaubten Hosts deklariert; die Integration erscheint in **Einstellungen > Integrationen**, damit User sie verbinden können. Die Form und die Validierungsregeln leben neben den ausgelieferten Konfigurationen in `examples/default/integrations/`.
 
 Für reichere oder selbst gehostete Brücken sind [MCP-Server](/de/platform/integrations/mcp-servers) die alternative Oberfläche — jeder MCP-Server, den du registrierst, fügt seine Tools dem Agent-Werkzeuggürtel hinzu mit pro-Tool-Genehmigung.
 
diff --git a/docs/de/platform/models.md b/docs/de/platform/models.md
index f0e582366b..60c060bad8 100644
--- a/docs/de/platform/models.md
+++ b/docs/de/platform/models.md
@@ -3,9 +3,9 @@ title: Modelle out of the box
 description: Welche Provider und Modelle eine frische Tale-Instanz mitbringt — OpenRouter für Chat und Vision, OpenAI für Sprache, Vercel AI Gateway für Bildgenerierung.
 ---
 
-Eine frische Tale-Instanz bringt drei konfigurierte Provider mit: OpenRouter für Chat, Vision und Embeddings; OpenAI für Speech-to-Text und Text-to-Speech; Vercel AI Gateway für Bildgenerierung. Die Default-Agents in `examples/agents/` greifen auf Modelle in einem dieser drei Buckets zu, und die meisten Teams bleiben wochenlang bei den Defaults, bevor sie etwas tauschen. Diese Seite listet, was ausgeliefert wird, und verlinkt auf den vollen Katalog jedes Providers.
+Eine frische Tale-Instanz bringt drei konfigurierte Provider mit: OpenRouter für Chat, Vision und Embeddings; OpenAI für Speech-to-Text und Text-to-Speech; Vercel AI Gateway für Bildgenerierung. Die Default-Agents in `examples/default/agents/` greifen auf Modelle in einem dieser drei Buckets zu, und die meisten Teams bleiben wochenlang bei den Defaults, bevor sie etwas tauschen. Diese Seite listet, was ausgeliefert wird, und verlinkt auf den vollen Katalog jedes Providers.
 
-Modelle driften schneller als Docs. Die Listen unten stimmen zum Zeitpunkt, an dem `examples/providers/*.json` geschrieben wurde; die kanonische Wahrheit sind die JSON-Dateien, und das kanonische „was heute erreichbar ist" zeigt die Seite **Einstellungen > Provider** auf deiner Instanz.
+Modelle driften schneller als Docs. Die Listen unten stimmen zum Zeitpunkt, an dem `examples/default/providers/*.json` geschrieben wurde; die kanonische Wahrheit sind die JSON-Dateien, und das kanonische „was heute erreichbar ist" zeigt die Seite **Einstellungen > Provider** auf deiner Instanz.
 
 ## Die drei Provider
 
diff --git a/docs/de/self-hosted/configuration/providers.md b/docs/de/self-hosted/configuration/providers.md
index de714067a9..f8e904321d 100644
--- a/docs/de/self-hosted/configuration/providers.md
+++ b/docs/de/self-hosted/configuration/providers.md
@@ -31,7 +31,7 @@ Die Referenz ist das Dateiformat auf Platte und die Reihenfolge der Operationen,
 }
 ```
 
-Die vollständige Menge der Felder lebt in [`examples/providers/`](https://github.com/tale-project/tale/tree/main/examples/providers) — `openai.json`, `openrouter.json` und `vercel-gateway.json` decken die drei Formen ab, die du wahrscheinlich brauchst.
+Die vollständige Menge der Felder lebt in [`examples/default/providers/`](https://github.com/tale-project/tale/tree/main/examples/providers) — `openai.json`, `openrouter.json` und `vercel-gateway.json` decken die drei Formen ab, die du wahrscheinlich brauchst.
 
 ## Die Secrets-Datei
 
diff --git a/docs/en/develop/integrations.md b/docs/en/develop/integrations.md
index b24f5e5aa7..7a6a195ecc 100644
--- a/docs/en/develop/integrations.md
+++ b/docs/en/develop/integrations.md
@@ -49,7 +49,7 @@ The operation surfaces on agents as a tool family the moment the org connects cr
 | MCP server    | The bridge needs to be a long-lived process — local files, a CLI you own, a system that cannot be reached from Tale's network.        |
 | Connector TS  | The REST manifest covers 80 % of the API but one operation needs response shaping the manifest cannot declare.                        |
 
-The shipped integrations under [Platform > Integrations](/platform/integrations/overview) are the catalogue of REST manifests Tale ships — read their configs in `examples/integrations/` for the patterns you will copy.
+The shipped integrations under [Platform > Integrations](/platform/integrations/overview) are the catalogue of REST manifests Tale ships — read their configs in `examples/default/integrations/` for the patterns you will copy.
 
 ## SQL adapters
 
diff --git a/docs/en/platform/integrations/overview.md b/docs/en/platform/integrations/overview.md
index 937f93f647..788d78febb 100644
--- a/docs/en/platform/integrations/overview.md
+++ b/docs/en/platform/integrations/overview.md
@@ -5,7 +5,7 @@ description: Third-party systems Tale can read from and write to — communicati
 
 Integrations are the bridges between Tale and the rest of your stack. Agents call them as tools, workflows trigger them at steps, and the documents pipeline pulls files from them. Each integration is a single JSON config plus a credential the org stores once; once connected, anything in Tale can use it without re-authentication. This overview names the shipped integrations grouped by what they do.
 
-The shape of an integration is the same across every entry below — an OpenAI-compatible REST surface or an OAuth2 dance, with operations declared in a JSON config under `examples/integrations/`. Custom integrations follow the same shape; you do not need a code change to add one.
+The shape of an integration is the same across every entry below — an OpenAI-compatible REST surface or an OAuth2 dance, with operations declared in a JSON config under `examples/default/integrations/`. Custom integrations follow the same shape; you do not need a code change to add one.
 
 ## How integrations differ from MCP
 
@@ -65,7 +65,7 @@ Microsoft 365 also covers identity. Connecting it under **Settings > Integration
 
 ## Adding a custom integration
 
-Custom integrations follow the same JSON shape as the ones above. Drop a config into `TALE_CONFIG_DIR/integrations/<slug>/config.json` declaring the operations, auth method, and allowed hosts; the integration appears in **Settings > Integrations** for users to connect. The shape and validation rules live alongside the shipped configs in `examples/integrations/`.
+Custom integrations follow the same JSON shape as the ones above. Drop a config into `TALE_CONFIG_DIR/integrations/<slug>/config.json` declaring the operations, auth method, and allowed hosts; the integration appears in **Settings > Integrations** for users to connect. The shape and validation rules live alongside the shipped configs in `examples/default/integrations/`.
 
 For richer or self-hosted bridges, [MCP servers](/platform/integrations/mcp-servers) are the alternative surface — every MCP server you register adds its tools to the agent toolbelt with per-tool approval.
 
diff --git a/docs/en/platform/models.md b/docs/en/platform/models.md
index ab2e8e46b9..a6d94e6b83 100644
--- a/docs/en/platform/models.md
+++ b/docs/en/platform/models.md
@@ -3,9 +3,9 @@ title: Models out of the box
 description: Which providers and models a fresh Tale instance ships with — OpenRouter for chat and vision, OpenAI for voice, Vercel AI Gateway for image generation.
 ---
 
-A fresh Tale instance ships with three providers configured: OpenRouter for chat, vision, and embeddings; OpenAI for speech-to-text and text-to-speech; Vercel AI Gateway for image generation. The default agents in `examples/agents/` reach for models in one of those three buckets, and most teams stay on the defaults for weeks before swapping anything. This page lists what is shipped and links to each provider's full catalogue.
+A fresh Tale instance ships with three providers configured: OpenRouter for chat, vision, and embeddings; OpenAI for speech-to-text and text-to-speech; Vercel AI Gateway for image generation. The default agents in `examples/default/agents/` reach for models in one of those three buckets, and most teams stay on the defaults for weeks before swapping anything. This page lists what is shipped and links to each provider's full catalogue.
 
-Models drift faster than docs. The lists below are correct at the time `examples/providers/*.json` was written; the canonical truth is the JSON files, and the canonical "what is reachable today" is what the **Settings > Providers** page shows on your instance.
+Models drift faster than docs. The lists below are correct at the time `examples/default/providers/*.json` was written; the canonical truth is the JSON files, and the canonical "what is reachable today" is what the **Settings > Providers** page shows on your instance.
 
 ## The three providers
 
diff --git a/docs/en/self-hosted/configuration/providers.md b/docs/en/self-hosted/configuration/providers.md
index fe9f005d32..799f97a0f2 100644
--- a/docs/en/self-hosted/configuration/providers.md
+++ b/docs/en/self-hosted/configuration/providers.md
@@ -31,7 +31,7 @@ The reference is the file format on disk and the order operations follow when ad
 }
 ```
 
-The full set of fields lives in [`examples/providers/`](https://github.com/tale-project/tale/tree/main/examples/providers) — `openai.json`, `openrouter.json`, and `vercel-gateway.json` cover the three shapes you are likely to need.
+The full set of fields lives in [`examples/default/providers/`](https://github.com/tale-project/tale/tree/main/examples/providers) — `openai.json`, `openrouter.json`, and `vercel-gateway.json` cover the three shapes you are likely to need.
 
 ## The secrets file
 
diff --git a/docs/fr/develop/integrations.md b/docs/fr/develop/integrations.md
index bf7393aebe..0b266ed580 100644
--- a/docs/fr/develop/integrations.md
+++ b/docs/fr/develop/integrations.md
@@ -49,7 +49,7 @@ L'operation apparaît sur les agents comme une famille de tools dès que l'org b
 | Serveur MCP    | Le pont doit être un processus de longue durée — fichiers locaux, une CLI à toi, un système inatteignable depuis le réseau de Tale.  |
 | Connecteur TS  | Le manifeste REST couvre 80 % de l'API mais une operation a besoin d'une mise en forme que le manifeste ne sait pas déclarer.        |
 
-Les intégrations livrées sous [Platform > Intégrations](/fr/platform/integrations/overview) sont le catalogue des manifestes REST que Tale livre — lis leurs configs dans `examples/integrations/` pour les motifs que tu copieras.
+Les intégrations livrées sous [Platform > Intégrations](/fr/platform/integrations/overview) sont le catalogue des manifestes REST que Tale livre — lis leurs configs dans `examples/default/integrations/` pour les motifs que tu copieras.
 
 ## Adaptateurs SQL
 
diff --git a/docs/fr/platform/integrations/overview.md b/docs/fr/platform/integrations/overview.md
index ab3b7c0e15..cd4d1ecefb 100644
--- a/docs/fr/platform/integrations/overview.md
+++ b/docs/fr/platform/integrations/overview.md
@@ -5,7 +5,7 @@ description: Systèmes tiers que Tale lit et écrit — communication, stockage,
 
 Les intégrations sont les ponts entre Tale et le reste de ta pile. Les agents les appellent comme outils, les workflows les déclenchent à des étapes, et la pipeline de documents en tire des fichiers. Chaque intégration est une seule configuration JSON plus un identifiant que l'organisation enregistre une fois ; une fois connectée, n'importe quoi dans Tale peut l'utiliser sans nouvelle authentification. Cet aperçu nomme les intégrations livrées, groupées par ce qu'elles font.
 
-La forme d'une intégration est la même pour chaque entrée ci-dessous — une surface REST compatible OpenAI ou une danse OAuth2, avec des opérations déclarées dans une configuration JSON sous `examples/integrations/`. Les intégrations personnalisées suivent la même forme ; tu n'as pas besoin de modifier le code pour en ajouter une.
+La forme d'une intégration est la même pour chaque entrée ci-dessous — une surface REST compatible OpenAI ou une danse OAuth2, avec des opérations déclarées dans une configuration JSON sous `examples/default/integrations/`. Les intégrations personnalisées suivent la même forme ; tu n'as pas besoin de modifier le code pour en ajouter une.
 
 ## En quoi les intégrations diffèrent de MCP
 
@@ -65,7 +65,7 @@ Microsoft 365 couvre aussi l'identité. La connecter sous **Paramètres > Intég
 
 ## Ajouter une intégration personnalisée
 
-Les intégrations personnalisées suivent la même forme JSON que celles ci-dessus. Dépose une configuration dans `TALE_CONFIG_DIR/integrations/<slug>/config.json` déclarant les opérations, la méthode d'auth et les hôtes autorisés ; l'intégration apparaît sous **Paramètres > Intégrations** pour que les utilisateurs la connectent. La forme et les règles de validation vivent à côté des configurations livrées dans `examples/integrations/`.
+Les intégrations personnalisées suivent la même forme JSON que celles ci-dessus. Dépose une configuration dans `TALE_CONFIG_DIR/integrations/<slug>/config.json` déclarant les opérations, la méthode d'auth et les hôtes autorisés ; l'intégration apparaît sous **Paramètres > Intégrations** pour que les utilisateurs la connectent. La forme et les règles de validation vivent à côté des configurations livrées dans `examples/default/integrations/`.
 
 Pour des ponts plus riches ou auto-hébergés, les [serveurs MCP](/fr/platform/integrations/mcp-servers) sont la surface alternative — chaque serveur MCP que tu enregistres ajoute ses outils à la ceinture d'outils de l'agent avec approbation par outil.
 
diff --git a/docs/fr/platform/models.md b/docs/fr/platform/models.md
index 84cddaf9f5..ba1180e796 100644
--- a/docs/fr/platform/models.md
+++ b/docs/fr/platform/models.md
@@ -3,9 +3,9 @@ title: Modèles livrés en standard
 description: Quels fournisseurs et modèles une instance Tale toute neuve embarque — OpenRouter pour le chat et la vision, OpenAI pour la voix, Vercel AI Gateway pour la génération d'images.
 ---
 
-Une instance Tale toute neuve embarque trois fournisseurs configurés : OpenRouter pour le chat, la vision et les embeddings ; OpenAI pour la reconnaissance et la synthèse vocales ; Vercel AI Gateway pour la génération d'images. Les agents par défaut dans `examples/agents/` puisent dans l'un de ces trois seaux, et la plupart des équipes restent sur les défauts pendant des semaines avant d'en changer. Cette page liste ce qui est livré et renvoie vers le catalogue complet de chaque fournisseur.
+Une instance Tale toute neuve embarque trois fournisseurs configurés : OpenRouter pour le chat, la vision et les embeddings ; OpenAI pour la reconnaissance et la synthèse vocales ; Vercel AI Gateway pour la génération d'images. Les agents par défaut dans `examples/default/agents/` puisent dans l'un de ces trois seaux, et la plupart des équipes restent sur les défauts pendant des semaines avant d'en changer. Cette page liste ce qui est livré et renvoie vers le catalogue complet de chaque fournisseur.
 
-Les modèles dérivent plus vite que la doc. Les listes ci-dessous sont correctes au moment où `examples/providers/*.json` a été écrit ; la vérité canonique, ce sont les fichiers JSON, et le « ce qui est joignable aujourd'hui » canonique est ce que montre la page **Paramètres > Providers** sur ton instance.
+Les modèles dérivent plus vite que la doc. Les listes ci-dessous sont correctes au moment où `examples/default/providers/*.json` a été écrit ; la vérité canonique, ce sont les fichiers JSON, et le « ce qui est joignable aujourd'hui » canonique est ce que montre la page **Paramètres > Providers** sur ton instance.
 
 ## Les trois fournisseurs
 
diff --git a/docs/fr/self-hosted/configuration/providers.md b/docs/fr/self-hosted/configuration/providers.md
index 7f73ea9d34..a63161119c 100644
--- a/docs/fr/self-hosted/configuration/providers.md
+++ b/docs/fr/self-hosted/configuration/providers.md
@@ -31,7 +31,7 @@ La référence est le format de fichier sur disque et l'ordre des opérations à
 }
 ```
 
-L'ensemble complet des champs vit dans [`examples/providers/`](https://github.com/tale-project/tale/tree/main/examples/providers) — `openai.json`, `openrouter.json` et `vercel-gateway.json` couvrent les trois formes dont tu auras probablement besoin.
+L'ensemble complet des champs vit dans [`examples/default/providers/`](https://github.com/tale-project/tale/tree/main/examples/providers) — `openai.json`, `openrouter.json` et `vercel-gateway.json` couvrent les trois formes dont tu auras probablement besoin.
 
 ## Le fichier de secrets
 
diff --git a/examples/agents/chat-agent.json b/examples/default/agents/chat-agent.json
similarity index 100%
rename from examples/agents/chat-agent.json
rename to examples/default/agents/chat-agent.json
diff --git a/examples/agents/crm-assistant.json b/examples/default/agents/crm-assistant.json
similarity index 100%
rename from examples/agents/crm-assistant.json
rename to examples/default/agents/crm-assistant.json
diff --git a/examples/agents/image-creator.json b/examples/default/agents/image-creator.json
similarity index 100%
rename from examples/agents/image-creator.json
rename to examples/default/agents/image-creator.json
diff --git a/examples/agents/integration-assistant.json b/examples/default/agents/integration-assistant.json
similarity index 100%
rename from examples/agents/integration-assistant.json
rename to examples/default/agents/integration-assistant.json
diff --git a/examples/agents/researcher.json b/examples/default/agents/researcher.json
similarity index 100%
rename from examples/agents/researcher.json
rename to examples/default/agents/researcher.json
diff --git a/examples/agents/translator.json b/examples/default/agents/translator.json
similarity index 100%
rename from examples/agents/translator.json
rename to examples/default/agents/translator.json
diff --git a/examples/agents/workflow-assistant.json b/examples/default/agents/workflow-assistant.json
similarity index 100%
rename from examples/agents/workflow-assistant.json
rename to examples/default/agents/workflow-assistant.json
diff --git a/examples/branding/branding.json b/examples/default/branding/branding.json
similarity index 100%
rename from examples/branding/branding.json
rename to examples/default/branding/branding.json
diff --git a/examples/integrations/ai-image/config.json b/examples/default/integrations/ai-image/config.json
similarity index 100%
rename from examples/integrations/ai-image/config.json
rename to examples/default/integrations/ai-image/config.json
diff --git a/examples/integrations/ai-image/connector.ts b/examples/default/integrations/ai-image/connector.ts
similarity index 100%
rename from examples/integrations/ai-image/connector.ts
rename to examples/default/integrations/ai-image/connector.ts
diff --git a/examples/integrations/ai-image/icon.svg b/examples/default/integrations/ai-image/icon.svg
similarity index 100%
rename from examples/integrations/ai-image/icon.svg
rename to examples/default/integrations/ai-image/icon.svg
diff --git a/examples/integrations/circuly/config.json b/examples/default/integrations/circuly/config.json
similarity index 100%
rename from examples/integrations/circuly/config.json
rename to examples/default/integrations/circuly/config.json
diff --git a/examples/integrations/circuly/connector.ts b/examples/default/integrations/circuly/connector.ts
similarity index 100%
rename from examples/integrations/circuly/connector.ts
rename to examples/default/integrations/circuly/connector.ts
diff --git a/examples/integrations/circuly/icon.svg b/examples/default/integrations/circuly/icon.svg
similarity index 100%
rename from examples/integrations/circuly/icon.svg
rename to examples/default/integrations/circuly/icon.svg
diff --git a/examples/integrations/discord/config.json b/examples/default/integrations/discord/config.json
similarity index 100%
rename from examples/integrations/discord/config.json
rename to examples/default/integrations/discord/config.json
diff --git a/examples/integrations/discord/connector.ts b/examples/default/integrations/discord/connector.ts
similarity index 100%
rename from examples/integrations/discord/connector.ts
rename to examples/default/integrations/discord/connector.ts
diff --git a/examples/integrations/discord/icon.svg b/examples/default/integrations/discord/icon.svg
similarity index 100%
rename from examples/integrations/discord/icon.svg
rename to examples/default/integrations/discord/icon.svg
diff --git a/examples/integrations/github/config.json b/examples/default/integrations/github/config.json
similarity index 100%
rename from examples/integrations/github/config.json
rename to examples/default/integrations/github/config.json
diff --git a/examples/integrations/github/connector.ts b/examples/default/integrations/github/connector.ts
similarity index 100%
rename from examples/integrations/github/connector.ts
rename to examples/default/integrations/github/connector.ts
diff --git a/examples/integrations/github/icon.svg b/examples/default/integrations/github/icon.svg
similarity index 100%
rename from examples/integrations/github/icon.svg
rename to examples/default/integrations/github/icon.svg
diff --git a/examples/integrations/gmail/config.json b/examples/default/integrations/gmail/config.json
similarity index 100%
rename from examples/integrations/gmail/config.json
rename to examples/default/integrations/gmail/config.json
diff --git a/examples/integrations/gmail/connector.ts b/examples/default/integrations/gmail/connector.ts
similarity index 100%
rename from examples/integrations/gmail/connector.ts
rename to examples/default/integrations/gmail/connector.ts
diff --git a/examples/integrations/gmail/icon.svg b/examples/default/integrations/gmail/icon.svg
similarity index 100%
rename from examples/integrations/gmail/icon.svg
rename to examples/default/integrations/gmail/icon.svg
diff --git a/examples/integrations/google_drive/config.json b/examples/default/integrations/google_drive/config.json
similarity index 100%
rename from examples/integrations/google_drive/config.json
rename to examples/default/integrations/google_drive/config.json
diff --git a/examples/integrations/google_drive/connector.ts b/examples/default/integrations/google_drive/connector.ts
similarity index 100%
rename from examples/integrations/google_drive/connector.ts
rename to examples/default/integrations/google_drive/connector.ts
diff --git a/examples/integrations/google_drive/icon.svg b/examples/default/integrations/google_drive/icon.svg
similarity index 100%
rename from examples/integrations/google_drive/icon.svg
rename to examples/default/integrations/google_drive/icon.svg
diff --git a/examples/integrations/outlook/config.json b/examples/default/integrations/outlook/config.json
similarity index 100%
rename from examples/integrations/outlook/config.json
rename to examples/default/integrations/outlook/config.json
diff --git a/examples/integrations/outlook/connector.ts b/examples/default/integrations/outlook/connector.ts
similarity index 100%
rename from examples/integrations/outlook/connector.ts
rename to examples/default/integrations/outlook/connector.ts
diff --git a/examples/integrations/outlook/icon.svg b/examples/default/integrations/outlook/icon.svg
similarity index 100%
rename from examples/integrations/outlook/icon.svg
rename to examples/default/integrations/outlook/icon.svg
diff --git a/examples/integrations/protel/config.json b/examples/default/integrations/protel/config.json
similarity index 100%
rename from examples/integrations/protel/config.json
rename to examples/default/integrations/protel/config.json
diff --git a/examples/integrations/protel/icon.svg b/examples/default/integrations/protel/icon.svg
similarity index 100%
rename from examples/integrations/protel/icon.svg
rename to examples/default/integrations/protel/icon.svg
diff --git a/examples/integrations/shopify/config.json b/examples/default/integrations/shopify/config.json
similarity index 100%
rename from examples/integrations/shopify/config.json
rename to examples/default/integrations/shopify/config.json
diff --git a/examples/integrations/shopify/connector.ts b/examples/default/integrations/shopify/connector.ts
similarity index 100%
rename from examples/integrations/shopify/connector.ts
rename to examples/default/integrations/shopify/connector.ts
diff --git a/examples/integrations/shopify/icon.svg b/examples/default/integrations/shopify/icon.svg
similarity index 100%
rename from examples/integrations/shopify/icon.svg
rename to examples/default/integrations/shopify/icon.svg
diff --git a/examples/integrations/slack/config.json b/examples/default/integrations/slack/config.json
similarity index 100%
rename from examples/integrations/slack/config.json
rename to examples/default/integrations/slack/config.json
diff --git a/examples/integrations/slack/connector.ts b/examples/default/integrations/slack/connector.ts
similarity index 100%
rename from examples/integrations/slack/connector.ts
rename to examples/default/integrations/slack/connector.ts
diff --git a/examples/integrations/slack/icon.svg b/examples/default/integrations/slack/icon.svg
similarity index 100%
rename from examples/integrations/slack/icon.svg
rename to examples/default/integrations/slack/icon.svg
diff --git a/examples/integrations/tavily/config.json b/examples/default/integrations/tavily/config.json
similarity index 100%
rename from examples/integrations/tavily/config.json
rename to examples/default/integrations/tavily/config.json
diff --git a/examples/integrations/tavily/connector.ts b/examples/default/integrations/tavily/connector.ts
similarity index 100%
rename from examples/integrations/tavily/connector.ts
rename to examples/default/integrations/tavily/connector.ts
diff --git a/examples/integrations/tavily/icon.svg b/examples/default/integrations/tavily/icon.svg
similarity index 100%
rename from examples/integrations/tavily/icon.svg
rename to examples/default/integrations/tavily/icon.svg
diff --git a/examples/integrations/teams/config.json b/examples/default/integrations/teams/config.json
similarity index 100%
rename from examples/integrations/teams/config.json
rename to examples/default/integrations/teams/config.json
diff --git a/examples/integrations/teams/connector.ts b/examples/default/integrations/teams/connector.ts
similarity index 100%
rename from examples/integrations/teams/connector.ts
rename to examples/default/integrations/teams/connector.ts
diff --git a/examples/integrations/teams/icon.svg b/examples/default/integrations/teams/icon.svg
similarity index 100%
rename from examples/integrations/teams/icon.svg
rename to examples/default/integrations/teams/icon.svg
diff --git a/examples/integrations/twilio/config.json b/examples/default/integrations/twilio/config.json
similarity index 100%
rename from examples/integrations/twilio/config.json
rename to examples/default/integrations/twilio/config.json
diff --git a/examples/integrations/twilio/connector.ts b/examples/default/integrations/twilio/connector.ts
similarity index 100%
rename from examples/integrations/twilio/connector.ts
rename to examples/default/integrations/twilio/connector.ts
diff --git a/examples/integrations/twilio/icon.svg b/examples/default/integrations/twilio/icon.svg
similarity index 100%
rename from examples/integrations/twilio/icon.svg
rename to examples/default/integrations/twilio/icon.svg
diff --git a/examples/providers/openai.json b/examples/default/providers/openai.json
similarity index 100%
rename from examples/providers/openai.json
rename to examples/default/providers/openai.json
diff --git a/examples/providers/openrouter.json b/examples/default/providers/openrouter.json
similarity index 100%
rename from examples/providers/openrouter.json
rename to examples/default/providers/openrouter.json
diff --git a/examples/providers/vercel-gateway.json b/examples/default/providers/vercel-gateway.json
similarity index 100%
rename from examples/providers/vercel-gateway.json
rename to examples/default/providers/vercel-gateway.json
diff --git a/examples/retention/default.json b/examples/default/retention.json
similarity index 100%
rename from examples/retention/default.json
rename to examples/default/retention.json
diff --git a/examples/skills/pptx/LICENSE.txt b/examples/default/skills/pptx/LICENSE.txt
similarity index 100%
rename from examples/skills/pptx/LICENSE.txt
rename to examples/default/skills/pptx/LICENSE.txt
diff --git a/examples/skills/pptx/SKILL.md b/examples/default/skills/pptx/SKILL.md
similarity index 100%
rename from examples/skills/pptx/SKILL.md
rename to examples/default/skills/pptx/SKILL.md
diff --git a/examples/skills/pptx/editing.md b/examples/default/skills/pptx/editing.md
similarity index 100%
rename from examples/skills/pptx/editing.md
rename to examples/default/skills/pptx/editing.md
diff --git a/examples/skills/pptx/pptxgenjs.md b/examples/default/skills/pptx/pptxgenjs.md
similarity index 100%
rename from examples/skills/pptx/pptxgenjs.md
rename to examples/default/skills/pptx/pptxgenjs.md
diff --git a/examples/skills/pptx/scripts/__init__.py b/examples/default/skills/pptx/scripts/__init__.py
similarity index 100%
rename from examples/skills/pptx/scripts/__init__.py
rename to examples/default/skills/pptx/scripts/__init__.py
diff --git a/examples/skills/pptx/scripts/add_slide.py b/examples/default/skills/pptx/scripts/add_slide.py
similarity index 100%
rename from examples/skills/pptx/scripts/add_slide.py
rename to examples/default/skills/pptx/scripts/add_slide.py
diff --git a/examples/skills/pptx/scripts/clean.py b/examples/default/skills/pptx/scripts/clean.py
similarity index 100%
rename from examples/skills/pptx/scripts/clean.py
rename to examples/default/skills/pptx/scripts/clean.py
diff --git a/examples/skills/pptx/scripts/office/helpers/__init__.py b/examples/default/skills/pptx/scripts/office/helpers/__init__.py
similarity index 100%
rename from examples/skills/pptx/scripts/office/helpers/__init__.py
rename to examples/default/skills/pptx/scripts/office/helpers/__init__.py
diff --git a/examples/skills/pptx/scripts/office/helpers/merge_runs.py b/examples/default/skills/pptx/scripts/office/helpers/merge_runs.py
similarity index 100%
rename from examples/skills/pptx/scripts/office/helpers/merge_runs.py
rename to examples/default/skills/pptx/scripts/office/helpers/merge_runs.py
diff --git a/examples/skills/pptx/scripts/office/helpers/simplify_redlines.py b/examples/default/skills/pptx/scripts/office/helpers/simplify_redlines.py
similarity index 100%
rename from examples/skills/pptx/scripts/office/helpers/simplify_redlines.py
rename to examples/default/skills/pptx/scripts/office/helpers/simplify_redlines.py
diff --git a/examples/skills/pptx/scripts/office/pack.py b/examples/default/skills/pptx/scripts/office/pack.py
similarity index 100%
rename from examples/skills/pptx/scripts/office/pack.py
rename to examples/default/skills/pptx/scripts/office/pack.py
diff --git a/examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd b/examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd b/examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd b/examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd b/examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd b/examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd b/examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd b/examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd b/examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd b/examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd b/examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd b/examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd b/examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd b/examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd b/examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd b/examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd b/examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd b/examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd b/examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd b/examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd b/examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd b/examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd b/examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd b/examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd b/examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd b/examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd b/examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd b/examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd b/examples/default/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd b/examples/default/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd b/examples/default/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd b/examples/default/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/mce/mc.xsd b/examples/default/skills/pptx/scripts/office/schemas/mce/mc.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/mce/mc.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/mce/mc.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/microsoft/wml-2010.xsd b/examples/default/skills/pptx/scripts/office/schemas/microsoft/wml-2010.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/microsoft/wml-2010.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/microsoft/wml-2010.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/microsoft/wml-2012.xsd b/examples/default/skills/pptx/scripts/office/schemas/microsoft/wml-2012.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/microsoft/wml-2012.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/microsoft/wml-2012.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/microsoft/wml-2018.xsd b/examples/default/skills/pptx/scripts/office/schemas/microsoft/wml-2018.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/microsoft/wml-2018.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/microsoft/wml-2018.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/microsoft/wml-cex-2018.xsd b/examples/default/skills/pptx/scripts/office/schemas/microsoft/wml-cex-2018.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/microsoft/wml-cex-2018.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/microsoft/wml-cex-2018.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/microsoft/wml-cid-2016.xsd b/examples/default/skills/pptx/scripts/office/schemas/microsoft/wml-cid-2016.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/microsoft/wml-cid-2016.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/microsoft/wml-cid-2016.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd b/examples/default/skills/pptx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd
diff --git a/examples/skills/pptx/scripts/office/schemas/microsoft/wml-symex-2015.xsd b/examples/default/skills/pptx/scripts/office/schemas/microsoft/wml-symex-2015.xsd
similarity index 100%
rename from examples/skills/pptx/scripts/office/schemas/microsoft/wml-symex-2015.xsd
rename to examples/default/skills/pptx/scripts/office/schemas/microsoft/wml-symex-2015.xsd
diff --git a/examples/skills/pptx/scripts/office/soffice.py b/examples/default/skills/pptx/scripts/office/soffice.py
similarity index 100%
rename from examples/skills/pptx/scripts/office/soffice.py
rename to examples/default/skills/pptx/scripts/office/soffice.py
diff --git a/examples/skills/pptx/scripts/office/unpack.py b/examples/default/skills/pptx/scripts/office/unpack.py
similarity index 100%
rename from examples/skills/pptx/scripts/office/unpack.py
rename to examples/default/skills/pptx/scripts/office/unpack.py
diff --git a/examples/skills/pptx/scripts/office/validate.py b/examples/default/skills/pptx/scripts/office/validate.py
similarity index 100%
rename from examples/skills/pptx/scripts/office/validate.py
rename to examples/default/skills/pptx/scripts/office/validate.py
diff --git a/examples/skills/pptx/scripts/office/validators/__init__.py b/examples/default/skills/pptx/scripts/office/validators/__init__.py
similarity index 100%
rename from examples/skills/pptx/scripts/office/validators/__init__.py
rename to examples/default/skills/pptx/scripts/office/validators/__init__.py
diff --git a/examples/skills/pptx/scripts/office/validators/base.py b/examples/default/skills/pptx/scripts/office/validators/base.py
similarity index 100%
rename from examples/skills/pptx/scripts/office/validators/base.py
rename to examples/default/skills/pptx/scripts/office/validators/base.py
diff --git a/examples/skills/pptx/scripts/office/validators/docx.py b/examples/default/skills/pptx/scripts/office/validators/docx.py
similarity index 100%
rename from examples/skills/pptx/scripts/office/validators/docx.py
rename to examples/default/skills/pptx/scripts/office/validators/docx.py
diff --git a/examples/skills/pptx/scripts/office/validators/pptx.py b/examples/default/skills/pptx/scripts/office/validators/pptx.py
similarity index 100%
rename from examples/skills/pptx/scripts/office/validators/pptx.py
rename to examples/default/skills/pptx/scripts/office/validators/pptx.py
diff --git a/examples/skills/pptx/scripts/office/validators/redlining.py b/examples/default/skills/pptx/scripts/office/validators/redlining.py
similarity index 100%
rename from examples/skills/pptx/scripts/office/validators/redlining.py
rename to examples/default/skills/pptx/scripts/office/validators/redlining.py
diff --git a/examples/skills/pptx/scripts/thumbnail.py b/examples/default/skills/pptx/scripts/thumbnail.py
similarity index 100%
rename from examples/skills/pptx/scripts/thumbnail.py
rename to examples/default/skills/pptx/scripts/thumbnail.py
diff --git a/examples/workflows/circuly/sync-customers.json b/examples/default/workflows/circuly/sync-customers.json
similarity index 100%
rename from examples/workflows/circuly/sync-customers.json
rename to examples/default/workflows/circuly/sync-customers.json
diff --git a/examples/workflows/circuly/sync-products.json b/examples/default/workflows/circuly/sync-products.json
similarity index 100%
rename from examples/workflows/circuly/sync-products.json
rename to examples/default/workflows/circuly/sync-products.json
diff --git a/examples/workflows/circuly/sync-subscriptions.json b/examples/default/workflows/circuly/sync-subscriptions.json
similarity index 100%
rename from examples/workflows/circuly/sync-subscriptions.json
rename to examples/default/workflows/circuly/sync-subscriptions.json
diff --git a/examples/workflows/general/conversation-auto-archive.json b/examples/default/workflows/general/conversation-auto-archive.json
similarity index 100%
rename from examples/workflows/general/conversation-auto-archive.json
rename to examples/default/workflows/general/conversation-auto-archive.json
diff --git a/examples/workflows/general/conversation-sync.json b/examples/default/workflows/general/conversation-sync.json
similarity index 100%
rename from examples/workflows/general/conversation-sync.json
rename to examples/default/workflows/general/conversation-sync.json
diff --git a/examples/workflows/general/customer-status-assessment.json b/examples/default/workflows/general/customer-status-assessment.json
similarity index 100%
rename from examples/workflows/general/customer-status-assessment.json
rename to examples/default/workflows/general/customer-status-assessment.json
diff --git a/examples/workflows/general/document-rag-sync.json b/examples/default/workflows/general/document-rag-sync.json
similarity index 100%
rename from examples/workflows/general/document-rag-sync.json
rename to examples/default/workflows/general/document-rag-sync.json
diff --git a/examples/workflows/general/product-relationship-analysis.json b/examples/default/workflows/general/product-relationship-analysis.json
similarity index 100%
rename from examples/workflows/general/product-relationship-analysis.json
rename to examples/default/workflows/general/product-relationship-analysis.json
diff --git a/examples/workflows/gmail/email-sync.json b/examples/default/workflows/gmail/email-sync.json
similarity index 100%
rename from examples/workflows/gmail/email-sync.json
rename to examples/default/workflows/gmail/email-sync.json
diff --git a/examples/workflows/google_drive/sync.json b/examples/default/workflows/google_drive/sync.json
similarity index 100%
rename from examples/workflows/google_drive/sync.json
rename to examples/default/workflows/google_drive/sync.json
diff --git a/examples/workflows/onedrive/sync.json b/examples/default/workflows/onedrive/sync.json
similarity index 100%
rename from examples/workflows/onedrive/sync.json
rename to examples/default/workflows/onedrive/sync.json
diff --git a/examples/workflows/outlook/email-sync.json b/examples/default/workflows/outlook/email-sync.json
similarity index 100%
rename from examples/workflows/outlook/email-sync.json
rename to examples/default/workflows/outlook/email-sync.json
diff --git a/examples/workflows/shopify/sync-customers.json b/examples/default/workflows/shopify/sync-customers.json
similarity index 100%
rename from examples/workflows/shopify/sync-customers.json
rename to examples/default/workflows/shopify/sync-customers.json
diff --git a/examples/workflows/shopify/sync-products.json b/examples/default/workflows/shopify/sync-products.json
similarity index 100%
rename from examples/workflows/shopify/sync-products.json
rename to examples/default/workflows/shopify/sync-products.json
diff --git a/services/convex/Dockerfile b/services/convex/Dockerfile
index 0a19100dbf..6ad955e277 100644
--- a/services/convex/Dockerfile
+++ b/services/convex/Dockerfile
@@ -105,9 +105,9 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
     && groupadd --system --gid 1001 app || true \
     && useradd --system --uid 1001 --gid app app || true \
     && mkdir -p /home/app && chown app:app /home/app && chmod 755 /home/app \
-    && mkdir -p /app/data/convex /app/data/agents /app/data/workflows /app/data/integrations /app/data/providers /app/data/branding /app/data/retention /app/data/skills \
+    && mkdir -p /app/data/convex /app/data/default \
                 /dashboard \
-                /app/builtin/agents /app/builtin/workflows /app/builtin/integrations /app/builtin/providers /app/builtin/branding /app/builtin/retention /app/builtin/skills \
+                /app/builtin/default \
     && chown -R app:app /app/data /dashboard /app/builtin \
     # Strip system bloat from Convex backend base (~155 MB)
     && ARCH_LIB="/usr/lib/$(dpkg --print-architecture | sed 's/amd64/x86_64-linux-gnu/;s/arm64/aarch64-linux-gnu/')" \
@@ -173,13 +173,7 @@ COPY --from=convex-dashboard --chown=app:app /app /dashboard
 # Builtin seed assets (one-time copy on fresh volume; .history/ preserves user edits).
 # Sources come from repo-root examples/ (same as platform Dockerfile).
 # ----------------------------------------------------------------------------
-COPY --chown=app:app examples/agents/       /app/builtin/agents/
-COPY --chown=app:app examples/workflows/    /app/builtin/workflows/
-COPY --chown=app:app examples/integrations/ /app/builtin/integrations/
-COPY --chown=app:app examples/providers/    /app/builtin/providers/
-COPY --chown=app:app examples/branding/     /app/builtin/branding/
-COPY --chown=app:app examples/retention/    /app/builtin/retention/
-COPY --chown=app:app examples/skills/       /app/builtin/skills/
+COPY --chown=app:app examples/default/ /app/builtin/default/
 
 # ----------------------------------------------------------------------------
 # Entrypoint scripts
diff --git a/services/convex/docker-entrypoint.sh b/services/convex/docker-entrypoint.sh
index 50bf34cfd3..49870c14b3 100755
--- a/services/convex/docker-entrypoint.sh
+++ b/services/convex/docker-entrypoint.sh
@@ -39,9 +39,11 @@ log_section() { echo; echo "═════════════════
 # ----------------------------------------------------------------------------
 if [ "$(id -u)" = '0' ]; then
   data_dir="${TALE_CONFIG_DIR:-/app/data}"
-  mkdir -p "$data_dir/convex" "$data_dir/agents" "$data_dir/workflows" \
-           "$data_dir/integrations" "$data_dir/providers" "$data_dir/branding" \
-           "$data_dir/skills"
+  # Org-first layout: per-org subtrees live under `<data_dir>/<orgSlug>/`.
+  # Only create `convex/` (backend storage) and `default/` (the canonical
+  # org seed target) up front; per-domain dirs are created on-demand by
+  # `run_seed` and `scaffoldNewOrganization`.
+  mkdir -p "$data_dir/convex" "$data_dir/default"
   chown -R app:app "$data_dir"
 
   # ----------------------------------------------------------------------------
@@ -268,22 +270,43 @@ if [ -f /etc/yt-dlp-version ]; then
 fi
 
 # ============================================================================
-# Builtin seed (version-marker gated)
+# Builtin seed (version + layout-marker gated) — org-first layout
 # ----------------------------------------------------------------------------
-# Marker: /app/data/.seeded-${TALE_VERSION}
-# - Fresh volume or new version → run 4 seed loops
+# Layout: `<data_dir>/default/<domain>/...` (the canonical default org's
+# subtree); source: `/app/builtin/default/<domain>/...` (org-agnostic
+# template baked into the convex image).
+#
+# Marker: /app/data/.seeded-${TALE_VERSION}-orgfirst
+# - Fresh volume or new version (or pre-orgfirst marker) → run seed loops
 # - Same version restart → skip (already seeded)
-# - FORCE_SEED=true → re-run regardless
+# - FORCE_SEED=true → re-run regardless (overwrites builtin-named files
+#   in place; user-added files at the same dir and `.history/` siblings
+#   survive; encrypted *.secrets.json files are never written)
+#
+# The `-orgfirst` token in the marker name signals the layout transition:
+# an older binary that doesn't recognize this marker re-seeds (idempotently)
+# into its expected old paths on a hypothetical downgrade.
 # ----------------------------------------------------------------------------
-seed_marker="/app/data/.seeded-${TALE_VERSION:-dev}"
+seed_marker="/app/data/.seeded-${TALE_VERSION:-dev}-orgfirst"
 data_dir="/app/data"
 
+# Atomic file copy: write to a sibling tmp file then rename. A SIGKILL
+# between open(dest, O_TRUNC) and the final write would otherwise leave a
+# truncated file at $dest, which the next-run skip-if-exists check treats
+# as "already seeded" — silent corruption. With atomic_cp the next run
+# either sees the original (rename never happened) or the complete file.
+atomic_cp() {
+  local src="$1" dest="$2"
+  local tmp="${dest}.tale-seed.$$.tmp"
+  cp "$src" "$tmp" && mv -f "$tmp" "$dest"
+}
+
 run_seed() {
-  log_section "Seeding builtin configs (TALE_VERSION=${TALE_VERSION:-dev})"
+  log_section "Seeding builtin configs into default org (TALE_VERSION=${TALE_VERSION:-dev})"
 
-  # --- Agents ---
-  local agents_dir="${data_dir}/agents"
-  local agents_builtin="/app/builtin/agents"
+  # --- Agents (flat) ---
+  local agents_dir="${data_dir}/default/agents"
+  local agents_builtin="/app/builtin/default/agents"
   mkdir -p "$agents_dir"
   if [ -d "$agents_builtin" ] && [ "$(ls -A "$agents_builtin" 2>/dev/null)" ]; then
     for src in "$agents_builtin"/*.json; do
@@ -293,20 +316,20 @@ run_seed() {
       local dest="$agents_dir/$name"
       local history_dir="$agents_dir/.history/$slug"
       if [ "$FORCE_SEED" = "true" ]; then
-        cp "$src" "$dest"; echo "   ✓ Seeded $name (forced)"
+        atomic_cp "$src" "$dest"; echo "   ✓ Seeded $name (forced)"
       elif [ -f "$dest" ]; then
         echo "   ⏭ Skipping $name (already exists)"
       elif [ -d "$history_dir" ] && [ "$(ls -A "$history_dir" 2>/dev/null)" ]; then
         echo "   ⏭ Skipping $name (user has modifications in .history)"
       else
-        cp "$src" "$dest"; echo "   ✓ Seeded agent $name"
+        atomic_cp "$src" "$dest"; echo "   ✓ Seeded agent $name"
       fi
     done
   fi
 
-  # --- Workflows (nested paths allowed) ---
-  local workflows_dir="${data_dir}/workflows"
-  local workflows_builtin="/app/builtin/workflows"
+  # --- Workflows (nested folder/name.json) ---
+  local workflows_dir="${data_dir}/default/workflows"
+  local workflows_builtin="/app/builtin/default/workflows"
   mkdir -p "$workflows_dir"
   if [ -d "$workflows_builtin" ] && [ "$(ls -A "$workflows_builtin" 2>/dev/null)" ]; then
     find "$workflows_builtin" -name '*.json' -type f | while read -r src; do
@@ -318,19 +341,19 @@ run_seed() {
       local history_dir="$workflows_dir/.history/$flat_slug"
 
       if [ "$FORCE_SEED" = "true" ]; then
-        mkdir -p "$dest_dir"; cp "$src" "$dest"; echo "   ✓ Seeded workflow $rel_path (forced)"; continue
+        mkdir -p "$dest_dir"; atomic_cp "$src" "$dest"; echo "   ✓ Seeded workflow $rel_path (forced)"; continue
       fi
       if [ -f "$dest" ]; then echo "   ⏭ Skipping workflow $rel_path (already exists)"; continue; fi
       if [ -d "$history_dir" ] && [ "$(ls -A "$history_dir" 2>/dev/null)" ]; then
         echo "   ⏭ Skipping workflow $rel_path (user has modifications in .history)"; continue
       fi
-      mkdir -p "$dest_dir"; cp "$src" "$dest"; echo "   ✓ Seeded workflow $rel_path"
+      mkdir -p "$dest_dir"; atomic_cp "$src" "$dest"; echo "   ✓ Seeded workflow $rel_path"
     done
   fi
 
-  # --- Integrations (directory-based) ---
-  local integrations_dir="${data_dir}/integrations"
-  local integrations_builtin="/app/builtin/integrations"
+  # --- Integrations (directory bundles) ---
+  local integrations_dir="${data_dir}/default/integrations"
+  local integrations_builtin="/app/builtin/default/integrations"
   mkdir -p "$integrations_dir"
   if [ -d "$integrations_builtin" ] && [ "$(ls -A "$integrations_builtin" 2>/dev/null)" ]; then
     for src_dir in "$integrations_builtin"/*/; do
@@ -350,8 +373,8 @@ run_seed() {
   fi
 
   # --- Skills (directory bundles: SKILL.md + scripts/ + references/ + assets/) ---
-  local skills_dir="${data_dir}/skills"
-  local skills_builtin="/app/builtin/skills"
+  local skills_dir="${data_dir}/default/skills"
+  local skills_builtin="/app/builtin/default/skills"
   mkdir -p "$skills_dir"
   if [ -d "$skills_builtin" ] && [ "$(ls -A "$skills_builtin" 2>/dev/null)" ]; then
     for src_dir in "$skills_builtin"/*/; do
@@ -370,8 +393,8 @@ run_seed() {
   fi
 
   # --- Providers (skip encrypted .secrets.json) ---
-  local providers_dir="${data_dir}/providers"
-  local providers_builtin="/app/builtin/providers"
+  local providers_dir="${data_dir}/default/providers"
+  local providers_builtin="/app/builtin/default/providers"
   mkdir -p "$providers_dir"
   if [ -d "$providers_builtin" ] && [ "$(ls -A "$providers_builtin" 2>/dev/null)" ]; then
     for src in "$providers_builtin"/*.json; do
@@ -382,41 +405,57 @@ run_seed() {
       local dest="$providers_dir/$name"
       local history_dir="$providers_dir/.history/$slug"
       if [ "$FORCE_SEED" = "true" ]; then
-        cp "$src" "$dest"; echo "   ✓ Seeded provider $name (forced)"
+        atomic_cp "$src" "$dest"; echo "   ✓ Seeded provider $name (forced)"
       elif [ -f "$dest" ]; then
         echo "   ⏭ Skipping provider $name (already exists)"
       elif [ -d "$history_dir" ] && [ "$(ls -A "$history_dir" 2>/dev/null)" ]; then
         echo "   ⏭ Skipping provider $name (user has modifications in .history)"
       else
-        cp "$src" "$dest"; echo "   ✓ Seeded provider $name"
+        atomic_cp "$src" "$dest"; echo "   ✓ Seeded provider $name"
       fi
     done
   fi
 
-  # --- Retention (per-org JSON files: $TALE_CONFIG_DIR/retention/{slug}.json) ---
-  # Default org's slug is hardcoded to `default`, so default.json fits
-  # the {orgSlug}.json convention. Retention has no secrets to skip
-  # (compare with providers' .secrets.json branch above).
-  local retention_dir="${data_dir}/retention"
-  local retention_builtin="/app/builtin/retention"
-  mkdir -p "$retention_dir"
-  if [ -d "$retention_builtin" ] && [ "$(ls -A "$retention_builtin" 2>/dev/null)" ]; then
-    for src in "$retention_builtin"/*.json; do
-      [ -f "$src" ] || continue
-      local name="$(basename "$src")"
-      local slug="$(basename "$src" .json)"
-      local dest="$retention_dir/$name"
-      local history_dir="$retention_dir/.history/$slug"
-      if [ "$FORCE_SEED" = "true" ]; then
-        cp "$src" "$dest"; echo "   ✓ Seeded retention $name (forced)"
-      elif [ -f "$dest" ]; then
-        echo "   ⏭ Skipping retention $name (already exists)"
-      elif [ -d "$history_dir" ] && [ "$(ls -A "$history_dir" 2>/dev/null)" ]; then
-        echo "   ⏭ Skipping retention $name (user has modifications in .history)"
-      else
-        cp "$src" "$dest"; echo "   ✓ Seeded retention $name"
-      fi
-    done
+  # --- Branding (single file at default/branding/branding.json) ---
+  # Closes a long-standing gap: previously branding was only seeded by the
+  # Convex scaffold action for new orgs, never on the default-org bootstrap
+  # path. With org-first the default org needs the same treatment as any
+  # other org for consistency (uniform model).
+  local branding_dir="${data_dir}/default/branding"
+  local branding_src="/app/builtin/default/branding/branding.json"
+  mkdir -p "$branding_dir"
+  if [ -f "$branding_src" ]; then
+    local dest="$branding_dir/branding.json"
+    local history_dir="$branding_dir/.history/branding"
+    if [ "$FORCE_SEED" = "true" ]; then
+      atomic_cp "$branding_src" "$dest"; echo "   ✓ Seeded branding (forced)"
+    elif [ -f "$dest" ]; then
+      echo "   ⏭ Skipping branding (already exists)"
+    elif [ -d "$history_dir" ] && [ "$(ls -A "$history_dir" 2>/dev/null)" ]; then
+      echo "   ⏭ Skipping branding (user has modifications in .history)"
+    else
+      atomic_cp "$branding_src" "$dest"; echo "   ✓ Seeded branding"
+    fi
+  fi
+
+  # --- Retention (single file at default/retention.json) ---
+  # Retention is one JSON object per org under the uniform org-first layout
+  # (`$TALE_CONFIG_DIR/<orgSlug>/retention.json`). The catalog ships only
+  # the default org's retention config; non-default orgs are seeded by the
+  # Convex scaffold action.
+  local retention_src="/app/builtin/default/retention.json"
+  if [ -f "$retention_src" ]; then
+    local dest="${data_dir}/default/retention.json"
+    local history_dir="${data_dir}/default/.history/retention"
+    if [ "$FORCE_SEED" = "true" ]; then
+      atomic_cp "$retention_src" "$dest"; echo "   ✓ Seeded retention (forced)"
+    elif [ -f "$dest" ]; then
+      echo "   ⏭ Skipping retention (already exists)"
+    elif [ -d "$history_dir" ] && [ "$(ls -A "$history_dir" 2>/dev/null)" ]; then
+      echo "   ⏭ Skipping retention (user has modifications in .history)"
+    else
+      atomic_cp "$retention_src" "$dest"; echo "   ✓ Seeded retention"
+    fi
   fi
 
   touch "$seed_marker"
diff --git a/services/platform/app/features/settings/integrations/components/integration-upload/constants/integration-templates.ts b/services/platform/app/features/settings/integrations/components/integration-upload/constants/integration-templates.ts
index 7260c87284..07854be8c2 100644
--- a/services/platform/app/features/settings/integrations/components/integration-upload/constants/integration-templates.ts
+++ b/services/platform/app/features/settings/integrations/components/integration-upload/constants/integration-templates.ts
@@ -1,7 +1,7 @@
 // Points to `main` so templates stay current with the shipped app.
 // If templates require immutable pinning, replace 'main' with a release tag.
 const TEMPLATES_REF = 'main';
-const GITHUB_RAW_BASE = `https://raw.githubusercontent.com/tale-project/tale/${TEMPLATES_REF}/examples/integrations`;
+const GITHUB_RAW_BASE = `https://raw.githubusercontent.com/tale-project/tale/${TEMPLATES_REF}/examples/default/integrations`;
 
 export interface IntegrationTemplate {
   name: string;
diff --git a/services/platform/convex/_generated/api.d.ts b/services/platform/convex/_generated/api.d.ts
index 791f5f711a..6b557cb4f2 100644
--- a/services/platform/convex/_generated/api.d.ts
+++ b/services/platform/convex/_generated/api.d.ts
@@ -636,6 +636,7 @@ import type * as organizations_helpers from "../organizations/helpers.js";
 import type * as organizations_internal_queries from "../organizations/internal_queries.js";
 import type * as organizations_queries from "../organizations/queries.js";
 import type * as organizations_record_org_switch from "../organizations/record_org_switch.js";
+import type * as organizations_reseed_all_orgs from "../organizations/reseed_all_orgs.js";
 import type * as organizations_resolve_org_slug from "../organizations/resolve_org_slug.js";
 import type * as organizations_scaffold from "../organizations/scaffold.js";
 import type * as organizations_update_organization from "../organizations/update_organization.js";
@@ -1732,6 +1733,7 @@ declare const fullApi: ApiFromModules<{
   "organizations/internal_queries": typeof organizations_internal_queries;
   "organizations/queries": typeof organizations_queries;
   "organizations/record_org_switch": typeof organizations_record_org_switch;
+  "organizations/reseed_all_orgs": typeof organizations_reseed_all_orgs;
   "organizations/resolve_org_slug": typeof organizations_resolve_org_slug;
   "organizations/scaffold": typeof organizations_scaffold;
   "organizations/update_organization": typeof organizations_update_organization;
diff --git a/services/platform/convex/agents/file_utils.ts b/services/platform/convex/agents/file_utils.ts
index bdcefc3121..dbe0cfe7f7 100644
--- a/services/platform/convex/agents/file_utils.ts
+++ b/services/platform/convex/agents/file_utils.ts
@@ -125,14 +125,12 @@ export function parseAgentJson(content: string): AgentJsonConfig {
   return result.data;
 }
 
-function getBaseDir(): string {
-  const dir = process.env.AGENTS_DIR;
-  if (dir) return dir;
+function getConfigRoot(): string {
   const configDir = process.env.TALE_CONFIG_DIR;
-  if (configDir) return path.join(configDir, 'agents');
+  if (configDir) return configDir;
   throw new Error(
-    'Neither TALE_CONFIG_DIR nor AGENTS_DIR environment variable is set. ' +
-      'Set TALE_CONFIG_DIR in .env to the root config directory ' +
+    'TALE_CONFIG_DIR environment variable is not set. ' +
+      'Set it to the root config directory ' +
       '(e.g., TALE_CONFIG_DIR=/path/to/tale/examples).',
   );
 }
@@ -141,11 +139,7 @@ export function resolveAgentsDir(orgSlug: string): string {
   if (!validateOrgSlug(orgSlug)) {
     throw new Error(`Invalid org slug: ${orgSlug}`);
   }
-  const baseDir = getBaseDir();
-  if (orgSlug === 'default') {
-    return baseDir;
-  }
-  return path.join(baseDir, orgSlug);
+  return path.join(getConfigRoot(), orgSlug, 'agents');
 }
 
 export function resolveAgentFilePath(
diff --git a/services/platform/convex/branding/file_actions.ts b/services/platform/convex/branding/file_actions.ts
index d58ecf120d..a10f5369be 100644
--- a/services/platform/convex/branding/file_actions.ts
+++ b/services/platform/convex/branding/file_actions.ts
@@ -3,9 +3,13 @@
 /**
  * Branding file I/O actions.
  *
- * Branding is global (not org-scoped). A single branding.json file
- * at {TALE_CONFIG_DIR}/branding/branding.json applies to the entire platform.
- * Images (logo, favicons) are stored on disk at {TALE_CONFIG_DIR}/branding/images/.
+ * Branding is global (not org-scoped). A single branding.json file at
+ * {TALE_CONFIG_DIR}/default/branding/branding.json applies to the entire
+ * platform. Images (logo, favicons) are stored on disk at
+ * {TALE_CONFIG_DIR}/default/branding/images/. Although on-disk files live
+ * under the `default` org subtree like every other domain, the read-side
+ * here hardcodes `'default'` — non-default orgs do not have separate
+ * branding today.
  *
  * Uses atomic writes (temp → fsync → rename) for data safety.
  * History snapshots use epoch-ms filenames with 10-entry retention.
diff --git a/services/platform/convex/branding/file_utils.ts b/services/platform/convex/branding/file_utils.ts
index 2cf7801298..f9ce6d5266 100644
--- a/services/platform/convex/branding/file_utils.ts
+++ b/services/platform/convex/branding/file_utils.ts
@@ -36,25 +36,27 @@ export type BrandingReadResult =
       message: string;
     };
 
-function getBaseDir(): string {
+function getConfigRoot(): string {
   const configDir = process.env.TALE_CONFIG_DIR;
-  if (configDir) return path.join(configDir, 'branding');
+  if (configDir) return configDir;
   throw new Error(
     'TALE_CONFIG_DIR environment variable is not set. ' +
-      'Set TALE_CONFIG_DIR in .env to the root config directory ' +
+      'Set it to the root config directory ' +
       '(e.g., TALE_CONFIG_DIR=/path/to/tale/examples).',
   );
 }
 
+/**
+ * Resolve the branding directory for an organization. Org-first:
+ * `${TALE_CONFIG_DIR}/<orgSlug>/branding/`. Read-side currently hardcodes
+ * `'default'` (see branding/file_actions.ts call sites), so non-default
+ * org branding dirs are scaffolded but unread.
+ */
 export function resolveBrandingDir(orgSlug: string): string {
   if (!validateOrgSlug(orgSlug)) {
     throw new Error(`Invalid org slug: ${orgSlug}`);
   }
-  const baseDir = getBaseDir();
-  if (orgSlug === 'default') {
-    return baseDir;
-  }
-  return path.join(baseDir, orgSlug);
+  return path.join(getConfigRoot(), orgSlug, 'branding');
 }
 
 export function resolveBrandingFilePath(orgSlug: string): string {
diff --git a/services/platform/convex/branding/queries.test.ts b/services/platform/convex/branding/queries.test.ts
index 761d353934..83013c0220 100644
--- a/services/platform/convex/branding/queries.test.ts
+++ b/services/platform/convex/branding/queries.test.ts
@@ -74,13 +74,15 @@ describe('serializeBrandingJson', () => {
   });
 });
 
-describe('resolveBrandingDir', () => {
-  it('returns base dir for default org', () => {
-    expect(resolveBrandingDir('default')).toBe('/tmp/test-data/branding');
+describe('resolveBrandingDir (org-first)', () => {
+  it('default org lives at <root>/default/branding/', () => {
+    expect(resolveBrandingDir('default')).toBe(
+      '/tmp/test-data/default/branding',
+    );
   });
 
-  it('returns subdirectory for named org', () => {
-    expect(resolveBrandingDir('acme')).toBe('/tmp/test-data/branding/acme');
+  it('other orgs live at <root>/<orgSlug>/branding/ (read-side is default-only today)', () => {
+    expect(resolveBrandingDir('acme')).toBe('/tmp/test-data/acme/branding');
   });
 
   it('throws for invalid org slug', () => {
@@ -89,9 +91,9 @@ describe('resolveBrandingDir', () => {
 });
 
 describe('resolveBrandingFilePath', () => {
-  it('returns branding.json path', () => {
+  it('returns branding.json path under <org>/branding/', () => {
     expect(resolveBrandingFilePath('default')).toBe(
-      '/tmp/test-data/branding/branding.json',
+      '/tmp/test-data/default/branding/branding.json',
     );
   });
 });
@@ -140,15 +142,17 @@ describe('mimeToExtension', () => {
 });
 
 describe('resolveImagesDir', () => {
-  it('returns images subdirectory', () => {
-    expect(resolveImagesDir('default')).toBe('/tmp/test-data/branding/images');
+  it('returns images subdirectory under <org>/branding/', () => {
+    expect(resolveImagesDir('default')).toBe(
+      '/tmp/test-data/default/branding/images',
+    );
   });
 });
 
 describe('resolveImagePath', () => {
   it('resolves valid image filename', () => {
     expect(resolveImagePath('default', 'logo.png')).toBe(
-      '/tmp/test-data/branding/images/logo.png',
+      '/tmp/test-data/default/branding/images/logo.png',
     );
   });
 
diff --git a/services/platform/convex/governance/retention_actions.ts b/services/platform/convex/governance/retention_actions.ts
index 4a5320ceb4..c86ef5d146 100644
--- a/services/platform/convex/governance/retention_actions.ts
+++ b/services/platform/convex/governance/retention_actions.ts
@@ -119,7 +119,7 @@ export const getRetentionBoundsAction = action({
       throw new ConvexError({
         code: 'RETENTION_CONFIG_MISSING',
         message:
-          'Retention config not yet installed. Copy examples/retention/default.json to $TALE_CONFIG_DIR/retention/default.json then reload.',
+          'Retention config not yet installed. Copy examples/default/retention.json to $TALE_CONFIG_DIR/default/retention.json then reload.',
       });
     }
 
@@ -189,7 +189,7 @@ export const upsertRetentionPolicyAction = action({
       throw new ConvexError({
         code: 'RETENTION_CONFIG_MISSING',
         message:
-          'Retention config not yet installed. Copy examples/retention/default.json to $TALE_CONFIG_DIR/retention/default.json.',
+          'Retention config not yet installed. Copy examples/default/retention.json to $TALE_CONFIG_DIR/default/retention.json.',
       });
     }
     const boundsByCategory = buildBoundsByCategory(orgConfig);
diff --git a/services/platform/convex/governance/retention_bounds_proposal.ts b/services/platform/convex/governance/retention_bounds_proposal.ts
index 3299c13a41..b4975f2eee 100644
--- a/services/platform/convex/governance/retention_bounds_proposal.ts
+++ b/services/platform/convex/governance/retention_bounds_proposal.ts
@@ -78,7 +78,7 @@ async function computeEffectiveAppliedBounds(
     throw new ConvexError({
       code: 'RETENTION_CONFIG_MISSING',
       message:
-        'Retention config not yet installed. Copy examples/retention/default.json to $TALE_CONFIG_DIR/retention/default.json then reload.',
+        'Retention config not yet installed. Copy examples/default/retention.json to $TALE_CONFIG_DIR/default/retention.json then reload.',
     });
   }
   const all = applyEnvTighteningAll(orgConfig);
diff --git a/services/platform/convex/governance/retention_floors.ts b/services/platform/convex/governance/retention_floors.ts
index d2295d0e69..f34802c0c8 100644
--- a/services/platform/convex/governance/retention_floors.ts
+++ b/services/platform/convex/governance/retention_floors.ts
@@ -316,7 +316,7 @@ export class RetentionConfigMissingError extends Error {
   readonly hint: string;
   constructor(category: RetentionCategory) {
     const hint =
-      'Copy examples/retention/default.json to $TALE_CONFIG_DIR/retention/default.json';
+      'Copy examples/default/retention.json to $TALE_CONFIG_DIR/default/retention.json';
     super(`Retention config missing for category=${category}. ${hint}`);
     this.category = category;
     this.hint = hint;
diff --git a/services/platform/convex/integrations/file_utils.ts b/services/platform/convex/integrations/file_utils.ts
index 3d1ec52ff3..e9fde32f0e 100644
--- a/services/platform/convex/integrations/file_utils.ts
+++ b/services/platform/convex/integrations/file_utils.ts
@@ -19,7 +19,7 @@ export { sha256 };
 
 /**
  * Integration slug: lowercase alphanumeric + hyphens/underscores, flat (no nesting).
- * Must match the directory name under INTEGRATIONS_DIR.
+ * Must match the directory name under `${TALE_CONFIG_DIR}/<orgSlug>/integrations/`.
  */
 const INTEGRATION_SLUG_REGEX = /^[a-z0-9][a-z0-9_-]*$/;
 
@@ -44,32 +44,25 @@ export function validateIntegrationSlug(slug: string): boolean {
   return INTEGRATION_SLUG_REGEX.test(slug);
 }
 
-function getBaseDir(): string {
-  const dir = process.env.INTEGRATIONS_DIR;
-  if (dir) return dir;
+function getConfigRoot(): string {
   const configDir = process.env.TALE_CONFIG_DIR;
-  if (configDir) return path.join(configDir, 'integrations');
+  if (configDir) return configDir;
   throw new Error(
-    'Neither TALE_CONFIG_DIR nor INTEGRATIONS_DIR environment variable is set. ' +
-      'Set TALE_CONFIG_DIR in .env to the root config directory ' +
+    'TALE_CONFIG_DIR environment variable is not set. ' +
+      'Set it to the root config directory ' +
       '(e.g., TALE_CONFIG_DIR=/path/to/tale/examples).',
   );
 }
 
 /**
- * Resolve the integrations directory for an organization.
- * Default org uses the base dir directly.
- * Other orgs use `{baseDir}/@{orgSlug}/`.
+ * Resolve the integrations directory for an organization. Org-first:
+ * `${TALE_CONFIG_DIR}/<orgSlug>/integrations/`.
  */
 export function resolveIntegrationsDir(orgSlug: string): string {
   if (!validateOrgSlug(orgSlug)) {
     throw new Error(`Invalid org slug: ${orgSlug}`);
   }
-  const baseDir = getBaseDir();
-  if (orgSlug === 'default') {
-    return baseDir;
-  }
-  return path.join(baseDir, `@${orgSlug}`);
+  return path.join(getConfigRoot(), orgSlug, 'integrations');
 }
 
 /**
diff --git a/services/platform/convex/lib/config_store/actions.ts b/services/platform/convex/lib/config_store/actions.ts
index 91dc9a3582..24adaafa1f 100644
--- a/services/platform/convex/lib/config_store/actions.ts
+++ b/services/platform/convex/lib/config_store/actions.ts
@@ -19,9 +19,12 @@ import { retentionDefaultsConfigSchema } from '../../../lib/shared/schemas/reten
 import { internalAction } from '../../_generated/server';
 import { createFileConfigStore } from './store';
 
+// Retention is one JSON object per org under the uniform org-first layout:
+// `$TALE_CONFIG_DIR/<orgSlug>/retention.json`.
 const retentionStore = createFileConfigStore<RetentionDefaultsConfig>(
   'retention',
   retentionDefaultsConfigSchema,
+  { orgFirst: true },
 );
 
 export const readRetentionConfig = internalAction({
diff --git a/services/platform/convex/lib/config_store/store.ts b/services/platform/convex/lib/config_store/store.ts
index 3231a3bc4e..29b0af8294 100644
--- a/services/platform/convex/lib/config_store/store.ts
+++ b/services/platform/convex/lib/config_store/store.ts
@@ -1,15 +1,18 @@
 'use node';
 
 /**
- * Generic typed read/write helper for area-specific JSON config files
- * under `$TALE_CONFIG_DIR/{area}/{orgSlug}.json`.
+ * Generic typed read/write helper for area-specific JSON config files.
  *
- * The area-agnostic substrate behind retention's per-org files. Wrapping
- * `readJsonFile` + `atomicWrite` so callers don't reinvent path
- * resolution, symlink/size guards, or atomic-rename semantics.
+ * Two layout shapes are supported, selected via `orgFirst`:
+ *
+ * - `orgFirst: false` (default): `$TALE_CONFIG_DIR/{area}/{orgSlug}.json`.
+ *   The legacy per-area-dir shape; org slugs live in the filename.
+ * - `orgFirst: true`: `$TALE_CONFIG_DIR/{orgSlug}/{area}.json`.
+ *   Used by retention under the uniform org-first layout — each org has
+ *   one file per area, alongside its `agents/`, `providers/`, etc.
  *
- * Initially used only by retention; provider/integration migrations are
- * the obvious next consumers. Keep the API minimal.
+ * Wraps `readJsonFile` + `atomicWrite` so callers don't reinvent path
+ * resolution, symlink/size guards, or atomic-rename semantics.
  *
  * Known limitations (round-2 / M7):
  *   - **Last-writer-wins.** No file-level locking — two concurrent
@@ -27,7 +30,7 @@
  *     wired into a UI flow.
  */
 
-import { readdir } from 'node:fs/promises';
+import { readdir, stat } from 'node:fs/promises';
 import path from 'node:path';
 
 import type { z } from 'zod/v4';
@@ -46,29 +49,45 @@ export interface ConfigStore<T> {
   read(orgSlug: string): Promise<T | null>;
   /** Atomic write of the parsed/serialized config to the per-org path. */
   write(orgSlug: string, value: T): Promise<void>;
-  /** Enumerate `*.json` files in the area dir, returning each org slug. */
+  /** Enumerate orgs that have a file for this area. */
   list(): Promise<Array<{ orgSlug: string }>>;
 }
 
-function getAreaDir(area: string): string {
+export interface CreateFileConfigStoreOptions {
+  /**
+   * When true, paths follow the org-first layout:
+   * `$TALE_CONFIG_DIR/<orgSlug>/<area>.json`. List enumerates per-org
+   * directories that contain `<area>.json`. When false (default), paths
+   * follow `$TALE_CONFIG_DIR/<area>/<orgSlug>.json`.
+   */
+  orgFirst?: boolean;
+}
+
+function getConfigRoot(area: string): string {
   const configDir = process.env.TALE_CONFIG_DIR;
   if (!configDir) {
     throw new Error(
       `TALE_CONFIG_DIR environment variable is not set. ` +
         `Set TALE_CONFIG_DIR in .env to the root config directory ` +
-        `(e.g., TALE_CONFIG_DIR=/path/to/tale/examples) so ${area}/ ` +
+        `(e.g., TALE_CONFIG_DIR=/path/to/tale/examples) so ${area} ` +
         `can be resolved.`,
     );
   }
-  return path.join(configDir, area);
+  return configDir;
 }
 
-function resolveFilePath(area: string, orgSlug: string): string {
+function resolveFilePath(
+  area: string,
+  orgSlug: string,
+  orgFirst: boolean,
+): string {
   if (!validateOrgSlug(orgSlug)) {
     throw new Error(`Invalid org slug: ${orgSlug}`);
   }
-  const dir = getAreaDir(area);
-  const resolved = path.resolve(dir, `${orgSlug}.json`);
+  const root = getConfigRoot(area);
+  const dir = orgFirst ? path.join(root, orgSlug) : path.join(root, area);
+  const fileName = orgFirst ? `${area}.json` : `${orgSlug}.json`;
+  const resolved = path.resolve(dir, fileName);
   const expectedPrefix = path.resolve(dir);
   if (
     !resolved.startsWith(expectedPrefix + path.sep) &&
@@ -87,7 +106,10 @@ function resolveFilePath(area: string, orgSlug: string): string {
 export function createFileConfigStore<T>(
   area: string,
   schema: z.ZodType<T>,
+  options: CreateFileConfigStoreOptions = {},
 ): ConfigStore<T> {
+  const orgFirst = options.orgFirst ?? false;
+
   const parse = (content: string): T => {
     // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- raw JSON before Zod validation
     const parsed = JSON.parse(content) as unknown;
@@ -100,16 +122,17 @@ export function createFileConfigStore<T>(
 
   return {
     async read(orgSlug) {
-      const filePath = resolveFilePath(area, orgSlug);
+      const filePath = resolveFilePath(area, orgSlug, orgFirst);
       const result = await readJsonFile(filePath, MAX_FILE_SIZE_BYTES, parse);
       if (result.ok) return result.data;
       if (result.error === 'not_found') return null;
-      throw new Error(
-        `Failed to read ${area}/${orgSlug}.json: ${result.message}`,
-      );
+      const display = orgFirst
+        ? `${orgSlug}/${area}.json`
+        : `${area}/${orgSlug}.json`;
+      throw new Error(`Failed to read ${display}: ${result.message}`);
     },
     async write(orgSlug, value) {
-      const filePath = resolveFilePath(area, orgSlug);
+      const filePath = resolveFilePath(area, orgSlug, orgFirst);
       // Re-parse before write to surface schema errors to the caller
       // rather than silently corrupting the file. Cheap relative to fs.
       const parsed = schema.safeParse(value);
@@ -122,12 +145,37 @@ export function createFileConfigStore<T>(
       await atomicWrite(filePath, content);
     },
     async list() {
-      const dir = getAreaDir(area);
+      const root = getConfigRoot(area);
+      if (orgFirst) {
+        // Each org's file lives at `<root>/<orgSlug>/<area>.json`.
+        // Enumerate org subdirs (validated by slug regex) and probe each
+        // for the area file. Missing root → return empty rather than
+        // throwing — operator hasn't seeded anything yet.
+        let entries: string[];
+        try {
+          entries = await readdir(root);
+        } catch (err) {
+          if (err instanceof Error && 'code' in err && err.code === 'ENOENT') {
+            return [];
+          }
+          throw err;
+        }
+        const results: Array<{ orgSlug: string }> = [];
+        for (const name of entries) {
+          if (!validateOrgSlug(name)) continue;
+          const filePath = path.join(root, name, `${area}.json`);
+          const info = await stat(filePath).catch(() => null);
+          if (info?.isFile()) results.push({ orgSlug: name });
+        }
+        return results;
+      }
+
+      // Legacy per-area-dir layout: list `*.json` files under `<root>/<area>/`.
+      const dir = path.join(root, area);
       let entries: string[];
       try {
         entries = await readdir(dir);
       } catch (err) {
-        // Missing dir is fine — operator hasn't seeded anything yet.
         if (err instanceof Error && 'code' in err && err.code === 'ENOENT') {
           return [];
         }
diff --git a/services/platform/convex/node_only/integration_sandbox/gmail_draft_filtering.test.ts b/services/platform/convex/node_only/integration_sandbox/gmail_draft_filtering.test.ts
index edfe0197e3..d1ee8f8c7a 100644
--- a/services/platform/convex/node_only/integration_sandbox/gmail_draft_filtering.test.ts
+++ b/services/platform/convex/node_only/integration_sandbox/gmail_draft_filtering.test.ts
@@ -9,7 +9,7 @@ import { executeIntegrationImpl } from './execute_integration_impl';
 const connectorTs = fs.readFileSync(
   path.resolve(
     __dirname,
-    '../../../../../examples/integrations/gmail/connector.ts',
+    '../../../../../examples/default/integrations/gmail/connector.ts',
   ),
   'utf-8',
 );
diff --git a/services/platform/convex/node_only/integration_sandbox/outlook_draft_filtering.test.ts b/services/platform/convex/node_only/integration_sandbox/outlook_draft_filtering.test.ts
index 2eb9db1211..d9be1a0bc8 100644
--- a/services/platform/convex/node_only/integration_sandbox/outlook_draft_filtering.test.ts
+++ b/services/platform/convex/node_only/integration_sandbox/outlook_draft_filtering.test.ts
@@ -9,7 +9,7 @@ import { executeIntegrationImpl } from './execute_integration_impl';
 const connectorTs = fs.readFileSync(
   path.resolve(
     __dirname,
-    '../../../../../examples/integrations/outlook/connector.ts',
+    '../../../../../examples/default/integrations/outlook/connector.ts',
   ),
   'utf-8',
 );
diff --git a/services/platform/convex/organizations/reseed_all_orgs.ts b/services/platform/convex/organizations/reseed_all_orgs.ts
new file mode 100644
index 0000000000..4bb196cf58
--- /dev/null
+++ b/services/platform/convex/organizations/reseed_all_orgs.ts
@@ -0,0 +1,106 @@
+/**
+ * Operator-triggered re-seed: enumerate every org (incl. `default`) and
+ * re-invoke `scaffoldNewOrganization` with `override:true`. Driven by
+ * `tale deploy --override-all` via `bunx convex run organizations/reseed_all_orgs:reseedAllOrgsFromBuiltin`.
+ *
+ * Semantics:
+ *   - Always reseeds `default` even if absent from the org list (canonical
+ *     template org).
+ *   - Per-org try/catch: one failure logs + continues; the full result
+ *     map is returned so the CLI surfaces succeeded/failed counts and
+ *     exits non-zero on any failure.
+ *   - Deterministic order: collected slugs are sorted before processing
+ *     so logs and partial-failure reruns are reproducible.
+ *   - Cursor-paginated org enumeration (200/page) instead of the
+ *     500-page-cap pattern in older backfills — avoids silently capping
+ *     deployments with many orgs.
+ *
+ * Note: this is an ops re-runnable tool, not a one-shot migration. Lives
+ * next to `scaffold.ts` (the thing it reinvokes), not in `migrations/`.
+ */
+
+import { getString, isRecord } from '../../lib/utils/type-guards';
+import { components, internal } from '../_generated/api';
+import { internalAction } from '../_generated/server';
+
+// Inlined to avoid importing from convex/lib/file_io.ts (which has 'use node'
+// and would force this orchestration action into the Node runtime). Keep in
+// sync with `validateOrgSlug` at services/platform/convex/lib/file_io.ts.
+const ORG_SLUG_REGEX = /^[a-z0-9][a-z0-9_-]*$/;
+function isValidOrgSlug(slug: string): boolean {
+  return slug === 'default' || ORG_SLUG_REGEX.test(slug);
+}
+
+type OrgReseedResult =
+  | { slug: string; status: 'ok' }
+  | { slug: string; status: 'error'; error: string };
+
+export const reseedAllOrgsFromBuiltin = internalAction({
+  args: {},
+  handler: async (ctx) => {
+    const slugSet = new Set<string>(['default']);
+
+    let cursor: string | null = null;
+    let isDone = false;
+    while (!isDone) {
+      const res: unknown = await ctx.runQuery(
+        components.betterAuth.adapter.findMany,
+        {
+          model: 'organization',
+          paginationOpts: { cursor, numItems: 200 },
+          where: [],
+        },
+      );
+      const page = isRecord(res) && Array.isArray(res.page) ? res.page : [];
+      for (const raw of page) {
+        if (!isRecord(raw)) continue;
+        const slug = getString(raw, 'slug');
+        if (!slug) continue;
+        if (!isValidOrgSlug(slug)) {
+          console.warn(
+            `[reseedAllOrgs] skipping invalid slug "${slug}" returned by betterAuth`,
+          );
+          continue;
+        }
+        slugSet.add(slug);
+      }
+      cursor =
+        isRecord(res) && typeof res.continueCursor === 'string'
+          ? res.continueCursor
+          : null;
+      isDone =
+        isRecord(res) && typeof res.isDone === 'boolean' ? res.isDone : true;
+    }
+
+    const slugs = Array.from(slugSet).sort();
+    const results: OrgReseedResult[] = [];
+
+    for (const slug of slugs) {
+      try {
+        await ctx.runAction(
+          internal.organizations.scaffold.scaffoldNewOrganization,
+          { orgSlug: slug, override: true },
+        );
+        results.push({ slug, status: 'ok' });
+        console.log(`[reseedAllOrgs] reseeded "${slug}"`);
+      } catch (err) {
+        const message = err instanceof Error ? err.message : String(err);
+        console.error(`[reseedAllOrgs] "${slug}" failed:`, message);
+        results.push({ slug, status: 'error', error: message });
+      }
+    }
+
+    const succeeded = results.filter((r) => r.status === 'ok').length;
+    const failed = results.length - succeeded;
+    console.log(
+      `[reseedAllOrgs] done: total=${results.length} succeeded=${succeeded} failed=${failed}`,
+    );
+
+    return {
+      total: results.length,
+      succeeded,
+      failed,
+      results,
+    };
+  },
+});
diff --git a/services/platform/convex/organizations/scaffold.test.ts b/services/platform/convex/organizations/scaffold.test.ts
index 4d997a2d44..5392eac0a3 100644
--- a/services/platform/convex/organizations/scaffold.test.ts
+++ b/services/platform/convex/organizations/scaffold.test.ts
@@ -20,17 +20,24 @@ vi.mock('../_generated/server', () => ({
   internalAction: vi.fn((config) => config),
 }));
 
-const { scaffoldNewOrganization } = await import('./scaffold');
+const { scaffoldNewOrganization, cleanupOrgFilesystem } =
+  await import('./scaffold');
 
 type ActionConfig = {
-  handler: (ctx: never, args: { orgSlug: string }) => Promise<unknown>;
+  handler: (
+    ctx: never,
+    args: { orgSlug: string; override?: boolean },
+  ) => Promise<unknown>;
 };
 const scaffoldHandler = (scaffoldNewOrganization as unknown as ActionConfig)
   .handler;
+const cleanupHandler = (cleanupOrgFilesystem as unknown as ActionConfig)
+  .handler;
 
-// All env vars the scaffold code path or the per-domain resolvers consult.
-// Save + clear them in beforeEach so each test starts from a known-empty
-// state, then restore in afterEach so we don't poison other test files.
+// Under org-first only TALE_CONFIG_DIR + TALE_CONFIG_BUILTIN_DIR remain;
+// per-domain env overrides (AGENTS_DIR / WORKFLOWS_DIR / PROVIDERS_DIR /
+// INTEGRATIONS_DIR / SKILLS_DIR) were dropped. Still save/restore the
+// legacy keys defensively so a stale shell-env value can't leak across.
 const ENV_KEYS = [
   'TALE_CONFIG_DIR',
   'TALE_CONFIG_BUILTIN_DIR',
@@ -72,227 +79,144 @@ async function writeText(filePath: string, content: string): Promise<void> {
   await writeFile(filePath, content, 'utf-8');
 }
 
-describe('scaffoldNewOrganization', () => {
-  it('seeds workflows from the catalog and ignores the default org workspace', async () => {
-    // Catalog: a shipped template under workflows/shopify/sync.json.
+// Catalog source path for a given domain — mirrors the org-first builtin
+// layout (`<catalogRoot>/default/<domain>/...`) the scaffold reads from.
+function catSrc(...parts: string[]): string {
+  return path.join(catalogRoot, 'default', ...parts);
+}
+
+// Per-org target path — `<configRoot>/<orgSlug>/<domain>/...`.
+function orgDst(orgSlug: string, ...parts: string[]): string {
+  return path.join(configRoot, orgSlug, ...parts);
+}
+
+describe('scaffoldNewOrganization (org-first)', () => {
+  it('seeds workflows from the catalog into the org-first target', async () => {
     process.env.TALE_CONFIG_BUILTIN_DIR = catalogRoot;
     await writeText(
-      path.join(catalogRoot, 'workflows', 'shopify', 'sync.json'),
+      catSrc('workflows', 'shopify', 'sync.json'),
       '{"name":"sync"}',
     );
 
-    // Default-org workspace: a junk workflow that must NOT propagate.
-    await writeText(
-      path.join(configRoot, 'workflows', 'junk.json'),
-      '{"name":"junk"}',
-    );
-
     await scaffoldHandler({} as never, { orgSlug: 'acme' });
 
-    const acmeDir = path.join(configRoot, 'workflows', '@acme');
-    expect(existsSync(path.join(acmeDir, 'shopify', 'sync.json'))).toBe(true);
-    expect(existsSync(path.join(acmeDir, 'junk.json'))).toBe(false);
+    expect(
+      existsSync(orgDst('acme', 'workflows', 'shopify', 'sync.json')),
+    ).toBe(true);
   });
 
-  it('closes the agents cross-tenant leak: raw-slug subdirs in the source are not copied', async () => {
-    // Agents catalog contains only the shipped template.
+  it('seeds flat domains (agents) per-file from the catalog', async () => {
     process.env.TALE_CONFIG_BUILTIN_DIR = catalogRoot;
-    await writeText(
-      path.join(catalogRoot, 'agents', 'shipped.json'),
-      '{"displayName":"shipped"}',
-    );
-
-    // Default-org workspace contains another tenant's raw-slug subdir.
-    // Pre-fix scaffolding (which sourced from this dir) would recursively
-    // copy `competitor/` into the new org because the @-skip in copyTree
-    // doesn't catch raw slugs. Sourcing from the catalog instead must
-    // not see this at all.
-    await writeText(
-      path.join(configRoot, 'agents', 'competitor', 'secret.json'),
-      '{"displayName":"leak"}',
-    );
+    await writeText(catSrc('agents', 'shipped.json'), '{"displayName":"x"}');
 
     await scaffoldHandler({} as never, { orgSlug: 'acme' });
 
-    const acmeDir = path.join(configRoot, 'agents', 'acme');
-    expect(existsSync(path.join(acmeDir, 'shipped.json'))).toBe(true);
-    expect(existsSync(path.join(acmeDir, 'competitor'))).toBe(false);
-    expect(existsSync(path.join(acmeDir, 'competitor', 'secret.json'))).toBe(
-      false,
-    );
+    expect(existsSync(orgDst('acme', 'agents', 'shipped.json'))).toBe(true);
   });
 
-  it('flat domains (agents/providers) never recurse into catalog subdirs', async () => {
+  it('flat domains never recurse into catalog subdirs (defense if the catalog ever ships one)', async () => {
     process.env.TALE_CONFIG_BUILTIN_DIR = catalogRoot;
-    await writeText(
-      path.join(catalogRoot, 'agents', 'shipped.json'),
-      '{"displayName":"shipped"}',
-    );
+    await writeText(catSrc('agents', 'shipped.json'), '{"displayName":"x"}');
     // A subdir inside the agents catalog is unexpected (agents is file-only).
-    // The flat-domain guard must skip it rather than recurse.
     await writeText(
-      path.join(catalogRoot, 'agents', 'stray', 'nested.json'),
+      catSrc('agents', 'stray', 'nested.json'),
       '{"displayName":"nested"}',
     );
 
     await scaffoldHandler({} as never, { orgSlug: 'acme' });
 
-    const acmeDir = path.join(configRoot, 'agents', 'acme');
-    expect(existsSync(path.join(acmeDir, 'shipped.json'))).toBe(true);
-    expect(existsSync(path.join(acmeDir, 'stray'))).toBe(false);
+    expect(existsSync(orgDst('acme', 'agents', 'shipped.json'))).toBe(true);
+    expect(existsSync(orgDst('acme', 'agents', 'stray'))).toBe(false);
   });
 
-  it('flat-domain guard closes the agents leak on the dev fallback path (env unset)', async () => {
-    // No catalog env → source is the default-org workspace. A previously
-    // created org left a raw-slug subdir there; scaffolding a new org must
-    // not recurse into it. Here the flat-domain guard — not the source
-    // choice — is what prevents the cross-tenant copy.
-    await writeText(
-      path.join(configRoot, 'agents', 'shipped.json'),
-      '{"displayName":"shipped"}',
-    );
-    await writeText(
-      path.join(configRoot, 'agents', 'competitor', 'secret.json'),
-      '{"displayName":"leak"}',
-    );
-
-    await scaffoldHandler({} as never, { orgSlug: 'acme' });
-
-    const acmeDir = path.join(configRoot, 'agents', 'acme');
-    expect(existsSync(path.join(acmeDir, 'shipped.json'))).toBe(true);
-    expect(existsSync(path.join(acmeDir, 'competitor'))).toBe(false);
-  });
-
-  it('skips symlinks rather than following them', async () => {
+  it('skips symlinks in the catalog rather than following them', async () => {
     process.env.TALE_CONFIG_BUILTIN_DIR = catalogRoot;
-    const targetPayload = await mkdtemp(path.join(tmpdir(), 'scaffold-evil-'));
-    const targetFile = path.join(targetPayload, 'payload.json');
-    await writeFile(targetFile, '{"name":"escaped"}', 'utf-8');
+    const evilPayloadDir = await mkdtemp(path.join(tmpdir(), 'scaffold-evil-'));
+    const evilFile = path.join(evilPayloadDir, 'payload.json');
+    await writeFile(evilFile, '{"name":"escaped"}', 'utf-8');
 
-    await mkdir(path.join(catalogRoot, 'workflows'), { recursive: true });
-    await symlink(targetFile, path.join(catalogRoot, 'workflows', 'evil.json'));
-    // Also drop a real file beside it so we know the copy loop kept running.
-    await writeText(
-      path.join(catalogRoot, 'workflows', 'legit.json'),
-      '{"name":"legit"}',
-    );
+    await mkdir(catSrc('workflows'), { recursive: true });
+    await symlink(evilFile, path.join(catSrc('workflows'), 'evil.json'));
+    await writeText(catSrc('workflows', 'legit.json'), '{"name":"legit"}');
 
     try {
       await scaffoldHandler({} as never, { orgSlug: 'acme' });
 
-      const acmeDir = path.join(configRoot, 'workflows', '@acme');
-      expect(existsSync(path.join(acmeDir, 'evil.json'))).toBe(false);
-      expect(existsSync(path.join(acmeDir, 'legit.json'))).toBe(true);
+      expect(existsSync(orgDst('acme', 'workflows', 'evil.json'))).toBe(false);
+      expect(existsSync(orgDst('acme', 'workflows', 'legit.json'))).toBe(true);
     } finally {
-      await rm(targetPayload, { recursive: true, force: true });
+      await rm(evilPayloadDir, { recursive: true, force: true });
     }
   });
 
-  it('falls back to domain.resolve(default) when the catalog env is unset (dev)', async () => {
-    // No TALE_CONFIG_BUILTIN_DIR set. Default-org workspace becomes the
-    // catalog — historical behavior, preserved for local dev.
-    await writeText(
-      path.join(configRoot, 'workflows', 'shopify', 'sync.json'),
-      '{"name":"sync"}',
-    );
-
-    await scaffoldHandler({} as never, { orgSlug: 'acme' });
-
-    const acmeDir = path.join(configRoot, 'workflows', '@acme');
-    expect(existsSync(path.join(acmeDir, 'shopify', 'sync.json'))).toBe(true);
-  });
-
-  it('still applies the @-prefix, .history, and *.secrets.json skips when copying', async () => {
+  it('always skips *.secrets.json and .history/ at the catalog source', async () => {
     process.env.TALE_CONFIG_BUILTIN_DIR = catalogRoot;
+    await writeText(catSrc('providers', 'openai.json'), '{"name":"openai"}');
     await writeText(
-      path.join(catalogRoot, 'providers', 'openai.json'),
-      '{"name":"openai"}',
-    );
-    await writeText(
-      path.join(catalogRoot, 'providers', 'openai.secrets.json'),
+      catSrc('providers', 'openai.secrets.json'),
       '{"key":"redacted"}',
     );
-    await writeText(
-      path.join(catalogRoot, 'providers', '.history', 'snapshot.json'),
-      '{}',
-    );
-    await writeText(
-      path.join(catalogRoot, 'providers', '@stale-tenant', 'leak.json'),
-      '{}',
-    );
+    await writeText(catSrc('providers', '.history', 'snapshot.json'), '{}');
 
     await scaffoldHandler({} as never, { orgSlug: 'acme' });
 
-    const acmeDir = path.join(configRoot, 'providers', 'acme');
-    expect(existsSync(path.join(acmeDir, 'openai.json'))).toBe(true);
-    expect(existsSync(path.join(acmeDir, 'openai.secrets.json'))).toBe(false);
-    expect(existsSync(path.join(acmeDir, '.history'))).toBe(false);
-    expect(existsSync(path.join(acmeDir, '@stale-tenant'))).toBe(false);
+    expect(existsSync(orgDst('acme', 'providers', 'openai.json'))).toBe(true);
+    expect(existsSync(orgDst('acme', 'providers', 'openai.secrets.json'))).toBe(
+      false,
+    );
+    expect(existsSync(orgDst('acme', 'providers', '.history'))).toBe(false);
   });
 
-  it('is per-domain idempotent: a domain dir that already has files is skipped', async () => {
+  it('is per-domain idempotent: a domain dir that already has files is skipped (override:false)', async () => {
     process.env.TALE_CONFIG_BUILTIN_DIR = catalogRoot;
+    await writeText(catSrc('workflows', 'shipped.json'), '{"name":"shipped"}');
+    // Pre-existing org content — scaffold must not overwrite without override.
     await writeText(
-      path.join(catalogRoot, 'workflows', 'shipped.json'),
-      '{"name":"shipped"}',
+      orgDst('acme', 'workflows', 'existing.json'),
+      '{"name":"existing"}',
     );
-    // Pre-existing org content — scaffold must not overwrite.
-    const acmeDir = path.join(configRoot, 'workflows', '@acme');
-    await writeText(path.join(acmeDir, 'existing.json'), '{"name":"existing"}');
 
     await scaffoldHandler({} as never, { orgSlug: 'acme' });
 
-    expect(await readFile(path.join(acmeDir, 'existing.json'), 'utf-8')).toBe(
-      '{"name":"existing"}',
-    );
-    expect(existsSync(path.join(acmeDir, 'shipped.json'))).toBe(false);
+    expect(
+      await readFile(orgDst('acme', 'workflows', 'existing.json'), 'utf-8'),
+    ).toBe('{"name":"existing"}');
+    expect(existsSync(orgDst('acme', 'workflows', 'shipped.json'))).toBe(false);
   });
 
   it('treats a target containing only .history/ as occupied (no re-seed on top of user edit trail)', async () => {
     process.env.TALE_CONFIG_BUILTIN_DIR = catalogRoot;
+    await writeText(catSrc('workflows', 'shipped.json'), '{"name":"shipped"}');
     await writeText(
-      path.join(catalogRoot, 'workflows', 'shipped.json'),
-      '{"name":"shipped"}',
-    );
-    // Realistic state: user created the org, edited a workflow (writing
-    // `.history/<slug>/<rev>.json`), then deleted the visible workflow.
-    // Re-scaffolding (e.g., via the backfill migration) must NOT silently
-    // re-seed the catalog on top of the surviving edit trail.
-    const acmeDir = path.join(configRoot, 'workflows', '@acme');
-    await writeText(
-      path.join(acmeDir, '.history', 'old.json'),
+      orgDst('acme', 'workflows', '.history', 'old.json'),
       '{"snapshot":1}',
     );
 
     await scaffoldHandler({} as never, { orgSlug: 'acme' });
 
-    expect(existsSync(path.join(acmeDir, 'shipped.json'))).toBe(false);
-    expect(existsSync(path.join(acmeDir, '.history', 'old.json'))).toBe(true);
+    expect(existsSync(orgDst('acme', 'workflows', 'shipped.json'))).toBe(false);
+    expect(
+      existsSync(orgDst('acme', 'workflows', '.history', 'old.json')),
+    ).toBe(true);
   });
 
   it('ignores atomicWrite tmp orphans so a crashed scaffold can retry', async () => {
     process.env.TALE_CONFIG_BUILTIN_DIR = catalogRoot;
-    await writeText(
-      path.join(catalogRoot, 'workflows', 'shipped.json'),
-      '{"name":"shipped"}',
-    );
+    await writeText(catSrc('workflows', 'shipped.json'), '{"name":"shipped"}');
     // Simulate the residue a prior crashed scaffold would leave behind:
     // atomicWrite uses `.<basename>.<ts>.<uuid>.tmp` and cleans up on
     // success, but a crash mid-write leaves the tmp orphan in place.
-    const acmeDir = path.join(configRoot, 'workflows', '@acme');
     await writeText(
-      path.join(acmeDir, '.shipped.json.1700000000000.deadbeef.tmp'),
+      orgDst('acme', 'workflows', '.shipped.json.1700000000000.deadbeef.tmp'),
       'partial',
     );
 
     await scaffoldHandler({} as never, { orgSlug: 'acme' });
 
-    expect(existsSync(path.join(acmeDir, 'shipped.json'))).toBe(true);
+    expect(existsSync(orgDst('acme', 'workflows', 'shipped.json'))).toBe(true);
   });
 
   it('logs error when TALE_CONFIG_BUILTIN_DIR points at a missing path (deploy misconfig)', async () => {
-    // Builtin root configured but the directory doesn't exist on disk —
-    // simulates platform/convex image version skew or a missing volume mount.
     process.env.TALE_CONFIG_BUILTIN_DIR = path.join(catalogRoot, 'missing');
     const errSpy = vi.spyOn(console, 'error').mockImplementation(() => {});
 
@@ -307,25 +231,212 @@ describe('scaffoldNewOrganization', () => {
             m.includes('does not exist'),
         ),
       ).toBe(true);
-      // Target should remain empty — no silent fallback to default-org dir.
-      expect(existsSync(path.join(configRoot, 'workflows', '@acme'))).toBe(
-        false,
-      );
+      expect(existsSync(orgDst('acme', 'workflows'))).toBe(false);
     } finally {
       errSpy.mockRestore();
     }
   });
 
-  it('returns null without scaffolding the default org', async () => {
+  it('default org IS scaffold-able under org-first (no longer early-returned)', async () => {
     process.env.TALE_CONFIG_BUILTIN_DIR = catalogRoot;
-    await writeText(path.join(catalogRoot, 'workflows', 'shipped.json'), '{}');
+    await writeText(catSrc('agents', 'shipped.json'), '{"displayName":"x"}');
 
-    const result = await scaffoldHandler({} as never, { orgSlug: 'default' });
+    await scaffoldHandler({} as never, { orgSlug: 'default' });
 
-    expect(result).toBeNull();
-    // Default org's workspace must not have been touched by scaffold.
-    expect(existsSync(path.join(configRoot, 'workflows', 'shipped.json'))).toBe(
-      false,
+    expect(existsSync(orgDst('default', 'agents', 'shipped.json'))).toBe(true);
+  });
+
+  it('override:true overwrites flat-domain files while preserving secrets and .history', async () => {
+    process.env.TALE_CONFIG_BUILTIN_DIR = catalogRoot;
+    await writeText(catSrc('agents', 'shipped.json'), '{"displayName":"new"}');
+
+    // Pre-existing org state: user-edited shipped, user-added file, secret, history.
+    await writeText(
+      orgDst('acme', 'agents', 'shipped.json'),
+      '{"displayName":"user-edited"}',
     );
+    await writeText(
+      orgDst('acme', 'agents', 'user-added.json'),
+      '{"displayName":"keep me"}',
+    );
+    await writeText(
+      orgDst('acme', 'agents', 'openai.secrets.json'),
+      '{"key":"keep-me-too"}',
+    );
+    await writeText(
+      orgDst('acme', 'agents', '.history', 'shipped', '1.json'),
+      '{"rev":1}',
+    );
+
+    await scaffoldHandler({} as never, { orgSlug: 'acme', override: true });
+
+    // Catalog file overwritten.
+    expect(
+      await readFile(orgDst('acme', 'agents', 'shipped.json'), 'utf-8'),
+    ).toBe('{"displayName":"new"}');
+    // User-added file survived.
+    expect(existsSync(orgDst('acme', 'agents', 'user-added.json'))).toBe(true);
+    // Secret + history survived.
+    expect(
+      await readFile(orgDst('acme', 'agents', 'openai.secrets.json'), 'utf-8'),
+    ).toBe('{"key":"keep-me-too"}');
+    expect(
+      existsSync(orgDst('acme', 'agents', '.history', 'shipped', '1.json')),
+    ).toBe(true);
+  });
+
+  it('override:true for dir-bundle domains (skills) rm-replaces the bundle but preserves dir-level secrets/.history', async () => {
+    process.env.TALE_CONFIG_BUILTIN_DIR = catalogRoot;
+    await writeText(catSrc('skills', 'code-reviewer', 'SKILL.md'), 'new');
+
+    // Pre-existing bundle: user-edited SKILL.md + a user-added file inside
+    // the bundle (gets wiped); domain-level .history + secrets survive.
+    await writeText(
+      orgDst('acme', 'skills', 'code-reviewer', 'SKILL.md'),
+      'user-edited',
+    );
+    await writeText(
+      orgDst('acme', 'skills', 'code-reviewer', 'user-extra.txt'),
+      'gone after override',
+    );
+    await writeText(
+      orgDst('acme', 'skills', '.history', 'code-reviewer', '1.md'),
+      'old rev',
+    );
+
+    await scaffoldHandler({} as never, { orgSlug: 'acme', override: true });
+
+    expect(
+      await readFile(
+        orgDst('acme', 'skills', 'code-reviewer', 'SKILL.md'),
+        'utf-8',
+      ),
+    ).toBe('new');
+    expect(
+      existsSync(orgDst('acme', 'skills', 'code-reviewer', 'user-extra.txt')),
+    ).toBe(false);
+    expect(
+      existsSync(orgDst('acme', 'skills', '.history', 'code-reviewer', '1.md')),
+    ).toBe(true);
+  });
+
+  it('override:true for workflows preserves user-only folders', async () => {
+    process.env.TALE_CONFIG_BUILTIN_DIR = catalogRoot;
+    await writeText(
+      catSrc('workflows', 'shopify', 'sync.json'),
+      '{"name":"new"}',
+    );
+
+    await writeText(
+      orgDst('acme', 'workflows', 'shopify', 'sync.json'),
+      '{"name":"old"}',
+    );
+    await writeText(
+      orgDst('acme', 'workflows', 'my-folder', 'custom.json'),
+      '{"name":"custom"}',
+    );
+
+    await scaffoldHandler({} as never, { orgSlug: 'acme', override: true });
+
+    expect(
+      await readFile(
+        orgDst('acme', 'workflows', 'shopify', 'sync.json'),
+        'utf-8',
+      ),
+    ).toBe('{"name":"new"}');
+    expect(
+      existsSync(orgDst('acme', 'workflows', 'my-folder', 'custom.json')),
+    ).toBe(true);
+  });
+
+  it('seeds retention as a single file at <org>/retention.json', async () => {
+    process.env.TALE_CONFIG_BUILTIN_DIR = catalogRoot;
+    await writeText(
+      catSrc('retention.json'),
+      '{"version":"v1","categories":{}}',
+    );
+
+    await scaffoldHandler({} as never, { orgSlug: 'acme' });
+
+    expect(existsSync(orgDst('acme', 'retention.json'))).toBe(true);
+    expect(await readFile(orgDst('acme', 'retention.json'), 'utf-8')).toBe(
+      '{"version":"v1","categories":{}}',
+    );
+  });
+
+  it('copy-onto-self guard fires for default-org reseed in dev fallback (catalog env unset)', async () => {
+    // No TALE_CONFIG_BUILTIN_DIR → fallback source = domain.resolve('default')
+    // = `<root>/default/<domain>`, which is the same dir as the reseed
+    // target. realpath-based guard must catch this even though path strings
+    // are syntactically identical.
+    await writeText(
+      orgDst('default', 'workflows', 'shopify', 'sync.json'),
+      '{"name":"existing"}',
+    );
+
+    // Should be a no-op (skip with warn), not a destructive copy-onto-self.
+    await scaffoldHandler({} as never, {
+      orgSlug: 'default',
+      override: true,
+    });
+
+    expect(
+      await readFile(
+        orgDst('default', 'workflows', 'shopify', 'sync.json'),
+        'utf-8',
+      ),
+    ).toBe('{"name":"existing"}');
+  });
+});
+
+describe('cleanupOrgFilesystem (symlink + traversal defense)', () => {
+  it('refuses the literal `default` slug', async () => {
+    await writeText(orgDst('default', 'agents', 'x.json'), '{}');
+    await cleanupHandler({} as never, { orgSlug: 'default' });
+    expect(existsSync(orgDst('default', 'agents', 'x.json'))).toBe(true);
+  });
+
+  it('removes the entire <org>/ subtree for a valid non-default slug', async () => {
+    await writeText(orgDst('acme', 'agents', 'x.json'), '{}');
+    await writeText(orgDst('acme', 'providers', 'p.json'), '{}');
+    await writeText(orgDst('other', 'agents', 'keep.json'), '{}');
+
+    await cleanupHandler({} as never, { orgSlug: 'acme' });
+
+    expect(existsSync(orgDst('acme'))).toBe(false);
+    expect(existsSync(orgDst('other', 'agents', 'keep.json'))).toBe(true);
+  });
+
+  it('ENOENT on the org dir is idempotent (no throw)', async () => {
+    // Org dir doesn't exist; cleanup should silently succeed.
+    await expect(
+      cleanupHandler({} as never, { orgSlug: 'never-existed' }),
+    ).resolves.toBeNull();
+  });
+
+  it('refuses invalid org slugs (would have already failed at validateOrgSlug too)', async () => {
+    // Slugs that don't match ORG_SLUG_REGEX. cleanup must warn-and-skip.
+    await cleanupHandler({} as never, { orgSlug: '../escape' });
+    await cleanupHandler({} as never, { orgSlug: 'UPPER' });
+    // No assertion needed on filesystem — we're verifying no throw.
+  });
+
+  it('refuses a symlinked org dir (would otherwise rm the symlink target)', async () => {
+    // Create a directory outside configRoot, then place a symlink at
+    // configRoot/acme pointing to it. cleanup must lstat → detect symlink → refuse.
+    const outside = await mkdtemp(path.join(tmpdir(), 'cleanup-outside-'));
+    const outsideFile = path.join(outside, 'precious.json');
+    await writeFile(outsideFile, '{"keep":"me"}', 'utf-8');
+
+    await symlink(outside, orgDst('acme'));
+
+    try {
+      await cleanupHandler({} as never, { orgSlug: 'acme' });
+      // The symlink target's file MUST survive.
+      expect(existsSync(outsideFile)).toBe(true);
+    } finally {
+      await rm(orgDst('acme'), { force: true });
+      await rm(outside, { recursive: true, force: true });
+    }
   });
 });
diff --git a/services/platform/convex/organizations/scaffold.ts b/services/platform/convex/organizations/scaffold.ts
index f933eea63c..e784d3b6e5 100644
--- a/services/platform/convex/organizations/scaffold.ts
+++ b/services/platform/convex/organizations/scaffold.ts
@@ -1,36 +1,55 @@
 'use node';
 
 /**
- * Scaffold per-org filesystem config on organization creation.
+ * Scaffold + cleanup per-org filesystem config under the uniform org-first
+ * layout (`$TALE_CONFIG_DIR/<orgSlug>/<domain>/...` for every org incl.
+ * `default`). Source of seed data is the immutable builtin catalog baked
+ * into the convex image at `$TALE_CONFIG_BUILTIN_DIR/default/<domain>/`
+ * (set in services/platform/Dockerfile, propagated via the entrypoint's
+ * `convex env set` loop). Falls back to the default org's writable dir
+ * when the env is unset, so local `bun dev` (no catalog) still works.
  *
- * Seeds new orgs from the immutable builtin catalog baked into the convex
- * image at `$TALE_CONFIG_BUILTIN_DIR/<domain>/` (mirrors the writable
- * `$TALE_CONFIG_DIR/<domain>/` pattern). The env is pushed by the platform
- * Dockerfile via the entrypoint's `convex env set` loop. Falls back to the
- * default org's writable dir when the env is unset, so local `bun dev`
- * (where no catalog is built) still works. The rationale for sourcing from
- * the read-only catalog instead of the default workspace lives at the
- * `@`-prefix-skip comment in copyTree below — that's the load-bearing site.
+ * `scaffoldNewOrganization`:
+ *   - org-create path (`override:false`, default): idempotent per-domain
+ *     skip if the target dir already has files.
+ *   - reseed path (`override:true`, called by `reseedAllOrgsFromBuiltin`):
+ *     overwrites builtin-named files in place while always preserving
+ *     `*.secrets.json` and `.history/` trails. Per-domain semantics —
+ *     flat: per-file atomicWrite; dir-bundle (skills/integrations):
+ *     `rm -rf <per-bundle>` then copy bundle; workflows + branding:
+ *     per-file overwrite (preserves user-only folders / images);
+ *     retention: single-file copy.
  *
- * Skips per-org secrets (`*.secrets.json`) and local edit-history dirs
- * (`.history/`). Skips branding entirely — read-side hardcodes 'default'.
- *
- * Idempotent: if the target dir already contains user-visible files, skip
- * that domain with a warning rather than overwriting.
+ * `cleanupOrgFilesystem` removes the entire `<orgSlug>/` subtree (org is
+ * one tree under org-first), guarded by validateOrgSlug + verifyPathWithinBase
+ * + an lstat symlink defense (an attacker-placed symlink at the org dir
+ * would otherwise be followed by `rm -rf` to arbitrary filesystem
+ * locations). Uses a two-phase rename-then-delete so concurrent writers
+ * fail with ENOENT rather than racing the recursive delete.
  */
 
-import { lstat, readdir, readFile, rm, stat } from 'node:fs/promises';
+import {
+  lstat,
+  readdir,
+  readFile,
+  realpath,
+  rename,
+  rm,
+  stat,
+} from 'node:fs/promises';
 import path from 'node:path';
 
 import { v } from 'convex/values';
 
 import { internalAction } from '../_generated/server';
 import { resolveAgentsDir } from '../agents/file_utils';
+import { resolveBrandingDir } from '../branding/file_utils';
 import { resolveIntegrationsDir } from '../integrations/file_utils';
 import {
   atomicWrite,
   atomicWriteBuffer,
   errnoCode,
+  validateOrgSlug,
   verifyPathWithinBase,
 } from '../lib/file_io';
 import { resolveProvidersDir } from '../providers/file_utils';
@@ -42,20 +61,30 @@ type DirResolver = (orgSlug: string) => string;
 type Domain = {
   name: string;
   resolve: DirResolver;
-  // Flat domains store one file per item with no subdirectories in the
-  // catalog (agents/providers: `<slug>.json`). copyTree must not recurse into
-  // subdirs for these — see the `allowSubdirs` guard in copyTree.
-  flat?: boolean;
+  // 'flat' = one file per item, no subdirs in the catalog (agents/providers/branding).
+  //   override:true overwrites per-file via atomicWrite; user-added files survive,
+  //   secrets + .history at the dir level survive.
+  // 'bundle' = per-item directory bundle (skills/integrations). override:true
+  //   rm -rf's the per-bundle subdir then copies — wholesale bundle replace.
+  //   Dir-level `.history`/secrets at the domain root (siblings of bundles) survive.
+  // 'tree' = arbitrary nested files (workflows). override:true per-file overwrite;
+  //   user-only folders survive.
+  kind: 'flat' | 'bundle' | 'tree';
 };
 
-// Each domain's per-org dir convention differs — use the domain's own resolver.
-// The catalog subdir name matches `name` (e.g., `$TALE_CONFIG_BUILTIN_DIR/agents/`).
+// `default` is the canonical template org in the catalog; the catalog tree
+// at `$TALE_CONFIG_BUILTIN_DIR/default/<domain>/` is the source for every
+// org including default itself.
 const DOMAINS: Domain[] = [
-  { name: 'agents', resolve: resolveAgentsDir, flat: true },
-  { name: 'providers', resolve: resolveProvidersDir, flat: true },
-  { name: 'integrations', resolve: resolveIntegrationsDir },
-  { name: 'workflows', resolve: resolveWorkflowsDir },
-  { name: 'skills', resolve: resolveSkillsDir },
+  { name: 'agents', resolve: resolveAgentsDir, kind: 'flat' },
+  { name: 'providers', resolve: resolveProvidersDir, kind: 'flat' },
+  { name: 'integrations', resolve: resolveIntegrationsDir, kind: 'bundle' },
+  { name: 'workflows', resolve: resolveWorkflowsDir, kind: 'tree' },
+  { name: 'skills', resolve: resolveSkillsDir, kind: 'bundle' },
+  // Branding is logically a tree (branding.json + images/ subdir). Per-file
+  // overwrite is correct: catalog overwrites branding.json; uploaded
+  // `images/*.png` survive (they're neither secrets nor .history).
+  { name: 'branding', resolve: resolveBrandingDir, kind: 'tree' },
 ];
 
 const BUILTIN_ENV = 'TALE_CONFIG_BUILTIN_DIR';
@@ -70,7 +99,7 @@ function shouldSkipFile(name: string): boolean {
 // atomicWrite leaves `.<basename>.<ts>.<uuid>.tmp` orphans on crash. Those
 // shouldn't lock out a retry, but every other entry (including dotfiles
 // like `.history/` that agents/workflows write on every edit) means a user
-// has been here and we must not overwrite.
+// has been here and we must not overwrite in the non-override path.
 function isAtomicWriteTmp(name: string): boolean {
   return name.startsWith('.') && name.endsWith('.tmp');
 }
@@ -80,10 +109,6 @@ async function dirHasFiles(dir: string): Promise<boolean> {
     const entries = await readdir(dir);
     return entries.some((n) => !isAtomicWriteTmp(n));
   } catch (err) {
-    // ENOENT (dir doesn't exist yet) is the expected case — domain scaffold
-    // simply hasn't run. Anything else (EACCES, EIO) means we can't read
-    // it; treat as "empty" so scaffolding proceeds, but log so a
-    // permissions glitch isn't silently masked.
     if (errnoCode(err) !== 'ENOENT') {
       console.warn('[scaffold.dirHasFiles] readdir failed:', dir, err);
     }
@@ -91,6 +116,56 @@ async function dirHasFiles(dir: string): Promise<boolean> {
   }
 }
 
+/**
+ * realpath-aware equality / containment check. `path.resolve` only
+ * canonicalizes `..`/`.` — it does NOT follow symlinks. A symlinked
+ * `TALE_CONFIG_BUILTIN_DIR` (or bind-mount overlap between src/dst)
+ * could otherwise produce a copy-onto-self where `rm -rf <bundle>` then
+ * copy from the same dir wipes the live data. Use `realpath` on both
+ * sides; treat ENOENT on either side as "not yet a symlink concern"
+ * and fall back to `path.resolve`.
+ */
+async function pathsOverlap(a: string, b: string): Promise<boolean> {
+  const resolveReal = async (p: string): Promise<string> => {
+    try {
+      return await realpath(p);
+    } catch {
+      return path.resolve(p);
+    }
+  };
+  const realA = await resolveReal(a);
+  const realB = await resolveReal(b);
+  if (realA === realB) return true;
+  if (realA.startsWith(realB + path.sep)) return true;
+  if (realB.startsWith(realA + path.sep)) return true;
+  return false;
+}
+
+async function writeFileFromCatalog(src: string, dst: string): Promise<void> {
+  const buf = await readFile(src);
+  const name = path.basename(src);
+  if (
+    name.endsWith('.json') ||
+    name.endsWith('.ts') ||
+    name.endsWith('.svg') ||
+    name.endsWith('.md')
+  ) {
+    await atomicWrite(dst, buf.toString('utf-8'));
+  } else {
+    await atomicWriteBuffer(dst, buf);
+  }
+}
+
+/**
+ * Recursively copy `sourceDir` → `targetDir`. Skips `.history/`, dotfiles
+ * (`.<name>`), `*.secrets.json`, and symlinks at every level. Used by
+ * `tree` and (top-level) `bundle` domain seeds.
+ *
+ * `allowSubdirs=false` (used by flat domains) means: don't recurse into
+ * any subdir found in the source. The catalog for flat domains has no
+ * subdirs, so a subdir indicates a fallback workspace with leaked
+ * cross-tenant content — skip with a warning rather than recurse.
+ */
 async function copyTree(
   sourceDir: string,
   targetDir: string,
@@ -106,15 +181,6 @@ async function copyTree(
 
   for (const name of entries) {
     if (name.startsWith('.')) continue;
-    // Per-org marker prefix used by skills / integrations / workflows for
-    // tenant subdirs (`@<orgSlug>/...`). Defence-in-depth: the builtin
-    // catalog has no `@` subdirs, but if the source ever falls back to a
-    // mutable workspace this guard prevents recursing into other orgs'
-    // trees. Agents / providers use raw `<slug>` subdirs (no `@` marker) and
-    // are flat-copied (`allowSubdirs=false` below), so a stray raw-slug subdir
-    // in a fallback workspace is never recursed into either — the cross-tenant
-    // leak is structurally impossible on any source path.
-    if (name.startsWith('@')) continue;
     if (SKIP_DIR_NAMES.has(name)) continue;
     if (shouldSkipFile(name)) continue;
 
@@ -122,9 +188,8 @@ async function copyTree(
     const dst = path.join(targetDir, name);
 
     // lstat (not stat) so a symlink in the source is detected and skipped
-    // rather than followed. The catalog is built from `examples/` which
-    // tracks no symlinks today, but this keeps the scaffold from
-    // dereferencing through to arbitrary paths if one is ever introduced.
+    // rather than followed. The catalog tracks no symlinks today; this
+    // keeps the scaffold from dereferencing if one is ever introduced.
     const info = await lstat(src).catch((err) => {
       if (errnoCode(err) !== 'ENOENT') {
         console.warn('[scaffold.copyTree] lstat failed:', src, err);
@@ -139,9 +204,6 @@ async function copyTree(
 
     if (info.isDirectory()) {
       if (!allowSubdirs) {
-        // Flat domain (agents / providers): the catalog has no subdirs here,
-        // so any subdir is unexpected (e.g. a raw-slug org dir leaked into a
-        // mutable fallback workspace). Skip rather than recurse.
         console.warn(
           '[scaffold.copyTree] skipping unexpected subdir in flat domain:',
           src,
@@ -153,30 +215,188 @@ async function copyTree(
     }
 
     if (!info.isFile()) continue;
+    await writeFileFromCatalog(src, dst);
+  }
+}
 
-    const buf = await readFile(src);
-    if (
-      name.endsWith('.json') ||
-      name.endsWith('.ts') ||
-      name.endsWith('.svg')
-    ) {
-      await atomicWrite(dst, buf.toString('utf-8'));
+/**
+ * Seed a single domain for an org. Source is `<catalogRoot>/default/<domain>`
+ * (canonical template) when `TALE_CONFIG_BUILTIN_DIR` is set, falling back
+ * to `resolve('default')` for local dev. Returns true on success, false on
+ * skip/failure.
+ */
+async function seedDomain(
+  domain: Domain,
+  catalogRoot: string | undefined,
+  orgSlug: string,
+  override: boolean,
+): Promise<void> {
+  const sourceDir = catalogRoot
+    ? path.join(catalogRoot, 'default', domain.name)
+    : domain.resolve('default');
+  const targetDir = domain.resolve(orgSlug);
+
+  if (catalogRoot) {
+    // Operator-set catalog path must exist; missing = deploy misconfig
+    // (platform/convex image version skew). Surface in logs instead of
+    // silent zero-seed.
+    const sourceExists = await stat(sourceDir)
+      .then(() => true)
+      .catch((err) => {
+        if (errnoCode(err) === 'ENOENT') {
+          console.error(
+            `[scaffold] ${domain.name}: ${BUILTIN_ENV}=${catalogRoot} is set but ${sourceDir} does not exist; org "${orgSlug}" will receive zero seed data for this domain`,
+          );
+        } else {
+          console.error(
+            `[scaffold] ${domain.name}: stat ${sourceDir} failed:`,
+            err instanceof Error ? err.message : err,
+          );
+        }
+        return false;
+      });
+    if (!sourceExists) return;
+  }
+
+  // copy-onto-self guard: realpath-aware. Fires for default-org reseed
+  // in the fallback case (catalog env unset, source = target) and for
+  // any symlinked overlap between catalog and data trees.
+  if (await pathsOverlap(sourceDir, targetDir)) {
+    console.warn(
+      `[scaffold] ${domain.name}: source and target overlap (${sourceDir} ↔ ${targetDir}); skipping`,
+    );
+    return;
+  }
+
+  if (!override) {
+    const alreadyScaffolded = await dirHasFiles(targetDir);
+    if (alreadyScaffolded) {
+      console.warn(
+        `[scaffold] ${domain.name}: target ${targetDir} already has files, skipping (use override:true to reseed)`,
+      );
+      return;
+    }
+  }
+
+  try {
+    if (domain.kind === 'flat') {
+      // Per-file atomicWrite. Overwrites only catalog-named files; user-added
+      // files at the same dir survive (e.g., an org's custom agent). Dir-level
+      // `.history`/secrets survive (copyTree skips them at the source side,
+      // and per-file write doesn't touch siblings).
+      await copyTree(sourceDir, targetDir, /* allowSubdirs */ false);
+    } else if (domain.kind === 'bundle') {
+      // For each catalog bundle subdir, rm -rf the corresponding target
+      // bundle (if override) then copy. Domain-root siblings (.history/,
+      // *.secrets.json at the domain dir level) survive — we only touch
+      // bundle subdirs that exist in the catalog.
+      let bundles: string[];
+      try {
+        bundles = await readdir(sourceDir);
+      } catch (err) {
+        if (errnoCode(err) === 'ENOENT') return;
+        throw err;
+      }
+      for (const bundleName of bundles) {
+        if (bundleName.startsWith('.')) continue;
+        if (SKIP_DIR_NAMES.has(bundleName)) continue;
+        const bundleSrc = path.join(sourceDir, bundleName);
+        const bundleDst = path.join(targetDir, bundleName);
+        const info = await lstat(bundleSrc).catch(() => null);
+        if (!info || info.isSymbolicLink() || !info.isDirectory()) continue;
+        if (override) {
+          await rm(bundleDst, { recursive: true, force: true });
+        }
+        await copyTree(bundleSrc, bundleDst, /* allowSubdirs */ true);
+      }
     } else {
-      await atomicWriteBuffer(dst, buf);
+      // 'tree' — workflows + branding. Per-file overwrite, no rm. User-only
+      // subdirs / files survive intact (e.g. an org's custom workflow folder,
+      // an uploaded branding/images/logo.png).
+      await copyTree(sourceDir, targetDir, /* allowSubdirs */ true);
     }
+  } catch (err) {
+    console.error(
+      `[scaffold] ${domain.name}: copy failed for org "${orgSlug}":`,
+      err instanceof Error ? err.message : err,
+    );
+    // Continue with other domains; partial scaffolding is better than none.
+  }
+}
+
+/**
+ * Retention is one JSON object per org (`<orgSlug>/retention.json`), not a
+ * subtree. Special-cased outside the DOMAINS loop.
+ */
+async function seedRetention(
+  catalogRoot: string | undefined,
+  orgSlug: string,
+  override: boolean,
+): Promise<void> {
+  const sourceFile = catalogRoot
+    ? path.join(catalogRoot, 'default', 'retention.json')
+    : path.join(process.env.TALE_CONFIG_DIR ?? '', 'default', 'retention.json');
+  const targetFile = path.join(
+    process.env.TALE_CONFIG_DIR ?? '',
+    orgSlug,
+    'retention.json',
+  );
+
+  const sourceExists = await stat(sourceFile)
+    .then(() => true)
+    .catch((err) => {
+      if (errnoCode(err) !== 'ENOENT') {
+        console.warn('[scaffold] retention: stat failed:', sourceFile, err);
+      }
+      return false;
+    });
+  if (!sourceExists) return;
+
+  if (await pathsOverlap(sourceFile, targetFile)) {
+    console.warn(`[scaffold] retention: source and target overlap; skipping`);
+    return;
+  }
+
+  const targetExists = await stat(targetFile)
+    .then(() => true)
+    .catch(() => false);
+  if (targetExists && !override) {
+    console.warn(
+      `[scaffold] retention: target ${targetFile} exists, skipping (use override:true to reseed)`,
+    );
+    return;
+  }
+
+  try {
+    const buf = await readFile(sourceFile);
+    await atomicWrite(targetFile, buf.toString('utf-8'));
+  } catch (err) {
+    console.error(
+      `[scaffold] retention: copy failed for org "${orgSlug}":`,
+      err instanceof Error ? err.message : err,
+    );
   }
 }
 
 /**
- * Remove a deleted org's per-domain filesystem dirs. Safety:
- * - Refuses the `default` slug (the global/system org's baseline).
- * - Uses each domain's own resolver so we only touch paths that follow
- *   the established convention (no manual string-building).
- * - Verifies the resolved per-org dir is strictly inside the domain's
- *   base dir via `verifyPathWithinBase` — blocks slug traversal like
- *   `../foo` even though `validateOrgSlug` should have already caught it.
- * - ENOENT on the per-org dir is silently ignored (idempotent; nothing
- *   to clean up).
+ * Remove a deleted org's entire `<orgSlug>/` subtree under
+ * `${TALE_CONFIG_DIR}`. Safety:
+ * - TALE_CONFIG_DIR must be set + absolute.
+ * - Refuses the literal `default` slug.
+ * - Validates the slug via `validateOrgSlug` so a NULL / `..` / cased
+ *   slug from a misbehaving caller can't slip through.
+ * - `verifyPathWithinBase` enforces strict descendant-of-root containment.
+ * - `lstat`-refuses a symlink at the org dir itself: `verifyPathWithinBase`
+ *   only realpath's the dirname, so a pre-placed symlink at
+ *   `<root>/<orgSlug>` would otherwise be followed by `rm -rf` to
+ *   arbitrary filesystem locations.
+ * - Two-phase rename-then-delete: rename to a `.deleted-<slug>-<ts>`
+ *   sibling first (atomic), then `rm -rf` the renamed path. Concurrent
+ *   writers of the original path fail with ENOENT instead of racing
+ *   the recursive delete.
+ * - Drops `{ force: true }` — `force` masks EACCES/EBUSY silently;
+ *   surface errors via the explicit ENOENT branch + error logging.
+ * - ENOENT on the org dir is idempotent (nothing to clean up).
  */
 export const cleanupOrgFilesystem = internalAction({
   args: {
@@ -184,6 +404,14 @@ export const cleanupOrgFilesystem = internalAction({
   },
   returns: v.null(),
   handler: async (_ctx, args) => {
+    const root = process.env.TALE_CONFIG_DIR;
+    if (!root || !path.isAbsolute(root)) {
+      console.error(
+        '[cleanupOrgFilesystem] TALE_CONFIG_DIR is unset or not absolute; refusing to proceed',
+      );
+      return null;
+    }
+
     if (args.orgSlug === 'default') {
       console.warn(
         '[cleanupOrgFilesystem] refusing to delete the default org filesystem',
@@ -191,47 +419,73 @@ export const cleanupOrgFilesystem = internalAction({
       return null;
     }
 
-    for (const domain of DOMAINS) {
-      const baseDir = domain.resolve('default');
-      let targetDir: string;
-      try {
-        targetDir = domain.resolve(args.orgSlug);
-      } catch (err) {
-        console.warn(
-          `[cleanupOrgFilesystem] ${domain.name}: skipping invalid slug "${args.orgSlug}":`,
-          err instanceof Error ? err.message : err,
-        );
-        continue;
-      }
+    if (!validateOrgSlug(args.orgSlug)) {
+      console.warn(
+        `[cleanupOrgFilesystem] refusing invalid slug "${args.orgSlug}"`,
+      );
+      return null;
+    }
 
-      // The default-org's base dir is the per-domain baseDir itself; a
-      // per-org dir must be a strict descendant, never equal.
-      if (targetDir === baseDir) {
-        console.warn(
-          `[cleanupOrgFilesystem] ${domain.name}: target equals base dir, skipping`,
-        );
-        continue;
-      }
+    const orgDir = path.join(root, args.orgSlug);
+    if (path.resolve(orgDir) === path.resolve(root)) {
+      console.warn(
+        `[cleanupOrgFilesystem] computed orgDir equals root, refusing`,
+      );
+      return null;
+    }
 
-      try {
-        await verifyPathWithinBase(targetDir, baseDir);
-      } catch (err) {
-        console.warn(
-          `[cleanupOrgFilesystem] ${domain.name}: path traversal guard tripped for "${args.orgSlug}":`,
-          err instanceof Error ? err.message : err,
-        );
-        continue;
-      }
+    try {
+      await verifyPathWithinBase(orgDir, root);
+    } catch (err) {
+      console.warn(
+        `[cleanupOrgFilesystem] path traversal guard tripped for "${args.orgSlug}":`,
+        err instanceof Error ? err.message : err,
+      );
+      return null;
+    }
 
-      try {
-        await rm(targetDir, { recursive: true, force: true });
-      } catch (err) {
-        if (errnoCode(err) === 'ENOENT') continue;
-        console.error(
-          `[cleanupOrgFilesystem] ${domain.name}: failed to remove "${targetDir}":`,
-          err instanceof Error ? err.message : err,
-        );
-      }
+    // Symlink hijack defense: verifyPathWithinBase leaves the basename
+    // unresolved. If <root>/<orgSlug> is itself a symlink (placed by an
+    // attacker or a misconfigured operator), rm -rf would follow it and
+    // delete arbitrary filesystem locations. Refuse explicitly here.
+    const info = await lstat(orgDir).catch((err) => {
+      if (errnoCode(err) === 'ENOENT') return null;
+      console.warn(
+        `[cleanupOrgFilesystem] lstat failed for "${orgDir}":`,
+        err instanceof Error ? err.message : err,
+      );
+      return null;
+    });
+    if (!info) return null;
+    if (info.isSymbolicLink()) {
+      console.error(
+        `[cleanupOrgFilesystem] refusing to delete symlinked org dir at "${orgDir}"`,
+      );
+      return null;
+    }
+
+    // Two-phase rename-then-delete. The rename is atomic within a
+    // filesystem; any concurrent writer of the original path fails with
+    // ENOENT instead of racing the recursive delete.
+    const condemned = path.join(root, `.deleted-${args.orgSlug}-${Date.now()}`);
+    try {
+      await rename(orgDir, condemned);
+    } catch (err) {
+      if (errnoCode(err) === 'ENOENT') return null;
+      console.error(
+        `[cleanupOrgFilesystem] rename failed for "${orgDir}" → "${condemned}":`,
+        err instanceof Error ? err.message : err,
+      );
+      return null;
+    }
+
+    try {
+      await rm(condemned, { recursive: true });
+    } catch (err) {
+      console.error(
+        `[cleanupOrgFilesystem] rm failed for "${condemned}" (org dir was renamed but not fully removed; manual cleanup required):`,
+        err instanceof Error ? err.message : err,
+      );
     }
 
     return null;
@@ -241,74 +495,30 @@ export const cleanupOrgFilesystem = internalAction({
 export const scaffoldNewOrganization = internalAction({
   args: {
     orgSlug: v.string(),
+    /**
+     * When true, overwrite the catalog-named subset of files in each
+     * domain, preserving `*.secrets.json` and `.history/`. When false
+     * (default), skip per-domain if the target already has visible
+     * files (idempotent org-create path).
+     */
+    override: v.optional(v.boolean()),
   },
   returns: v.null(),
   handler: async (_ctx, args) => {
-    if (args.orgSlug === 'default') {
-      // The default org's files are seeded by the Docker entrypoint; nothing to do.
+    if (!validateOrgSlug(args.orgSlug)) {
+      console.warn(
+        `[scaffoldNewOrganization] refusing invalid slug "${args.orgSlug}"`,
+      );
       return null;
     }
 
-    const builtinRoot = process.env[BUILTIN_ENV];
+    const catalogRoot = process.env[BUILTIN_ENV];
+    const override = args.override ?? false;
 
     for (const domain of DOMAINS) {
-      // Prefer `$TALE_CONFIG_BUILTIN_DIR/<domain>/` (set by platform
-      // Dockerfile, pushed into Convex's deployment env). Falls back to
-      // the default org's dir when the env is unset — covers local
-      // `bun dev` (no catalog built) and a rollback to a platform image
-      // that doesn't declare the env.
-      const sourceDir = builtinRoot
-        ? path.join(builtinRoot, domain.name)
-        : domain.resolve('default');
-      const targetDir = domain.resolve(args.orgSlug);
-
-      // copyTree's ENOENT-silent contract is correct for the fallback case
-      // (default-org dir may legitimately not be seeded yet). But when an
-      // operator-configured catalog path doesn't exist, that's a deploy
-      // misconfig (e.g., platform/convex image version skew) and the
-      // resulting zero-seed should NOT look like a successful copy. Probe
-      // explicitly so the failure surfaces in logs.
-      if (builtinRoot) {
-        const sourceExists = await stat(sourceDir)
-          .then(() => true)
-          .catch((err) => {
-            // ENOENT: catalog domain dir missing — a deploy misconfig
-            // (platform/convex image skew). Other errors (EACCES, EIO) are a
-            // distinct failure; log each accurately rather than mislabelling
-            // a permission error as "does not exist".
-            if (errnoCode(err) === 'ENOENT') {
-              console.error(
-                `[scaffoldNewOrganization] ${domain.name}: ${BUILTIN_ENV}=${builtinRoot} is set but ${sourceDir} does not exist; new org "${args.orgSlug}" will receive zero seed data for this domain`,
-              );
-            } else {
-              console.error(
-                `[scaffoldNewOrganization] ${domain.name}: stat ${sourceDir} failed:`,
-                err instanceof Error ? err.message : err,
-              );
-            }
-            return false;
-          });
-        if (!sourceExists) continue;
-      }
-
-      const alreadyScaffolded = await dirHasFiles(targetDir);
-      if (alreadyScaffolded) {
-        console.warn(
-          `[scaffoldNewOrganization] ${domain.name}: target ${targetDir} already has files, skipping`,
-        );
-        continue;
-      }
-
-      try {
-        await copyTree(sourceDir, targetDir, !domain.flat);
-      } catch (err) {
-        console.error(
-          `[scaffoldNewOrganization] ${domain.name}: copy failed for org "${args.orgSlug}":`,
-          err instanceof Error ? err.message : err,
-        );
-        // Continue with other domains; partial scaffolding is better than none.
-      }
+      await seedDomain(domain, catalogRoot, args.orgSlug, override);
     }
+    await seedRetention(catalogRoot, args.orgSlug, override);
 
     return null;
   },
diff --git a/services/platform/convex/providers/file_utils.ts b/services/platform/convex/providers/file_utils.ts
index 531788a44e..a55c7fd2e0 100644
--- a/services/platform/convex/providers/file_utils.ts
+++ b/services/platform/convex/providers/file_utils.ts
@@ -92,22 +92,16 @@ export function parseProviderSecrets(
   return result.data;
 }
 
-function getBaseDir(): string {
-  const dir = process.env.PROVIDERS_DIR;
-  if (dir) return dir;
+function getConfigRoot(): string {
   const configDir = process.env.TALE_CONFIG_DIR;
-  if (configDir) return path.join(configDir, 'providers');
-  throw new Error(
-    'Neither TALE_CONFIG_DIR nor PROVIDERS_DIR environment variable is set.',
-  );
+  if (configDir) return configDir;
+  throw new Error('TALE_CONFIG_DIR environment variable is not set.');
 }
 
 export function resolveProvidersDir(orgSlug: string): string {
   if (!validateOrgSlug(orgSlug))
     throw new Error(`Invalid org slug: ${orgSlug}`);
-  const baseDir = getBaseDir();
-  if (orgSlug === 'default') return baseDir;
-  return path.join(baseDir, orgSlug);
+  return path.join(getConfigRoot(), orgSlug, 'providers');
 }
 
 export function resolveProviderFilePath(
diff --git a/services/platform/convex/skills/file_actions.ts b/services/platform/convex/skills/file_actions.ts
index 3f043e9ed4..a46b693269 100644
--- a/services/platform/convex/skills/file_actions.ts
+++ b/services/platform/convex/skills/file_actions.ts
@@ -477,7 +477,7 @@ export const listSkills = action({
     const dir = resolveSkillsDir(orgSlug);
     const entries = await readdirSafe(dir);
     const slugs = entries.filter(
-      (e) => !e.startsWith('.') && !e.startsWith('@') && validateSkillSlug(e),
+      (e) => !e.startsWith('.') && validateSkillSlug(e),
     );
 
     const results = await Promise.all(
@@ -992,9 +992,7 @@ export const listSkillsForExecution = internalAction({
   handler: async (_ctx, args) => {
     const dir = resolveSkillsDir(args.orgSlug);
     const entries = await readdirSafe(dir);
-    return entries.filter(
-      (e) => !e.startsWith('.') && !e.startsWith('@') && validateSkillSlug(e),
-    );
+    return entries.filter((e) => !e.startsWith('.') && validateSkillSlug(e));
   },
 });
 
diff --git a/services/platform/convex/skills/file_utils.test.ts b/services/platform/convex/skills/file_utils.test.ts
index a4eee8c0fa..99d5813a23 100644
--- a/services/platform/convex/skills/file_utils.test.ts
+++ b/services/platform/convex/skills/file_utils.test.ts
@@ -13,30 +13,41 @@ import {
   validateSkillSlug,
 } from './file_utils';
 
-let skillsRoot: string;
-let prevSkillsDir: string | undefined;
+// Under the uniform org-first layout, every org's skills live at
+// `${TALE_CONFIG_DIR}/<orgSlug>/skills/` — including the default org
+// (which is no longer special-cased). All resolvers compose on top of
+// `${TALE_CONFIG_DIR}`; the per-domain SKILLS_DIR override has been dropped.
+let configRoot: string;
 let prevTaleConfigDir: string | undefined;
+let prevSkillsDir: string | undefined;
 
 beforeEach(async () => {
-  skillsRoot = await mkdtemp(path.join(tmpdir(), 'skills-test-'));
-  prevSkillsDir = process.env.SKILLS_DIR;
+  configRoot = await mkdtemp(path.join(tmpdir(), 'skills-test-'));
   prevTaleConfigDir = process.env.TALE_CONFIG_DIR;
-  process.env.SKILLS_DIR = skillsRoot;
-  delete process.env.TALE_CONFIG_DIR;
+  prevSkillsDir = process.env.SKILLS_DIR;
+  process.env.TALE_CONFIG_DIR = configRoot;
+  // Explicitly clear the legacy per-domain override so its presence in the
+  // shell env can't accidentally satisfy any leftover fallback.
+  delete process.env.SKILLS_DIR;
 });
 
 afterEach(async () => {
-  if (prevSkillsDir === undefined) {
-    delete process.env.SKILLS_DIR;
+  if (prevTaleConfigDir === undefined) {
+    delete process.env.TALE_CONFIG_DIR;
   } else {
-    process.env.SKILLS_DIR = prevSkillsDir;
-  }
-  if (prevTaleConfigDir !== undefined) {
     process.env.TALE_CONFIG_DIR = prevTaleConfigDir;
   }
-  await rm(skillsRoot, { recursive: true, force: true });
+  if (prevSkillsDir !== undefined) {
+    process.env.SKILLS_DIR = prevSkillsDir;
+  }
+  await rm(configRoot, { recursive: true, force: true });
 });
 
+// Helper: where this test's "default org skills dir" lives under org-first.
+function defaultSkillsDir(): string {
+  return path.join(configRoot, 'default', 'skills');
+}
+
 describe('validateSkillSlug', () => {
   it('accepts hyphen-separated lowercase slugs', () => {
     expect(validateSkillSlug('code-reviewer')).toBe(true);
@@ -67,14 +78,14 @@ describe('validateSkillSlug', () => {
   });
 });
 
-describe('resolveSkillsDir (org isolation)', () => {
-  it('default org uses base dir directly', () => {
-    expect(resolveSkillsDir('default')).toBe(skillsRoot);
+describe('resolveSkillsDir (org isolation, org-first)', () => {
+  it('default org lives at <root>/default/skills/', () => {
+    expect(resolveSkillsDir('default')).toBe(defaultSkillsDir());
   });
 
-  it('other orgs live under @<orgSlug>/', () => {
+  it('other orgs live at <root>/<orgSlug>/skills/ (no @-prefix)', () => {
     expect(resolveSkillsDir('acme-corp')).toBe(
-      path.join(skillsRoot, '@acme-corp'),
+      path.join(configRoot, 'acme-corp', 'skills'),
     );
   });
 
@@ -85,9 +96,9 @@ describe('resolveSkillsDir (org isolation)', () => {
 });
 
 describe('resolveSkillDir', () => {
-  it('returns path under skills root', () => {
+  it('returns path under <org>/skills/<slug>', () => {
     const p = resolveSkillDir('default', 'code-reviewer');
-    expect(p).toBe(path.join(skillsRoot, 'code-reviewer'));
+    expect(p).toBe(path.join(defaultSkillsDir(), 'code-reviewer'));
   });
 
   it('rejects invalid slugs upstream', () => {
@@ -99,7 +110,7 @@ describe('resolveSkillDir', () => {
 describe('resolveSkillMdPath', () => {
   it('appends SKILL.md', () => {
     expect(resolveSkillMdPath('default', 'code-reviewer')).toBe(
-      path.join(skillsRoot, 'code-reviewer', 'SKILL.md'),
+      path.join(defaultSkillsDir(), 'code-reviewer', 'SKILL.md'),
     );
   });
 });
@@ -112,7 +123,7 @@ describe('resolveSkillAssetPath (traversal hardening)', () => {
       'scripts/extract.py',
     );
     expect(p).toBe(
-      path.join(skillsRoot, 'pdf-extractor', 'scripts', 'extract.py'),
+      path.join(defaultSkillsDir(), 'pdf-extractor', 'scripts', 'extract.py'),
     );
   });
 
@@ -161,11 +172,11 @@ describe('resolveSkillAssetPath (traversal hardening)', () => {
 
 describe('resolveSkillAssetPathChecked (realpath / symlink defense)', () => {
   it('catches a symlink planted as an intermediate directory', async () => {
-    // skills/<slug>/escape → ../../outside
+    // <root>/default/skills/<slug>/escape → ../../../outside
     const slug = 'symlink-test';
-    const skillDir = path.join(skillsRoot, slug);
+    const skillDir = path.join(defaultSkillsDir(), slug);
     await mkdir(skillDir, { recursive: true });
-    const outside = path.join(skillsRoot, '..', 'outside');
+    const outside = path.join(configRoot, 'outside');
     await mkdir(outside, { recursive: true });
     await symlink(outside, path.join(skillDir, 'escape'));
 
@@ -178,7 +189,7 @@ describe('resolveSkillAssetPathChecked (realpath / symlink defense)', () => {
 
   it('allows asset reads through a real subdirectory', async () => {
     const slug = 'normal-test';
-    const dir = path.join(skillsRoot, slug, 'scripts');
+    const dir = path.join(defaultSkillsDir(), slug, 'scripts');
     await mkdir(dir, { recursive: true });
     await writeFile(path.join(dir, 'run.py'), 'print("ok")');
 
@@ -187,6 +198,8 @@ describe('resolveSkillAssetPathChecked (realpath / symlink defense)', () => {
       slug,
       'scripts/run.py',
     );
-    expect(resolved).toBe(path.join(skillsRoot, slug, 'scripts', 'run.py'));
+    expect(resolved).toBe(
+      path.join(defaultSkillsDir(), slug, 'scripts', 'run.py'),
+    );
   });
 });
diff --git a/services/platform/convex/skills/file_utils.ts b/services/platform/convex/skills/file_utils.ts
index e0fb07ad08..3af44e3b14 100644
--- a/services/platform/convex/skills/file_utils.ts
+++ b/services/platform/convex/skills/file_utils.ts
@@ -8,11 +8,11 @@
  * agents/file_utils.ts and integrations/file_utils.ts but uses Markdown +
  * YAML frontmatter as the wire format (per agentskills.io spec).
  *
- * Org isolation: default org sits at `${SKILLS_DIR}/`; other orgs live
- * under `${SKILLS_DIR}/@<orgSlug>/` — same `@` prefix convention used by
- * integrations. Every resolver applies a path-traversal guard plus a
- * `verifyPathWithinBase` realpath check so symlinks planted in the bundle
- * cannot escape the skill's directory.
+ * Org isolation: every org's skills live under
+ * `${TALE_CONFIG_DIR}/<orgSlug>/skills/` — uniform org-first layout. Every
+ * resolver applies a path-traversal guard plus a `verifyPathWithinBase`
+ * realpath check so symlinks planted in the bundle cannot escape the
+ * skill's directory.
  */
 
 import { constants, lstat, open } from 'node:fs/promises';
@@ -94,30 +94,25 @@ export function validateSkillSlug(slug: string): boolean {
   return true;
 }
 
-function getBaseDir(): string {
-  const dir = process.env.SKILLS_DIR;
-  if (dir) return dir;
+function getConfigRoot(): string {
   const configDir = process.env.TALE_CONFIG_DIR;
-  if (configDir) return path.join(configDir, 'skills');
+  if (configDir) return configDir;
   throw new Error(
-    'Neither TALE_CONFIG_DIR nor SKILLS_DIR environment variable is set. ' +
-      'Set TALE_CONFIG_DIR in .env to the root config directory ' +
+    'TALE_CONFIG_DIR environment variable is not set. ' +
+      'Set it to the root config directory ' +
       '(e.g., TALE_CONFIG_DIR=/path/to/tale/examples).',
   );
 }
 
 /**
- * Resolve the skills directory for an organization. Default org uses the
- * base directly; every other org lives under a `@<orgSlug>/` prefix —
- * matches the convention enforced by integrations and agents.
+ * Resolve the skills directory for an organization. Org-first:
+ * `${TALE_CONFIG_DIR}/<orgSlug>/skills/`.
  */
 export function resolveSkillsDir(orgSlug: string): string {
   if (!validateOrgSlug(orgSlug)) {
     throw new Error(`Invalid org slug: ${orgSlug}`);
   }
-  const baseDir = getBaseDir();
-  if (orgSlug === 'default') return baseDir;
-  return path.join(baseDir, `@${orgSlug}`);
+  return path.join(getConfigRoot(), orgSlug, 'skills');
 }
 
 export function resolveSkillDir(orgSlug: string, slug: string): string {
diff --git a/services/platform/convex/workflows/file_utils.ts b/services/platform/convex/workflows/file_utils.ts
index 2c97a61c7c..9bfcf3debd 100644
--- a/services/platform/convex/workflows/file_utils.ts
+++ b/services/platform/convex/workflows/file_utils.ts
@@ -75,32 +75,26 @@ export function urlParamToSlug(param: string): string {
   return param.replace(new RegExp(SLUG_SEPARATOR, 'g'), '/');
 }
 
-function getBaseDir(): string {
-  const dir = process.env.WORKFLOWS_DIR;
-  if (dir) return dir;
+function getConfigRoot(): string {
   const configDir = process.env.TALE_CONFIG_DIR;
-  if (configDir) return path.join(configDir, 'workflows');
+  if (configDir) return configDir;
   throw new Error(
-    'Neither TALE_CONFIG_DIR nor WORKFLOWS_DIR environment variable is set. ' +
-      'Set TALE_CONFIG_DIR in .env to the root config directory ' +
+    'TALE_CONFIG_DIR environment variable is not set. ' +
+      'Set it to the root config directory ' +
       '(e.g., TALE_CONFIG_DIR=/path/to/tale/examples).',
   );
 }
 
 /**
- * Resolve the workflows directory for an organization.
- * Default org uses the base dir directly.
- * Other orgs use `{baseDir}/@{orgSlug}/` to prevent collision with workflow folders.
+ * Resolve the workflows directory for an organization. Org-first:
+ * `${TALE_CONFIG_DIR}/<orgSlug>/workflows/`. No `@`-prefix collision concern
+ * here since workflow folders live inside the per-org subtree.
  */
 export function resolveWorkflowsDir(orgSlug: string): string {
   if (!validateOrgSlug(orgSlug)) {
     throw new Error(`Invalid org slug: ${orgSlug}`);
   }
-  const baseDir = getBaseDir();
-  if (orgSlug === 'default') {
-    return baseDir;
-  }
-  return path.join(baseDir, `@${orgSlug}`);
+  return path.join(getConfigRoot(), orgSlug, 'workflows');
 }
 
 /**
diff --git a/services/platform/docker-entrypoint.sh b/services/platform/docker-entrypoint.sh
index bcfe181bac..0ac3c78b6d 100644
--- a/services/platform/docker-entrypoint.sh
+++ b/services/platform/docker-entrypoint.sh
@@ -227,30 +227,25 @@ deploy_convex_functions() {
     CONVEX_ENV_MAP["$key"]="${line#*=}"
   done <<< "$CONVEX_ENV_OUTPUT"
 
-  # One-shot cleanup: remove env vars that earlier Tale versions auto-pushed
-  # but the current architecture derives from TALE_CONFIG_DIR.
-  #
-  # Safety: only remove the var if its current value matches the auto-derived
-  # path (i.e. it's a stale auto-push, not an operator's custom override).
-  # An override like AGENTS_DIR=/data/custom-agents is preserved untouched.
-  local config_dir="${TALE_CONFIG_DIR:-/app/data}"
-  local -A ORPHAN_DERIVED=(
-    [AGENTS_DIR]="${config_dir}/agents"
-    [WORKFLOWS_DIR]="${config_dir}/workflows"
-    [INTEGRATIONS_DIR]="${config_dir}/integrations"
-    [PROVIDERS_DIR]="${config_dir}/providers"
+  # Unconditional purge: the per-domain env overrides (AGENTS_DIR /
+  # WORKFLOWS_DIR / INTEGRATIONS_DIR / PROVIDERS_DIR / SKILLS_DIR) are no
+  # longer honored by the resolvers under the uniform org-first layout.
+  # Remove them from the Convex deployment env on every boot, regardless
+  # of whether they look auto-derived or operator-customized. Operators
+  # who previously relied on a custom value must now point TALE_CONFIG_DIR
+  # at the root and use the `<orgSlug>/<domain>/` subtree.
+  local -a LEGACY_DOMAIN_VARS=(
+    AGENTS_DIR
+    WORKFLOWS_DIR
+    INTEGRATIONS_DIR
+    PROVIDERS_DIR
+    SKILLS_DIR
   )
-  for orphan in "${!ORPHAN_DERIVED[@]}"; do
-    if [ "${CONVEX_ENV_MAP[$orphan]+_}" ]; then
-      local current="${CONVEX_ENV_MAP[$orphan]}"
-      local derived="${ORPHAN_DERIVED[$orphan]}"
-      if [ "$current" = "$derived" ]; then
-        if bunx convex env remove "$orphan" --url "$CONVEX_URL" --admin-key "$ADMIN_KEY" >/dev/null 2>&1; then
-          echo "   ✓ $orphan (orphan removed — derived from TALE_CONFIG_DIR)"
-          unset 'CONVEX_ENV_MAP[$orphan]'
-        fi
-      else
-        log_info "$orphan=$current preserved (custom override; not the derived $derived)"
+  for legacy in "${LEGACY_DOMAIN_VARS[@]}"; do
+    if [ "${CONVEX_ENV_MAP[$legacy]+_}" ]; then
+      if bunx convex env remove "$legacy" --url "$CONVEX_URL" --admin-key "$ADMIN_KEY" >/dev/null 2>&1; then
+        echo "   ✓ $legacy removed (no longer honored under org-first layout)"
+        unset 'CONVEX_ENV_MAP[$legacy]'
       fi
     fi
   done
diff --git a/services/platform/env.sh b/services/platform/env.sh
index 107e8af686..94972118f9 100644
--- a/services/platform/env.sh
+++ b/services/platform/env.sh
@@ -55,9 +55,12 @@ env_normalize_common() {
 	  export INSTANCE_NAME="tale_platform"
 	  export INSTANCE_SECRET="${INSTANCE_SECRET}"
 
-  # Root config directory. Sub-dirs (agents/workflows/integrations/providers)
-  # are derived inside Convex via `convex/*/file_utils.ts` — no need to set
-  # AGENTS_DIR / WORKFLOWS_DIR / INTEGRATIONS_DIR / PROVIDERS_DIR explicitly.
+  # Root config directory. Per-org subtrees live at $TALE_CONFIG_DIR/<orgSlug>/
+  # with one subdir per domain (agents/, workflows/, providers/, etc.).
+  # Per-domain env overrides (AGENTS_DIR / WORKFLOWS_DIR / INTEGRATIONS_DIR /
+  # PROVIDERS_DIR / SKILLS_DIR) are no longer honored — set TALE_CONFIG_DIR
+  # only. The entrypoint purges those legacy vars from the Convex deployment
+  # env on every boot.
   export TALE_CONFIG_DIR="${TALE_CONFIG_DIR:-/app/data}"
 
   # Site URL - the canonical base URL for the platform (required)
diff --git a/services/platform/lib/config-watcher.ts b/services/platform/lib/config-watcher.ts
index 9851a3122c..1ee647ed3a 100644
--- a/services/platform/lib/config-watcher.ts
+++ b/services/platform/lib/config-watcher.ts
@@ -15,27 +15,37 @@ interface ConfigChangeEvent {
 }
 
 const ATOMIC_WRITE_TMP_RE = /\.\d+\.[a-f0-9]{8}\.tmp$/;
+// Must match validateOrgSlug at services/platform/convex/lib/file_io.ts.
+const ORG_SLUG_REGEX = /^[a-z0-9][a-z0-9_-]*$/;
 
 /**
- * Parse a relative path within the config directory into a structured event.
+ * Parse a relative path within the config directory into a structured event,
+ * under the uniform org-first layout `${TALE_CONFIG_DIR}/<orgSlug>/<domain>/...`.
  *
- * Examples:
- *   agents/my-agent.json           → { type: 'agent', slug: 'my-agent' }
- *   agents/@acme/my-agent.json     → { type: 'agent', orgSlug: 'acme', slug: 'my-agent' }
- *   workflows/general/hello.json   → { type: 'workflow', slug: 'general/hello' }
- *   workflows/@acme/hello.json     → { type: 'workflow', orgSlug: 'acme', slug: 'hello' }
- *   integrations/slack/config.json → { type: 'integration', slug: 'slack' }
- *   integrations/@acme/slack/config.json → { type: 'integration', orgSlug: 'acme', slug: 'slack' }
- *   branding/branding.json         → { type: 'branding' }
+ * Examples (with `default` as one possible orgSlug):
+ *   default/agents/my-agent.json           → { type: 'agents', orgSlug: 'default', slug: 'my-agent' }
+ *   acme/agents/my-agent.json              → { type: 'agents', orgSlug: 'acme', slug: 'my-agent' }
+ *   default/workflows/general/hello.json   → { type: 'workflows', orgSlug: 'default', slug: 'general/hello' }
+ *   default/integrations/slack/config.json → { type: 'integrations', orgSlug: 'default', slug: 'slack' }
+ *   default/branding/branding.json         → { type: 'branding', orgSlug: 'default' }
+ *   default/skills/code-reviewer/SKILL.md  → { type: 'skills', orgSlug: 'default', slug: 'code-reviewer' }
+ *
+ * Returns null for paths that don't fit the `<org>/<domain>/<rest>` shape
+ * (org slug must validate; domain must be recognized).
  */
 function parseConfigChange(relativePath: string): ConfigChangeEvent | null {
   const parts = relativePath.split('/');
   if (parts.length < 2) return null;
 
-  const topDir = parts[0];
+  const orgSlug = parts[0];
+  if (!ORG_SLUG_REGEX.test(orgSlug)) return null;
+
+  const domain = parts[1];
 
-  if (topDir === 'branding') {
-    return { type: 'branding' };
+  if (domain === 'branding') {
+    // Branding is default-only on the read side, but still emit per-org so
+    // future per-org branding (or operator inspection) sees the event.
+    return { type: 'branding', orgSlug };
   }
 
   const typeMap: Record<string, ConfigChangeEvent['type']> = {
@@ -46,46 +56,38 @@ function parseConfigChange(relativePath: string): ConfigChangeEvent | null {
     skills: 'skills',
   };
 
-  const type = typeMap[topDir];
+  const type = typeMap[domain];
   if (!type) return null;
 
-  const rest = parts.slice(1);
-  let orgSlug: string | undefined;
-
-  // If the first segment after the top dir starts with @, it's an org slug
-  if (rest[0]?.startsWith('@')) {
-    orgSlug = rest[0].slice(1);
-    rest.shift();
-  }
-
+  const rest = parts.slice(2);
   if (rest.length === 0) return null;
 
   if (type === 'agents') {
-    // agents/[@org/]name.json
+    // <org>/agents/<name>.json
     const filename = rest[0];
     return { type, orgSlug, slug: filename.replace(/\.json$/, '') };
   }
 
   if (type === 'workflows') {
-    // workflows/[@org/][folder/]name.json — slug is the path without extension
+    // <org>/workflows/[folder/]name.json — slug is the path without extension
     const slug = rest.join('/').replace(/\.json$/, '');
     return { type, orgSlug, slug };
   }
 
   if (type === 'integrations') {
-    // integrations/[@org/]slug/config.json
+    // <org>/integrations/<slug>/config.json (or other bundle files)
     const slug = rest[0];
     return { type, orgSlug, slug };
   }
 
   if (type === 'providers') {
-    // providers/[@org/]name.json
+    // <org>/providers/<name>.json
     const filename = rest[0];
     return { type, orgSlug, slug: filename.replace(/\.json$/, '') };
   }
 
   if (type === 'skills') {
-    // skills/[@org/]slug/SKILL.md (or any asset under the slug dir).
+    // <org>/skills/<slug>/SKILL.md (or any asset under the slug dir).
     // Emit at slug granularity so a write to scripts/x.py invalidates the
     // same query keys as a SKILL.md write.
     const slug = rest[0];
diff --git a/services/platform/lib/shared/schemas/governance.ts b/services/platform/lib/shared/schemas/governance.ts
index dc4ab1a2d6..6368055ed8 100644
--- a/services/platform/lib/shared/schemas/governance.ts
+++ b/services/platform/lib/shared/schemas/governance.ts
@@ -161,7 +161,7 @@ export type UploadPolicyConfig = z.infer<typeof uploadPolicyConfigSchema>;
 /**
  * Per-org retention policy payload. Schema only validates structural
  * shape (integer + non-negative); category min/max bounds live in
- * `examples/retention/default.json` (or per-org override files) and are
+ * `examples/default/retention.json` (or per-org override files) and are
  * enforced at write time by `assertWithinBounds` inside
  * `upsertRetentionPolicyAction`. Operators tighten or rename bounds by
  * editing the JSON file; the schema does not duplicate them.
diff --git a/services/platform/lib/shared/schemas/retention.test.ts b/services/platform/lib/shared/schemas/retention.test.ts
index 9d85f28c97..d4eed59ebd 100644
--- a/services/platform/lib/shared/schemas/retention.test.ts
+++ b/services/platform/lib/shared/schemas/retention.test.ts
@@ -96,7 +96,7 @@ describe('retentionBoundDefSchema', () => {
 });
 
 describe('retentionDefaultsConfigSchema', () => {
-  it('accepts examples/retention/default.json (every category + root envPrefix + full envNames map)', () => {
+  it('accepts examples/default/retention.json (every category + root envPrefix + full envNames map)', () => {
     // Resolve from this test's directory up to repo root, then to examples/.
     // __dirname is services/platform/lib/shared/schemas/
     const examplePath = join(
@@ -107,8 +107,8 @@ describe('retentionDefaultsConfigSchema', () => {
       '..',
       '..',
       'examples',
-      'retention',
-      'default.json',
+      'default',
+      'retention.json',
     );
     const content = readFileSync(examplePath, 'utf-8');
     const parsed = JSON.parse(content);
@@ -121,7 +121,7 @@ describe('retentionDefaultsConfigSchema', () => {
     // Strict drift check: factory file declares every category and the
     // root `_metadata.envNames` map covers every (category × field)
     // pair (16 × 3 = 48 entries). Adding a new category to
-    // RETENTION_CATEGORIES without updating examples/retention/default.json
+    // RETENTION_CATEGORIES without updating examples/default/retention.json
     // fails one of these assertions loudly.
     expect(typeof parsed._metadata?.envPrefix).toBe('string');
     expect(parsed._metadata.envPrefix.length).toBeGreaterThan(0);
diff --git a/services/platform/lib/shared/utils/example-agents-normalized.test.ts b/services/platform/lib/shared/utils/example-agents-normalized.test.ts
index 245ea9cdd0..8234270dad 100644
--- a/services/platform/lib/shared/utils/example-agents-normalized.test.ts
+++ b/services/platform/lib/shared/utils/example-agents-normalized.test.ts
@@ -7,19 +7,23 @@ import type { AgentJsonConfig } from '../../../convex/agents/file_utils';
 import { isNormalized, normalizeAgentConfig } from './normalize-agent-config';
 
 /**
- * Every agent JSON in `examples/agents/` is treated as part of the shipped
- * product — new orgs scaffold their agent directory by copying these files
- * via `scaffoldNewOrganization`, which goes around the `normalizeAgentConfig`
- * write boundary. If an example ever drifts into a non-normalized shape
- * (legacy top-level co-existing with i18n[defaultLocale], empty-string
- * placeholders, etc.), new orgs will inherit the pollution on creation.
+ * Every agent JSON in `examples/default/agents/` is treated as part of the
+ * shipped product — new orgs scaffold their agent directory by copying these
+ * files via `scaffoldNewOrganization`, which goes around the
+ * `normalizeAgentConfig` write boundary. If an example ever drifts into a
+ * non-normalized shape (legacy top-level co-existing with i18n[defaultLocale],
+ * empty-string placeholders, etc.), new orgs will inherit the pollution on
+ * creation.
  *
  * This test pins the invariant at build time so any drift fails CI.
  */
 
-const EXAMPLES_DIR = path.resolve(__dirname, '../../../../../examples/agents');
+const EXAMPLES_DIR = path.resolve(
+  __dirname,
+  '../../../../../examples/default/agents',
+);
 
-describe('examples/agents/*.json invariants', () => {
+describe('examples/default/agents/*.json invariants', () => {
   const files = readdirSync(EXAMPLES_DIR).filter((f) => f.endsWith('.json'));
 
   it('discovered at least one example agent', () => {
diff --git a/services/platform/server.ts b/services/platform/server.ts
index 3e02354808..a02df47df1 100644
--- a/services/platform/server.ts
+++ b/services/platform/server.ts
@@ -83,8 +83,11 @@ const port = process.env.PORT || 3000;
 const moduleDir = dirname(fileURLToPath(import.meta.url));
 const distDir = join(moduleDir, 'dist');
 const distSeoDir = join(moduleDir, 'dist-seo');
+// Branding is default-only on the read side (see branding/file_actions.ts —
+// every reader passes the literal 'default'). On-disk location follows the
+// uniform org-first layout: `${TALE_CONFIG_DIR}/default/branding/images/`.
 const brandingImagesDir = process.env.TALE_CONFIG_DIR
-  ? join(process.env.TALE_CONFIG_DIR, 'branding', 'images')
+  ? join(process.env.TALE_CONFIG_DIR, 'default', 'branding', 'images')
   : null;
 
 // Lazily loaded once per process. The manifest is read on the first
diff --git a/services/platform/vite-plugins/serve-branding-images.ts b/services/platform/vite-plugins/serve-branding-images.ts
index 9d74f69413..e05fe95e4d 100644
--- a/services/platform/vite-plugins/serve-branding-images.ts
+++ b/services/platform/vite-plugins/serve-branding-images.ts
@@ -14,8 +14,12 @@ const MIME_TYPES: Record<string, string> = {
 };
 
 export function serveBrandingImages(): Plugin {
+  // Branding is default-only on the read side (see branding/file_actions.ts).
+  // On-disk location: `${TALE_CONFIG_DIR}/default/branding/images/`.
   const configDir = process.env.TALE_CONFIG_DIR;
-  const imagesDir = configDir ? join(configDir, 'branding', 'images') : null;
+  const imagesDir = configDir
+    ? join(configDir, 'default', 'branding', 'images')
+    : null;
 
   return {
     name: 'serve-branding-images',
diff --git a/tools/cli/src/commands/deploy/index.ts b/tools/cli/src/commands/deploy/index.ts
index 4fe3f59c49..c7d9816dd5 100644
--- a/tools/cli/src/commands/deploy/index.ts
+++ b/tools/cli/src/commands/deploy/index.ts
@@ -34,13 +34,15 @@ export function createDeployCommand(): Command {
     )
     .option('-q, --quiet', 'Suppress container logs during deployment')
     .option(
-      '-y, --yes',
-      'Non-interactive: automatically accept any pending migrations',
+      '--override-all',
+      'After deploy, factory-reseed the builtin catalog into ALL orgs server-side ' +
+        '(preserves *.secrets.json, .history/, and uploaded branding/images/). ' +
+        'Implies --all (recreates stateful services so the new entrypoint runs).',
       false,
     )
     .option(
-      '--migrate-volumes',
-      '[deprecated] alias for --yes; will be removed in a future release',
+      '-y, --yes',
+      'Non-interactive: auto-accept destructive confirmation prompts (e.g. --override-all)',
       false,
     )
     .action(async (options) => {
@@ -84,22 +86,20 @@ export function createDeployCommand(): Command {
           services = serviceList as ServiceName[];
         }
 
-        if (options.migrateVolumes && !options.yes) {
-          logger.warn(
-            '--migrate-volumes is deprecated; use --yes for non-interactive migration acceptance.',
-          );
-        }
         const hostAlias = options.host ?? process.env.HOST ?? 'tale.local';
         await deploy({
           version,
-          updateStateful: options.all,
+          // --override-all implies --all so the convex container restarts
+          // with the new entrypoint + new code before the reseed action runs.
+          updateStateful: options.all || options.overrideAll,
           env,
           hostAlias,
           dryRun: options.dryRun,
           services,
           override: options.override,
+          overrideAll: options.overrideAll,
           quiet: options.quiet,
-          assumeYes: options.yes || options.migrateVolumes,
+          assumeYes: options.yes,
           forceRecreate,
         });
       } catch (err) {
diff --git a/tools/cli/src/commands/migrate.ts b/tools/cli/src/commands/migrate.ts
new file mode 100644
index 0000000000..d2e3b8cbde
--- /dev/null
+++ b/tools/cli/src/commands/migrate.ts
@@ -0,0 +1,41 @@
+import { Command } from 'commander';
+
+import { migrateConfigLayout } from '../lib/actions/migrate-config-layout';
+import { requireProject } from '../lib/project/find-project';
+import { resolveProjectContext } from '../lib/project/project-context';
+import * as logger from '../utils/logger';
+
+export function createMigrateCommand(): Command {
+  const migrateCmd = new Command('migrate').description(
+    'One-shot, manually-run config migrations',
+  );
+
+  migrateCmd
+    .command('config-layout')
+    .description(
+      'Relocate providers/*.secrets.json from the legacy per-domain layout ' +
+        'to the org-first layout. Idempotent; copies (not moves) so old paths ' +
+        'remain readable until --cleanup-old runs.',
+    )
+    .option('--dry-run', 'Preview moves without changing files', false)
+    .option(
+      '--cleanup-old',
+      'After verifying new == old (sha256), remove the old-path secrets. ' +
+        'Run only after the new deployment is healthy.',
+      false,
+    )
+    .action(async (opts: { dryRun?: boolean; cleanupOld?: boolean }) => {
+      try {
+        await resolveProjectContext(requireProject());
+        await migrateConfigLayout({
+          dryRun: opts.dryRun ?? false,
+          cleanupOld: opts.cleanupOld ?? false,
+        });
+      } catch (err) {
+        logger.error(err instanceof Error ? err.message : String(err));
+        process.exit(1);
+      }
+    });
+
+  return migrateCmd;
+}
diff --git a/tools/cli/src/commands/start/index.ts b/tools/cli/src/commands/start/index.ts
index 636413b154..6b4bc54efa 100644
--- a/tools/cli/src/commands/start/index.ts
+++ b/tools/cli/src/commands/start/index.ts
@@ -1,36 +1,47 @@
-import { Command } from 'commander';
+import { Command, Option } from 'commander';
 
 import { start } from '../../lib/actions/start';
 import * as logger from '../../utils/logger';
 
 export function createStartCommand(): Command {
-  return new Command('start')
-    .description('Start Tale platform locally with project files')
-    .option('-d, --detach', 'run in background')
-    .option('-p, --port <port>', 'HTTPS port to expose', '443')
-    .option('--host <hostname>', 'host alias for proxy', 'tale.local')
-    .option(
-      '-y, --yes',
-      'automatically accept any pending migrations (non-interactive; required in CI/non-TTY)',
-    )
-    .action(
-      async (opts: {
-        detach?: boolean;
-        port: string;
-        host: string;
-        yes?: boolean;
-      }) => {
-        try {
-          await start({
-            detach: opts.detach,
-            port: Number(opts.port),
-            host: opts.host,
-            assumeYes: opts.yes,
-          });
-        } catch (err) {
-          logger.error(err instanceof Error ? err.message : String(err));
-          process.exit(1);
-        }
-      },
-    );
+  return (
+    new Command('start')
+      .description('Start Tale platform locally with project files')
+      .option('-d, --detach', 'run in background')
+      .option('-p, --port <port>', 'HTTPS port to expose', '443')
+      .option('--host <hostname>', 'host alias for proxy', 'tale.local')
+      // Hidden back-compat: `tale start -y` used to skip migration prompts.
+      // The auto-migration framework is gone but operator CI scripts may
+      // still pass `-y`. Accept and ignore for one release, then remove.
+      .addOption(
+        new Option(
+          '-y, --yes',
+          '[deprecated] no longer needed (auto-migrations removed); ignored',
+        ).hideHelp(),
+      )
+      .action(
+        async (opts: {
+          detach?: boolean;
+          port: string;
+          host: string;
+          yes?: boolean;
+        }) => {
+          try {
+            if (opts.yes) {
+              logger.warn(
+                '--yes/-y is deprecated on `tale start` and ignored; safe to remove from scripts.',
+              );
+            }
+            await start({
+              detach: opts.detach,
+              port: Number(opts.port),
+              host: opts.host,
+            });
+          } catch (err) {
+            logger.error(err instanceof Error ? err.message : String(err));
+            process.exit(1);
+          }
+        },
+      )
+  );
 }
diff --git a/tools/cli/src/index.ts b/tools/cli/src/index.ts
index c5d1d16c86..b396d70f55 100644
--- a/tools/cli/src/index.ts
+++ b/tools/cli/src/index.ts
@@ -10,6 +10,7 @@ import { createDeployCommand } from './commands/deploy';
 import { createDoctorCommand } from './commands/doctor';
 import { createInitCommand } from './commands/init';
 import { createLogsCommand } from './commands/logs';
+import { createMigrateCommand } from './commands/migrate';
 import { createResetCommand } from './commands/reset';
 import { createRollbackCommand } from './commands/rollback';
 import { createStartCommand } from './commands/start';
@@ -47,5 +48,6 @@ program.addCommand(createRollbackCommand());
 program.addCommand(createResetCommand());
 program.addCommand(createCleanupCommand());
 program.addCommand(createDoctorCommand());
+program.addCommand(createMigrateCommand());
 
 await program.parseAsync();
diff --git a/tools/cli/src/lib/actions/deploy.ts b/tools/cli/src/lib/actions/deploy.ts
index 5ce54c0144..6f235fe576 100644
--- a/tools/cli/src/lib/actions/deploy.ts
+++ b/tools/cli/src/lib/actions/deploy.ts
@@ -1,5 +1,5 @@
-import { existsSync } from 'node:fs';
-import { cp, mkdtemp, rm } from 'node:fs/promises';
+import { lstatSync } from 'node:fs';
+import { cp, mkdtemp, readdir, rm } from 'node:fs/promises';
 import { tmpdir } from 'node:os';
 import { join } from 'node:path';
 
@@ -34,8 +34,7 @@ import { getNextColor } from '../state/get-next-color';
 import { setCurrentColor } from '../state/set-current-color';
 import { setPreviousVersion } from '../state/set-previous-version';
 import { withLock } from '../state/with-lock';
-import { MIGRATIONS } from '../upgrade/registry';
-import { runPendingMigrations } from '../upgrade/runner';
+import { reseedAllOrgsFromBuiltin } from './reseed-all-orgs';
 
 async function ensureInfrastructure(
   prefix: string,
@@ -73,11 +72,16 @@ interface DeployOptions {
   dryRun: boolean;
   services?: ServiceName[];
   override?: boolean;
+  /**
+   * Factory-reseed builtin → all orgs after deploy completes. Triggers a
+   * server-side reseed action; preserves *.secrets.json, .history/, and
+   * uploaded branding/images/. Combined with `override`, host-push runs
+   * first, then the all-orgs reseed.
+   */
+  overrideAll?: boolean;
   quiet?: boolean;
-  /** Non-interactive acceptance of any pending migrations. */
+  /** Non-interactive: accept destructive confirmation prompts (e.g. --override-all). */
   assumeYes?: boolean;
-  /** @deprecated use assumeYes. Kept for one release of CLI back-compat. */
-  migrateVolumes?: boolean;
   /**
    * Set by the caller when `ensureEnv` filled in auto-gen secrets headlessly
    * (e.g. an upgrade silently materialized `SANDBOX_TOKEN`). All subsequent
@@ -141,56 +145,8 @@ export async function deploy(options: DeployOptions): Promise<void> {
       const prefix = dryRun ? '[DRY-RUN] ' : '';
       logger.header(`${prefix}Deploying Tale ${version}`);
 
-      // Detect and apply any pending migrations before deploying. The runner
-      // prints the plan and prompts the user (default No) when anything is
-      // pending; non-interactive callers must pass --yes (aliased from the
-      // deprecated --migrate-volumes). Declining aborts deploy cleanly.
-      {
-        const migrationResult = await runPendingMigrations(
-          MIGRATIONS,
-          { projectId: getProjectId(), projectDir: env.DEPLOY_DIR },
-          {
-            context: 'deploy',
-            assumeYes: options.assumeYes ?? options.migrateVolumes,
-            dryRun,
-            async performStops(stops) {
-              // `stops` may contain compose project names (e.g. 'tale',
-              // 'tale-blue') and/or individual container names (e.g.
-              // '${projectId}-platform-blue'). Try each as a compose project
-              // first, fall back to plain `docker stop`. Failures MUST
-              // surface — a silently-swallowed stop can let the migration
-              // copy a live volume, corrupting data.
-              for (const name of stops) {
-                const composeDown = await exec(
-                  'docker',
-                  ['compose', '-p', name, 'down', '--remove-orphans'],
-                  { silent: true },
-                );
-                if (composeDown.success) continue;
-                const stopResult = await exec(
-                  'docker',
-                  ['stop', '-t', '30', name],
-                  { silent: true },
-                );
-                if (stopResult.success) continue;
-                const stderr = `${stopResult.stderr ?? ''}`.toLowerCase();
-                const looksMissing =
-                  stderr.includes('no such container') ||
-                  stderr.includes('not found');
-                if (!looksMissing) {
-                  throw new Error(
-                    `Failed to stop '${name}' before migration: ${stopResult.stderr?.trim() || 'unknown error'}`,
-                  );
-                }
-              }
-            },
-          },
-        );
-        if (!migrationResult.proceed) {
-          logger.info('Aborting deploy until migrations are approved.');
-          return;
-        }
-      }
+      // (Auto-migration framework removed — `tale migrate config-layout` is
+      // the only opt-in, manually-run migration now.)
 
       // Check if this is a first-time deployment
       const currentColor = await getCurrentColor(env.DEPLOY_DIR);
@@ -628,6 +584,16 @@ export async function deploy(options: DeployOptions): Promise<void> {
         tempStageDirs,
         options.override ?? false,
       );
+
+      // After deploy + optional host-push, trigger server-side reseed of
+      // builtin catalog into every org. Runs against the platform container
+      // (which holds the convex function source + admin key derivation).
+      if (options.overrideAll) {
+        await reseedAllOrgsFromBuiltin({
+          dryRun,
+          assumeYes: options.assumeYes ?? false,
+        });
+      }
     });
   } finally {
     process.removeListener('SIGINT', onInterrupt);
@@ -635,20 +601,60 @@ export async function deploy(options: DeployOptions): Promise<void> {
   }
 }
 
-// Host workspace dirs that `tale deploy --override` pushes into the convex
-// container. `*.secrets.json` files and `.history/` directories are always
-// excluded from the push (see `stageForOverride`): encrypted secrets cannot be
-// re-derived from the host, and the container's UI edit-history trail must
-// survive. `docker cp` is additive, so anything not staged is left untouched
-// on the container side.
-const SYNC_DIRS = [
+// Org slug shape — must match validateOrgSlug at services/platform/convex/lib/file_io.ts.
+// Duplicated here because the CLI ships in a single compiled binary that does
+// not import convex sources at runtime.
+const ORG_SLUG_REGEX = /^[a-z0-9][a-z0-9_-]*$/;
+const MAX_ORG_SLUG_LENGTH = 64;
+
+// Top-level names under the project root that are legitimate per-domain
+// dirs from the OLD flat layout (`agents/`, `workflows/`, …). Under
+// org-first these don't belong at the root anymore — if any are present
+// it's a legacy project that hasn't been re-init'd. Refuse to push (would
+// silently land in `/app/data/agents/` etc., which the new resolvers don't
+// read) and point the operator at `tale init --force`.
+const LEGACY_DOMAIN_DIR_NAMES = new Set([
   'agents',
   'workflows',
   'integrations',
   'branding',
   'providers',
   'skills',
-];
+  'retention',
+]);
+
+function isValidOrgSlug(name: string): boolean {
+  return (
+    name.length > 0 &&
+    name.length <= MAX_ORG_SLUG_LENGTH &&
+    ORG_SLUG_REGEX.test(name)
+  );
+}
+
+async function findOrgDirs(
+  projectDir: string,
+): Promise<{ orgDirs: string[]; legacyDirs: string[] }> {
+  const orgDirs: string[] = [];
+  const legacyDirs: string[] = [];
+  let entries: import('node:fs').Dirent[];
+  try {
+    entries = await readdir(projectDir, { withFileTypes: true });
+  } catch {
+    return { orgDirs, legacyDirs };
+  }
+  for (const entry of entries) {
+    if (!entry.isDirectory()) continue;
+    const name = entry.name;
+    if (name.startsWith('.')) continue; // skips .tale, .git, .vscode, .DS_Store etc.
+    if (LEGACY_DOMAIN_DIR_NAMES.has(name)) {
+      legacyDirs.push(name);
+      continue;
+    }
+    if (!isValidOrgSlug(name)) continue;
+    orgDirs.push(name);
+  }
+  return { orgDirs, legacyDirs };
+}
 
 async function syncProjectFiles(
   containerName: string,
@@ -672,11 +678,24 @@ async function syncProjectFiles(
     return;
   }
 
-  const dirsToSync = SYNC_DIRS.filter((dir) =>
-    existsSync(join(projectDir, dir)),
-  );
+  const { orgDirs, legacyDirs } = await findOrgDirs(projectDir);
+
+  if (legacyDirs.length > 0) {
+    logger.error(
+      `${prefix}Legacy flat layout detected at project root (${legacyDirs.join(', ')}/).`,
+    );
+    logger.info(
+      `${prefix}  Move config under 'default/<domain>/' (or run 'tale init --force' to rescaffold).`,
+    );
+    logger.info(`${prefix}  Aborting --override push.`);
+    return;
+  }
 
-  if (dirsToSync.length === 0) {
+  if (orgDirs.length === 0) {
+    logger.blank();
+    logger.info(
+      `${prefix}Nothing to push: no org directories found at host root (expected e.g. 'default/').`,
+    );
     return;
   }
 
@@ -691,86 +710,115 @@ async function syncProjectFiles(
   }
 
   logger.blank();
-  logger.step(`${prefix}Overriding container config from host workspace...`);
+  logger.step(
+    `${prefix}Overriding container config from host workspace (1:1 push)...`,
+  );
   logger.info(
     `${prefix}  (encrypted *.secrets.json and .history/ are always preserved)`,
   );
+  logger.info(
+    `${prefix}  (--override is an additive overlay; files deleted locally remain in the container — use --override-all to factory-reseed from builtin)`,
+  );
 
-  for (const dir of dirsToSync) {
-    const srcPath = join(projectDir, dir);
+  // Stage the full set of org subtrees into a single tmp dir whose top-level
+  // mirrors the in-container `/app/data/` shape: `<stage>/<org>/<domain>/...`.
+  // Then a single `docker cp <stage>/. <container>:/app/data/` does the push.
+  // Root-level junk (`tale.json`, `.tale/`, `.env`, `.git/`, IDE configs, etc.)
+  // is excluded by allowlist — never staged, never shipped.
+  const stageDir = await mkdtemp(join(tmpdir(), 'tale-sync-'));
+  tempStageDirs.add(stageDir);
+
+  try {
+    for (const orgName of orgDirs) {
+      const orgSrc = join(projectDir, orgName);
+      const orgDst = join(stageDir, orgName);
+
+      if (dryRun) {
+        logger.info(
+          `${prefix}Would push ${orgName}/ → ${containerName}:/app/data/${orgName}/ (excluding *.secrets.json, .history/, symlinks)`,
+        );
+        continue;
+      }
+
+      await stageOrgIntoDir(orgSrc, orgDst);
+    }
 
     if (dryRun) {
       logger.info(
-        `${prefix}Would override ${dir}/ → ${containerName}:/app/data/${dir}/ (excluding *.secrets.json and .history/)`,
+        `${prefix}Skipped at root: tale.json, .tale/, .env, .git/, dotfiles, ${legacyDirs.length ? `legacy ${legacyDirs.join(', ')}/, ` : ''}any other non-org-shaped entries`,
       );
-      continue;
+      return;
     }
 
-    const stageDir = await stageForOverride(srcPath, tempStageDirs);
-
-    try {
-      const dockerSrcPath = stageDir.replaceAll('\\', '/');
-      const result = await exec('docker', [
-        'cp',
-        `${dockerSrcPath}/.`,
-        `${containerName}:/app/data/${dir}/`,
-      ]);
-
-      if (result.success) {
-        // docker cp copies files as root — fix ownership so the app user can write
-        const chownResult = await exec('docker', [
-          'exec',
-          containerName,
-          'chown',
-          '-R',
-          'app:app',
-          `/app/data/${dir}/`,
-        ]);
-        if (!chownResult.success) {
-          logger.warn(
-            `Failed to fix ownership for ${dir}/: ${chownResult.stderr}`,
-          );
-        }
-        logger.info(`Overrode ${dir}/`);
-      } else {
-        logger.warn(`Failed to override ${dir}/: ${result.stderr}`);
-      }
-    } finally {
-      tempStageDirs.delete(stageDir);
-      await rm(stageDir, { recursive: true, force: true });
+    const dockerSrcPath = stageDir.replaceAll('\\', '/');
+    const result = await exec('docker', [
+      'cp',
+      `${dockerSrcPath}/.`,
+      `${containerName}:/app/data/`,
+    ]);
+
+    if (!result.success) {
+      logger.error(`Failed to override config: ${result.stderr}`);
+      return;
     }
-  }
 
-  if (!dryRun) {
-    logger.success('Config override complete');
+    // docker cp copies files as root — fix ownership so the app user can write
+    const chownResult = await exec('docker', [
+      'exec',
+      containerName,
+      'chown',
+      '-R',
+      'app:app',
+      `/app/data/`,
+    ]);
+    if (!chownResult.success) {
+      logger.warn(
+        `Failed to fix ownership on /app/data: ${chownResult.stderr}`,
+      );
+    }
+
+    logger.success(
+      `Overrode ${orgDirs.length} org${orgDirs.length === 1 ? '' : 's'}: ${orgDirs.join(', ')}`,
+    );
+  } finally {
+    tempStageDirs.delete(stageDir);
+    await rm(stageDir, { recursive: true, force: true });
   }
 }
 
-// Copy host `<dir>/` into a fresh tmp dir, excluding `*.secrets.json` files
-// and any `.history/` directory during the copy. The staging dir is what
-// `docker cp` ships; since `docker cp` is additive (never deletes container
-// files absent from the source), excluded paths simply never reach the
-// container and its existing secrets / edit-history survive. fs.cp defaults to
-// dereference=false, which keeps symlinks intact. The `*.secrets.json` match
-// mirrors the entrypoint's seed-skip check (services/convex/docker-entrypoint.sh).
+// Copy a host org subtree (`<projectDir>/<orgName>/`) into a fresh
+// `<stageDir>/<orgName>/` while:
+//   - skipping `.history/` directories at any depth (UI edit-history trail
+//     must survive in the container; `docker cp` is additive so absent =
+//     preserved on the container side),
+//   - skipping `*.secrets.json` files at any depth (encrypted secrets
+//     cannot be re-derived from the host),
+//   - skipping symlinks (defense against operator's host workspace
+//     containing a symlink to /etc/passwd or similar; cp's filter receives
+//     the source path so we lstat it).
 //
-// Registers `stageDir` in `tempStageDirs` before any I/O so an interrupt or a
-// throw mid-copy still gets cleaned up by the caller / SIGINT handler.
-async function stageForOverride(
-  srcDir: string,
-  tempStageDirs: Set<string>,
-): Promise<string> {
-  const stageDir = await mkdtemp(join(tmpdir(), 'tale-sync-'));
-  tempStageDirs.add(stageDir);
-  await cp(srcDir, stageDir, {
+// All directory exclusions prune the entire subtree; `fs.cp` recurses past
+// the filter for any directory the filter returned `true` for. Root-level
+// non-org junk (`.tale/`, `.git/`, `.env`, IDE configs, dotfiles, etc.) is
+// excluded one level up — only org-shaped dirs from `findOrgDirs` reach
+// this function — so the filter here only handles depth-1+ skips.
+async function stageOrgIntoDir(srcDir: string, destDir: string): Promise<void> {
+  await cp(srcDir, destDir, {
     recursive: true,
     filter: (src) => {
       const base = src.split(/[\\/]/).pop() ?? '';
-      // Returning false for a directory prunes its entire subtree.
       if (base === '.history') return false;
       if (base.endsWith('.secrets.json')) return false;
+      // lstat is sync here because fs.cp's filter is sync. Symlinks at
+      // any depth are skipped; missing entries (ENOENT) also skip rather
+      // than throw — fs.cp re-races stat() so any race is benign.
+      try {
+        const info = lstatSync(src);
+        if (info.isSymbolicLink()) return false;
+      } catch {
+        return false;
+      }
       return true;
     },
   });
-  return stageDir;
 }
diff --git a/tools/cli/src/lib/actions/init.ts b/tools/cli/src/lib/actions/init.ts
index b209300215..4cffcbdc8c 100644
--- a/tools/cli/src/lib/actions/init.ts
+++ b/tools/cli/src/lib/actions/init.ts
@@ -19,8 +19,6 @@ import {
 } from '../project/types';
 import { writeProject } from '../project/write-project';
 import { generateAllRules } from '../rules/generators';
-import { MIGRATIONS } from '../upgrade/registry';
-import { writeMigrationsState } from '../upgrade/state';
 
 interface InitOptions {
   directory?: string;
@@ -31,7 +29,9 @@ interface InitOptions {
 const GITIGNORE_ENTRIES = [
   '.tale/',
   '.env',
-  '.history/',
+  // History dirs sit at any depth under the org-first tree
+  // (e.g. `default/agents/.history/<slug>/`); use a recursive glob.
+  '**/.history/',
   'compose.override.yml',
   'compose.override.yaml',
   // Provider API keys — SOPS-encrypted when SOPS_AGE_KEY is set, plaintext
@@ -133,26 +133,35 @@ export async function init(options: InitOptions): Promise<void> {
   await mkdir(join(target, '.tale'), { recursive: true });
   await fetchReference(target);
 
+  // Host workspace mirrors the uniform org-first layout: scaffold under
+  // `default/<domain>/...`. The default org is the canonical template;
+  // operators can add `<otherOrg>/<domain>/...` subtrees alongside and
+  // `tale deploy --override` will push each `<org>` it finds at root.
+  const defaultOrgDir = join(target, 'default');
+
   // Copy agents from embedded examples
   logger.step('Copying agent configurations...');
   const agentFiles = getEmbeddedExamples('agents');
-  await writeEmbeddedFiles(agentFiles, join(target, 'agents'));
+  await writeEmbeddedFiles(agentFiles, join(defaultOrgDir, 'agents'));
 
   // Copy workflows from embedded examples
   logger.step('Copying workflow configurations...');
   const workflowFiles = getEmbeddedExamples('workflows');
-  await writeEmbeddedFiles(workflowFiles, join(target, 'workflows'));
+  await writeEmbeddedFiles(workflowFiles, join(defaultOrgDir, 'workflows'));
 
   // Copy integrations from embedded examples
   logger.step('Copying integration configurations...');
   const integrationFiles = getEmbeddedExamples('integrations');
-  await writeEmbeddedFiles(integrationFiles, join(target, 'integrations'));
+  await writeEmbeddedFiles(
+    integrationFiles,
+    join(defaultOrgDir, 'integrations'),
+  );
 
   // Create branding directory with empty config
   logger.step('Creating branding configuration...');
-  await mkdir(join(target, 'branding', 'images'), { recursive: true });
-  await writeFile(join(target, 'branding', 'branding.json'), '{}\n');
-  await writeFile(join(target, 'branding', 'images', '.gitkeep'), '');
+  await mkdir(join(defaultOrgDir, 'branding', 'images'), { recursive: true });
+  await writeFile(join(defaultOrgDir, 'branding', 'branding.json'), '{}\n');
+  await writeFile(join(defaultOrgDir, 'branding', 'images', '.gitkeep'), '');
 
   // Copy provider configs (public JSON only, not encrypted secrets)
   logger.step('Copying provider configurations...');
@@ -163,33 +172,55 @@ export async function init(options: InitOptions): Promise<void> {
       providerConfigFiles.set(relPath, content);
     }
   }
-  await writeEmbeddedFiles(providerConfigFiles, join(target, 'providers'));
+  await writeEmbeddedFiles(
+    providerConfigFiles,
+    join(defaultOrgDir, 'providers'),
+  );
 
   // Copy skills from embedded examples
   logger.step('Copying skill bundles...');
   const skillFiles = getEmbeddedExamples('skills');
-  await writeEmbeddedFiles(skillFiles, join(target, 'skills'));
+  await writeEmbeddedFiles(skillFiles, join(defaultOrgDir, 'skills'));
 
-  // Compute checksums
+  // Compute checksums. Paths are recorded relative to the project root,
+  // matching where the files actually live (default/<domain>/...).
   logger.step('Computing file checksums...');
   const allFiles = new Map<string, string>();
 
   for (const [relPath, content] of agentFiles) {
-    allFiles.set(join('agents', relPath), computeContentHash(content));
+    allFiles.set(
+      join('default', 'agents', relPath),
+      computeContentHash(content),
+    );
   }
   for (const [relPath, content] of workflowFiles) {
-    allFiles.set(join('workflows', relPath), computeContentHash(content));
+    allFiles.set(
+      join('default', 'workflows', relPath),
+      computeContentHash(content),
+    );
   }
   for (const [relPath, content] of integrationFiles) {
-    allFiles.set(join('integrations', relPath), computeContentHash(content));
+    allFiles.set(
+      join('default', 'integrations', relPath),
+      computeContentHash(content),
+    );
   }
   for (const [relPath, content] of providerConfigFiles) {
-    allFiles.set(join('providers', relPath), computeContentHash(content));
+    allFiles.set(
+      join('default', 'providers', relPath),
+      computeContentHash(content),
+    );
   }
   for (const [relPath, content] of skillFiles) {
-    allFiles.set(join('skills', relPath), computeContentHash(content));
+    allFiles.set(
+      join('default', 'skills', relPath),
+      computeContentHash(content),
+    );
   }
-  allFiles.set(join('branding', 'branding.json'), computeContentHash('{}\n'));
+  allFiles.set(
+    join('default', 'branding', 'branding.json'),
+    computeContentHash('{}\n'),
+  );
 
   const checksums: Checksums = {
     cliVersion: pkg.version,
@@ -220,25 +251,9 @@ export async function init(options: InitOptions): Promise<void> {
   // Make the ID available to subsequent steps (ensureEnv uses getProjectId()).
   setProjectId(projectId);
 
-  // Seed `.tale/migrations.json` for fresh projects so historical migrations
-  // never apply to data that was born in the current CLI's schema. Without
-  // this, a `tale init` in a directory where the host still has legacy
-  // `tale_*` volumes from some older project would trigger namespace-volumes
-  // to copy that unrelated data into the new project's namespace.
-  //
-  // Only seed when there was no existing tale.json AND no migrations.json
-  // already present — reinit must not clobber prior state.
-  const migrationsJsonPath = join(target, '.tale', 'migrations.json');
-  if (existingProject === null && !existsSync(migrationsJsonPath)) {
-    const now = new Date().toISOString();
-    await writeMigrationsState(target, {
-      applied: MIGRATIONS.map((m) => ({
-        id: m.id,
-        at: now,
-        cliVersion: pkg.version,
-      })),
-    });
-  }
+  // (`.tale/migrations.json` seeding removed alongside the auto-migration
+  // framework. Existing projects' stale files are harmless and can be
+  // deleted manually.)
 
   // Write AI rules files
   logger.step('Writing AI rules files...');
@@ -266,7 +281,12 @@ export async function init(options: InitOptions): Promise<void> {
     // encrypted-vs-plaintext mode is a runtime save-path decision, not an
     // init-time choice.
     if (envResult.openrouterKey && envResult.agePublicKey) {
-      const secretsPath = join(target, 'providers', 'openrouter.secrets.json');
+      const secretsPath = join(
+        target,
+        'default',
+        'providers',
+        'openrouter.secrets.json',
+      );
       const { sopsEncryptJson } = await import('../crypto/sops-encrypt');
       const encrypted = await sopsEncryptJson(
         { apiKey: envResult.openrouterKey },
@@ -274,7 +294,7 @@ export async function init(options: InitOptions): Promise<void> {
       );
       await writeFile(secretsPath, encrypted);
       logger.success(
-        'Encrypted provider API key into providers/openrouter.secrets.json',
+        'Encrypted provider API key into default/providers/openrouter.secrets.json',
       );
     }
   }
@@ -301,7 +321,7 @@ export async function init(options: InitOptions): Promise<void> {
     logger.info(`  ${step++}. Run "cd ${target}" to enter your project`);
   }
   logger.info(
-    `  ${step++}. Edit agents/, workflows/, integrations/, skills/, and branding/ to customize your setup`,
+    `  ${step++}. Edit default/agents/, default/workflows/, default/integrations/, default/skills/, and default/branding/ to customize your setup`,
   );
   logger.info(
     `  ${step++}. Open the project in an AI-powered editor (Claude Code, Cursor, Copilot, or Windsurf) for guided config creation`,
@@ -309,15 +329,23 @@ export async function init(options: InitOptions): Promise<void> {
   logger.info(`  ${step++}. Run "tale start" to launch the platform locally`);
 }
 
+// Top-level markers indicating a Tale project. Under the uniform org-first
+// layout, `default/` is the canonical org dir (and any other org dir is
+// also a marker, but we don't try to enumerate slugs — `default/` is enough
+// to detect a project). Legacy per-domain dirs (`agents/`, `workflows/`,
+// etc.) at the root are kept as markers so `tale init` re-detects old
+// projects from a prior CLI version.
 const TALE_PROJECT_MARKERS = new Set([
   '.env',
   'tale.json',
+  '.tale',
+  'default',
+  // Legacy / pre-org-first markers (detected during reinit only):
   'providers',
   'agents',
   'workflows',
   'integrations',
   'skills',
-  '.tale',
   'branding',
 ]);
 
diff --git a/tools/cli/src/lib/actions/migrate-config-layout.ts b/tools/cli/src/lib/actions/migrate-config-layout.ts
new file mode 100644
index 0000000000..67e209e78e
--- /dev/null
+++ b/tools/cli/src/lib/actions/migrate-config-layout.ts
@@ -0,0 +1,104 @@
+/**
+ * `tale migrate config-layout` orchestration. Pipes the migrate-config-layout
+ * bash script into the currently-running convex container via stdin so the
+ * operator can run migrate FIRST (before redeploying with the new image).
+ *
+ * Uses cp (not mv) so old paths remain readable until the operator runs
+ * `tale migrate config-layout --cleanup-old` after verifying the new
+ * deployment is healthy. This is the rollback-insurance step.
+ *
+ * Runbook (2-step + optional cleanup):
+ *   1. tale migrate config-layout
+ *      (copies providers/*.secrets.json to new org-first paths;
+ *      old paths remain so the currently-running old code still works)
+ *   2. tale deploy --override-all -y
+ *      (recreates convex with new code + seeds non-default orgs from builtin)
+ *   3. (optional, after verifying health) tale migrate config-layout --cleanup-old
+ *      (sha-verifies new == old, then unlinks the olds)
+ */
+
+import { readFile } from 'node:fs/promises';
+import { dirname, join } from 'node:path';
+import { fileURLToPath } from 'node:url';
+
+import { getProjectId } from '../../utils/load-env';
+import * as logger from '../../utils/logger';
+import { exec } from '../docker/exec';
+import { isContainerRunning } from '../docker/is-container-running';
+
+export interface MigrateConfigLayoutOptions {
+  dryRun: boolean;
+  cleanupOld: boolean;
+}
+
+/**
+ * Read the migrate script next to this module. The .sh file is the source
+ * of truth (also runnable in the shell-script integration harness), and
+ * Bun's source-file colocation makes runtime loading work in both `bun
+ * run`-from-source and the compiled binary (Bun bundles imported assets).
+ */
+async function loadScript(): Promise<string> {
+  const moduleDir = dirname(fileURLToPath(import.meta.url));
+  const scriptPath = join(
+    moduleDir,
+    '..',
+    'migrate-config-layout',
+    'script.sh',
+  );
+  return await readFile(scriptPath, 'utf-8');
+}
+
+export async function migrateConfigLayout(
+  options: MigrateConfigLayoutOptions,
+): Promise<void> {
+  const { dryRun, cleanupOld } = options;
+
+  const containerName = `${getProjectId()}-convex`;
+  if (!(await isContainerRunning(containerName))) {
+    throw new Error(
+      `Convex container "${containerName}" is not running. ` +
+        'Start the platform first (e.g. `tale deploy`) before running this migration.',
+    );
+  }
+
+  const script = await loadScript();
+
+  const scriptArgs: string[] = [];
+  if (dryRun) scriptArgs.push('--dry-run');
+  if (cleanupOld) scriptArgs.push('--cleanup-old');
+
+  logger.blank();
+  if (cleanupOld) {
+    logger.step(
+      dryRun
+        ? '[DRY-RUN] Cleanup-old: would verify and remove old-path secrets'
+        : 'Verifying + removing old-path secrets (sha-matched against new paths)...',
+    );
+  } else {
+    logger.step(
+      dryRun
+        ? '[DRY-RUN] Migrate: would cp providers/*.secrets.json to new org-first paths'
+        : 'Copying providers/*.secrets.json to new org-first paths (old paths preserved for rollback)...',
+    );
+  }
+
+  // `docker exec -i ... bash -s -- <args>` runs the script piped via
+  // stdin; the `--` separates script args from bash's own flags.
+  const result = await exec(
+    'docker',
+    ['exec', '-i', containerName, 'bash', '-s', '--', ...scriptArgs],
+    { stdin: script },
+  );
+
+  if (result.stdout) logger.info(result.stdout);
+  if (!result.success) {
+    if (result.stderr) logger.error(result.stderr.trim());
+    throw new Error(
+      `tale migrate config-layout${cleanupOld ? ' --cleanup-old' : ''} failed (exit code ${result.exitCode}).`,
+    );
+  }
+  if (result.stderr) {
+    // Warnings printed to stderr (e.g. SKIP messages) are not fatal but worth surfacing.
+    logger.warn(result.stderr.trim());
+  }
+}
diff --git a/tools/cli/src/lib/actions/reseed-all-orgs.ts b/tools/cli/src/lib/actions/reseed-all-orgs.ts
new file mode 100644
index 0000000000..170c819a19
--- /dev/null
+++ b/tools/cli/src/lib/actions/reseed-all-orgs.ts
@@ -0,0 +1,116 @@
+/**
+ * `tale deploy --override-all` orchestration: invoke the convex-side
+ * `reseedAllOrgsFromBuiltin` action via `docker exec` into the running
+ * platform container. Mirrors the proven incantation pattern from
+ * scripts/2026-03-28-migrate-convex-data.sh:120-131 (source env.sh,
+ * ensure_instance_secret, compute admin key inline, run convex CLI).
+ *
+ * Destructive: factory-reseeds every org's non-secret config from the
+ * builtin catalog. `*.secrets.json` files and `.history/` trails are
+ * preserved server-side by `scaffoldNewOrganization({override:true})`.
+ * Uploaded branding `images/` survive (branding is treated as a tree
+ * with per-file overwrite). Everything else under each `<org>/<domain>/`
+ * is overwritten with builtin content.
+ */
+
+import { confirm } from '../../utils/confirm';
+import * as logger from '../../utils/logger';
+import { exec } from '../docker/exec';
+import { findPlatformContainer } from '../docker/find-platform-container';
+
+export interface ReseedAllOrgsOptions {
+  dryRun: boolean;
+  assumeYes: boolean;
+}
+
+/**
+ * The bash script piped into the platform container. Adopts the proven
+ * env-sourcing pattern from scripts/2026-03-28-migrate-convex-data.sh so
+ * `INSTANCE_SECRET` is guaranteed populated and the admin key derivation
+ * matches the entrypoint's own runtime computation.
+ *
+ * Runtime workdir is `/app` (services/platform/Dockerfile sets
+ * `WORKDIR /app`; flattens services/platform/{convex,lib,env.sh,…} into
+ * `/app/`). No `cd /app/services/platform` — that path does not exist
+ * at runtime.
+ */
+const RESEED_SCRIPT = `set -eo pipefail
+source /app/env.sh
+env_normalize_common
+source /app/generate-admin-key.sh
+ensure_instance_secret
+ADMIN_KEY=$(generate_key "$INSTANCE_NAME" "$INSTANCE_SECRET")
+cd /app
+HOME=/home/app timeout 1800 bunx convex run \\
+  organizations/reseed_all_orgs:reseedAllOrgsFromBuiltin \\
+  --url "\${CONVEX_URL:-http://convex:3210}" \\
+  --admin-key "$ADMIN_KEY"
+`;
+
+const CONFIRM_MESSAGE =
+  '--override-all will factory-reset every org from the builtin catalog. ' +
+  '*.secrets.json files, .history/ trails, and uploaded branding/images/ are preserved; ' +
+  'all other config (model lists, agents, workflows, skills, integrations, branding.json, retention.json) ' +
+  'is overwritten. Proceed?';
+
+export async function reseedAllOrgsFromBuiltin(
+  options: ReseedAllOrgsOptions,
+): Promise<void> {
+  const { dryRun, assumeYes } = options;
+
+  // Gate non-interactive callers behind --yes to avoid silent abort in CI.
+  const isTty = Boolean(process.stdin.isTTY);
+  if (!assumeYes && !isTty) {
+    throw new Error(
+      '--override-all requires --yes (-y) when stdin is not a TTY (e.g. CI).',
+    );
+  }
+  if (!assumeYes && isTty) {
+    const ok = await confirm(CONFIRM_MESSAGE);
+    if (!ok) {
+      logger.info('Aborted by user.');
+      return;
+    }
+  }
+
+  const container = await findPlatformContainer();
+
+  if (dryRun) {
+    logger.blank();
+    logger.info('[DRY-RUN] Would run:');
+    logger.info(`  docker exec ${container} bash -lc '<reseed script>'`);
+    logger.info('Reseed script body (would be piped into bash):');
+    for (const line of RESEED_SCRIPT.split('\n')) {
+      logger.info(`  ${line}`);
+    }
+    return;
+  }
+
+  logger.blank();
+  logger.step('Reseeding builtin catalog into all orgs...');
+
+  // Pipe the script via stdin instead of embedding in argv — avoids shell
+  // escaping pitfalls and keeps the script source readable.
+  const result = await exec('docker', ['exec', '-i', container, 'bash', '-s'], {
+    stdin: RESEED_SCRIPT,
+  });
+
+  if (!result.success) {
+    if (result.stderr) {
+      logger.error(result.stderr.trim());
+    }
+    throw new Error(
+      `--override-all failed (docker exec into ${container} returned non-zero).`,
+    );
+  }
+
+  // The action's return value is printed to stdout by `bunx convex run`.
+  if (result.stdout) {
+    const trimmed = result.stdout.trim();
+    if (trimmed) {
+      logger.info(trimmed);
+    }
+  }
+
+  logger.success('Reseed complete.');
+}
diff --git a/tools/cli/src/lib/actions/start.ts b/tools/cli/src/lib/actions/start.ts
index 634233cc94..47cdecdf9a 100644
--- a/tools/cli/src/lib/actions/start.ts
+++ b/tools/cli/src/lib/actions/start.ts
@@ -16,8 +16,6 @@ import { exec } from '../docker/exec';
 import { findProject } from '../project/find-project';
 import { resolveOrAssignProjectContext } from '../project/project-context';
 import { withLock } from '../state/with-lock';
-import { MIGRATIONS } from '../upgrade/registry';
-import { runPendingMigrations } from '../upgrade/runner';
 import { init } from './init';
 
 async function assertDockerAvailable(): Promise<void> {
@@ -122,8 +120,6 @@ interface StartOptions {
   detach?: boolean;
   port?: number;
   host?: string;
-  /** Non-interactive acceptance of any pending migrations (mirrors deploy). */
-  assumeYes?: boolean;
 }
 
 export async function start(options: StartOptions): Promise<void> {
@@ -154,67 +150,15 @@ export async function start(options: StartOptions): Promise<void> {
   await assertDockerAvailable();
 
   // Resolve project ID from tale.json before any Docker-resource naming.
-  // Auto-assign an ID for legacy projects so users don't have to run
-  // `tale upgrade` as a separate step before `tale start` works.
   await resolveOrAssignProjectContext(projectDir);
 
-  // Detect and apply any pending migrations, then ensure dev infrastructure,
-  // all under a project-scoped lock so parallel `tale start` / `tale deploy`
-  // shells can't race on docker volumes or migrations.json. The lock is
-  // released before `docker compose up` starts — holding it for the full
-  // foreground lifetime of compose would block every other tale command.
+  // Ensure dev infrastructure under a project-scoped lock so parallel
+  // `tale start` / `tale deploy` shells can't race on docker volumes.
+  // The lock is released before `docker compose up` starts — holding it
+  // for the full foreground lifetime of compose would block every other
+  // tale command.
   const devPrefix = `${getProjectId()}-dev_`;
   await withLock(projectDir, 'start', async () => {
-    const migrationResult = await runPendingMigrations(
-      MIGRATIONS,
-      { projectId: getProjectId(), projectDir },
-      {
-        context: 'start',
-        assumeYes: options.assumeYes,
-        async performStops(stops) {
-          // `stops` is the union of compose project names (e.g. legacy
-          // 'tale-dev') and individual container names (e.g.
-          // '${projectId}-dev-platform-blue'). Try each as a compose project
-          // first, then fall back to `docker stop` for container names.
-          // Failures here MUST surface — a silently-swallowed stop can let
-          // the migration copy a live volume, corrupting data.
-          for (const name of stops) {
-            const composeDown = await exec(
-              'docker',
-              ['compose', '-p', name, 'down', '--remove-orphans'],
-              { silent: true },
-            );
-            if (composeDown.success) continue;
-            const stopResult = await exec(
-              'docker',
-              ['stop', '-t', '30', name],
-              {
-                silent: true,
-              },
-            );
-            if (stopResult.success) continue;
-            // Neither channel worked. If the container genuinely doesn't
-            // exist, `docker stop` produces a specific stderr we can match;
-            // any other failure is a hard abort so we don't proceed to
-            // `cp -a` against a live volume.
-            const stderr = `${stopResult.stderr ?? ''}`.toLowerCase();
-            const looksMissing =
-              stderr.includes('no such container') ||
-              stderr.includes('not found');
-            if (!looksMissing) {
-              throw new Error(
-                `Failed to stop '${name}' before migration: ${stopResult.stderr?.trim() || 'unknown error'}`,
-              );
-            }
-          }
-        },
-      },
-    );
-    if (!migrationResult.proceed) {
-      logger.info('Aborting start until migrations are approved.');
-      process.exit(2);
-    }
-
     // Pre-create dev volumes and network with explicit project-scoped names.
     // The dev compose file references them as external, so they must exist
     // before `docker compose up`.
diff --git a/tools/cli/src/lib/actions/update.ts b/tools/cli/src/lib/actions/update.ts
index 48c8a82f9b..11f4ad7fbb 100644
--- a/tools/cli/src/lib/actions/update.ts
+++ b/tools/cli/src/lib/actions/update.ts
@@ -21,8 +21,6 @@ import { readProject } from '../project/read-project';
 import type { Checksums } from '../project/types';
 import { writeProject } from '../project/write-project';
 import { generateAllRules } from '../rules/generators';
-import { MIGRATIONS } from '../upgrade/registry';
-import { planPendingMigrations } from '../upgrade/runner';
 
 interface UpdateOptions {
   force?: boolean;
@@ -206,21 +204,6 @@ export async function update(options: UpdateOptions): Promise<void> {
     );
   }
 
-  // Plan (but do NOT apply) any pending migrations so operators know what
-  // `tale start` / `tale deploy` will prompt them about next. Never stops
-  // containers or modifies Docker state from within `tale upgrade` itself —
-  // production deployments remain untouched.
-  if (!options.dryRun) {
-    const projectId = assignedId ?? project.id;
-    if (projectId) {
-      logger.blank();
-      const pending = await planPendingMigrations(MIGRATIONS, {
-        projectId,
-        projectDir,
-      });
-      if (pending.length === 0) {
-        logger.debug('No pending migrations.');
-      }
-    }
-  }
+  // (Auto-migration planning removed — `tale migrate config-layout` is the
+  // only opt-in, manually-run migration now; operators invoke it directly.)
 }
diff --git a/tools/cli/src/lib/docker/exec.ts b/tools/cli/src/lib/docker/exec.ts
index 8b47f241c5..aa1b33a857 100644
--- a/tools/cli/src/lib/docker/exec.ts
+++ b/tools/cli/src/lib/docker/exec.ts
@@ -10,19 +10,42 @@ export interface ExecResult {
 export async function exec(
   command: string,
   args: string[],
-  options: { cwd?: string; silent?: boolean; timeout?: number } = {},
+  options: {
+    cwd?: string;
+    silent?: boolean;
+    timeout?: number;
+    /**
+     * Pipe this string into the child's stdin and close. Required for the
+     * `docker exec -i <container> bash -s` pattern used by reseed/migrate.
+     */
+    stdin?: string;
+  } = {},
 ): Promise<ExecResult> {
-  const { cwd, silent = false, timeout } = options;
+  const { cwd, silent = false, timeout, stdin } = options;
 
   if (!silent) {
     logger.debug(`Executing: ${command} ${args.join(' ')}`);
   }
 
-  const proc = Bun.spawn([command, ...args], {
-    cwd,
-    stdout: 'pipe',
-    stderr: 'pipe',
-  });
+  const proc =
+    stdin === undefined
+      ? Bun.spawn([command, ...args], {
+          cwd,
+          stdout: 'pipe',
+          stderr: 'pipe',
+        })
+      : Bun.spawn([command, ...args], {
+          cwd,
+          stdin: 'pipe',
+          stdout: 'pipe',
+          stderr: 'pipe',
+        });
+
+  if (stdin !== undefined) {
+    const sink = (proc as Bun.Subprocess<'pipe', 'pipe', 'pipe'>).stdin;
+    sink.write(stdin);
+    await sink.end();
+  }
 
   const exitPromise = timeout
     ? Promise.race([
diff --git a/tools/cli/src/lib/migrate-config-layout/script.sh b/tools/cli/src/lib/migrate-config-layout/script.sh
new file mode 100644
index 0000000000..40f2f9a850
--- /dev/null
+++ b/tools/cli/src/lib/migrate-config-layout/script.sh
@@ -0,0 +1,173 @@
+#!/bin/bash
+# Migrate providers/*.secrets.json from old per-domain layout to new
+# org-first layout. Idempotent. Uses cp (not mv) so old paths remain
+# readable until the operator runs `tale migrate config-layout --cleanup-old`.
+#
+# Old → new mapping:
+#   $DATA/providers/<name>.secrets.json
+#     → $DATA/default/providers/<name>.secrets.json
+#   $DATA/providers/<org>/<name>.secrets.json
+#     → $DATA/<org>/providers/<name>.secrets.json
+#
+# Scope: providers/*.secrets.json ONLY. Non-secret config is reseeded by
+# `tale deploy --override-all` against the builtin catalog; non-provider
+# .history/ trails under old paths are intentionally abandoned (the user's
+# "secrets only" runbook trade-off).
+#
+# Designed to run against the CURRENTLY-running convex container (old
+# image, old code paths still active). cp leaves old paths in place so
+# old code keeps reading providers correctly until the operator runs
+# `tale deploy --override-all -y` to recreate convex with the new code.
+set -eo pipefail
+
+DRY_RUN=0
+CLEANUP_OLD=0
+for arg in "$@"; do
+  case "$arg" in
+    --dry-run) DRY_RUN=1 ;;
+    --cleanup-old) CLEANUP_OLD=1 ;;
+    *) echo "Unknown arg: $arg" >&2; exit 2 ;;
+  esac
+done
+
+DATA="${TALE_CONFIG_DIR:-/app/data}"
+APP_UID=1001
+APP_GID=1001
+
+planned=0
+copied=0
+skipped=0
+removed=0
+errors=0
+conflicts=()
+
+# Move a single .secrets.json from old to new path. cp -a preserves mode +
+# ownership (encrypted secrets are 0600 owner:app). Idempotent: if the
+# destination already exists, verify byte-for-byte equality (then skip)
+# rather than overwriting — protects a concurrent UI-side `atomicWriteSecret`
+# that landed at the new path between this script's check and its copy.
+copy_secret() {
+  local src="$1" dst="$2"
+  local dst_dir; dst_dir="$(dirname "$dst")"
+  if [ -e "$dst" ]; then
+    if cmp -s "$src" "$dst" 2>/dev/null; then
+      skipped=$((skipped+1)); echo "SKIP (already migrated): $src"
+      return 0
+    else
+      conflicts+=("$src ≠ $dst")
+      errors=$((errors+1))
+      echo "ERROR: $dst exists but differs from $src; refusing to overwrite" >&2
+      return 0
+    fi
+  fi
+  if [ "$DRY_RUN" = 1 ]; then
+    echo "MIGRATE_PLAN: mkdir -p $dst_dir && cp -a $src $dst"
+    planned=$((planned+1))
+    return 0
+  fi
+  mkdir -p "$dst_dir"
+  chown "$APP_UID:$APP_GID" "$dst_dir" 2>/dev/null || true
+  cp -a "$src" "$dst"
+  copied=$((copied+1))
+  echo "OK: $src -> $dst"
+}
+
+# Remove an old-path secret IF the new-path copy exists and matches
+# byte-for-byte. Refuses any mismatch — operator must reconcile manually.
+remove_old_secret() {
+  local old="$1" new="$2"
+  if [ ! -e "$old" ]; then return 0; fi
+  if [ ! -e "$new" ]; then
+    conflicts+=("missing new-path counterpart for $old (expected $new)")
+    errors=$((errors+1))
+    echo "ERROR: $new does not exist; refusing to remove $old" >&2
+    return 0
+  fi
+  if ! cmp -s "$old" "$new" 2>/dev/null; then
+    conflicts+=("$old ≠ $new")
+    errors=$((errors+1))
+    echo "ERROR: $old and $new differ; refusing to remove $old" >&2
+    return 0
+  fi
+  if [ "$DRY_RUN" = 1 ]; then
+    echo "CLEANUP_PLAN: rm $old"
+    planned=$((planned+1))
+    return 0
+  fi
+  rm -f "$old"
+  removed=$((removed+1))
+  echo "REMOVED: $old"
+}
+
+# ---------------------------------------------------------------------------
+# Enumeration
+# ---------------------------------------------------------------------------
+process_secret() {
+  local src="$1" dst="$2"
+  if [ "$CLEANUP_OLD" = 1 ]; then
+    remove_old_secret "$src" "$dst"
+  else
+    copy_secret "$src" "$dst"
+  fi
+}
+
+# Default org: top-level $DATA/providers/*.secrets.json → $DATA/default/providers/
+if [ -d "$DATA/providers" ]; then
+  for f in "$DATA"/providers/*.secrets.json; do
+    [ -f "$f" ] || continue
+    process_secret "$f" "$DATA/default/providers/$(basename "$f")"
+  done
+fi
+
+# Non-default orgs: $DATA/providers/<org>/*.secrets.json → $DATA/<org>/providers/
+if [ -d "$DATA/providers" ]; then
+  for d in "$DATA"/providers/*/; do
+    [ -d "$d" ] || continue
+    org="$(basename "$d")"
+    case "$org" in
+      .*) continue ;;
+    esac
+    # Validate against ORG_SLUG_REGEX (keep in sync with validateOrgSlug
+    # at services/platform/convex/lib/file_io.ts). Anything that doesn't
+    # match is skipped with a warning — defends against `.history` or
+    # future hidden markers leaking into the iteration.
+    if ! [[ "$org" =~ ^[a-z0-9][a-z0-9_-]{0,63}$ ]]; then
+      echo "SKIP (not a valid org slug): $org" >&2
+      skipped=$((skipped+1))
+      continue
+    fi
+    for f in "$d"*.secrets.json; do
+      [ -f "$f" ] || continue
+      process_secret "$f" "$DATA/$org/providers/$(basename "$f")"
+    done
+  done
+fi
+
+# ---------------------------------------------------------------------------
+# Summary
+# ---------------------------------------------------------------------------
+echo
+if [ "$CLEANUP_OLD" = 1 ]; then
+  if [ "$DRY_RUN" = 1 ]; then
+    echo "MIGRATE_SUMMARY: planned=$planned removed=0 errors=$errors (cleanup-old --dry-run)"
+  else
+    echo "MIGRATE_SUMMARY: removed=$removed errors=$errors (cleanup-old)"
+  fi
+else
+  if [ "$DRY_RUN" = 1 ]; then
+    echo "MIGRATE_SUMMARY: planned=$planned copied=0 skipped=$skipped errors=$errors (--dry-run)"
+  else
+    echo "MIGRATE_SUMMARY: copied=$copied skipped=$skipped errors=$errors"
+  fi
+  if [ "$copied" -gt 0 ] || [ "$planned" -gt 0 ]; then
+    echo "Next: run 'tale deploy --override-all -y' to recreate convex with the new code and seed non-default orgs."
+  fi
+fi
+if [ "${#conflicts[@]}" -gt 0 ]; then
+  echo
+  echo "Unresolved conflicts (require manual reconciliation):"
+  for c in "${conflicts[@]}"; do
+    echo "  - $c"
+  done
+fi
+[ "$errors" -eq 0 ] || exit 1
diff --git a/tools/cli/src/lib/project/fetch-reference.ts b/tools/cli/src/lib/project/fetch-reference.ts
index e9e70fde5b..89628449da 100644
--- a/tools/cli/src/lib/project/fetch-reference.ts
+++ b/tools/cli/src/lib/project/fetch-reference.ts
@@ -22,9 +22,15 @@ export async function fetchReference(projectDir: string): Promise<void> {
   }
 }
 
+/**
+ * Read a slice of the embedded `examples/default/<prefix>/...` tree as a
+ * map of `<rest>` → content. The catalog ships only the canonical `default`
+ * org's seed under `examples/default/`; the org-first layout repeats the
+ * same shape for any number of orgs at runtime.
+ */
 export function getEmbeddedExamples(prefix: string): Map<string, string> {
   const result = new Map<string, string>();
-  const examplesPrefix = `examples/${prefix}/`;
+  const examplesPrefix = `examples/default/${prefix}/`;
 
   for (const [path, content] of Object.entries(EMBEDDED_EXAMPLES)) {
     if (path.startsWith(examplesPrefix)) {
diff --git a/tools/cli/src/lib/upgrade/migrations/adopt-convex-stateful.ts b/tools/cli/src/lib/upgrade/migrations/adopt-convex-stateful.ts
deleted file mode 100644
index 8d0ac07835..0000000000
--- a/tools/cli/src/lib/upgrade/migrations/adopt-convex-stateful.ts
+++ /dev/null
@@ -1,67 +0,0 @@
-import * as logger from '../../../utils/logger';
-import { docker } from '../../docker/docker';
-import type { Migration, MigrationContext } from '../types';
-
-/**
- * Convex was previously emitted in the color compose file (blue/green project)
- * even though it is a singleton. This migration detects the existing convex
- * container under a color project and removes it so the stateful compose can
- * recreate it under the main project. The convex-data volume is external and
- * is not affected.
- */
-
-async function getContainerProjectLabel(
-  containerName: string,
-): Promise<string | null> {
-  const result = await docker(
-    'inspect',
-    '--format',
-    '{{index .Config.Labels "com.docker.compose.project"}}',
-    containerName,
-  );
-  if (!result.success) return null;
-  const label = result.stdout.trim();
-  return label || null;
-}
-
-export const adoptConvexStatefulMigration: Migration = {
-  id: 'adopt-convex-stateful',
-  introducedIn: '0.3.1',
-  description: (ctx: MigrationContext) =>
-    `Move ${ctx.projectId}-convex container from blue/green project scope to stateful project scope (${ctx.projectId}).`,
-
-  async detect(ctx: MigrationContext): Promise<boolean> {
-    const label = await getContainerProjectLabel(`${ctx.projectId}-convex`);
-    if (!label) return false; // container doesn't exist — fresh install
-    return label !== ctx.projectId; // needs migration if owned by a color project
-  },
-
-  async requiredStops(ctx: MigrationContext): Promise<string[]> {
-    const label = await getContainerProjectLabel(`${ctx.projectId}-convex`);
-    if (!label || label === ctx.projectId) return [];
-    return [`${ctx.projectId}-convex`];
-  },
-
-  async apply(ctx, { dryRun }) {
-    if (dryRun) return 'noop';
-
-    const containerName = `${ctx.projectId}-convex`;
-    const label = await getContainerProjectLabel(containerName);
-    if (!label || label === ctx.projectId) return 'noop';
-
-    logger.info(
-      `  Removing ${containerName} (owned by project "${label}") so it can be recreated under "${ctx.projectId}"`,
-    );
-    const result = await docker('rm', '-f', containerName);
-    if (!result.success) {
-      throw new Error(
-        `Failed to remove ${containerName}: ${result.stderr.trim()}`,
-      );
-    }
-
-    logger.info(
-      '  The convex-data volume is preserved. The container will be recreated by the next deploy.',
-    );
-    return 'applied';
-  },
-};
diff --git a/tools/cli/src/lib/upgrade/migrations/namespace-caddy-config.ts b/tools/cli/src/lib/upgrade/migrations/namespace-caddy-config.ts
deleted file mode 100644
index aeea94b558..0000000000
--- a/tools/cli/src/lib/upgrade/migrations/namespace-caddy-config.ts
+++ /dev/null
@@ -1,77 +0,0 @@
-import * as logger from '../../../utils/logger';
-import type { Migration, MigrationContext } from '../types';
-import {
-  copyVolumeWithVerify,
-  resolveMigrationImage,
-  volumeExists,
-  volumeHasData,
-} from '../volume-helpers';
-
-/**
- * Supplemental fix: `caddy-config` was accidentally omitted from PROD_VOLUMES
- * in the namespace-volumes migration, so `tale_caddy-config` was never copied
- * to `${projectId}_caddy-config` for existing production deployments.
- *
- * This migration uses the same idempotent end-state check: only copies if the
- * source has data and the destination is absent or empty.
- */
-
-const LEGACY_PROJECT_NAME = 'tale';
-
-export const namespaceCaddyConfigMigration: Migration = {
-  id: 'namespace-caddy-config',
-  introducedIn: '0.3.1',
-  description: (ctx: MigrationContext) =>
-    `Copy ${LEGACY_PROJECT_NAME}_caddy-config to ${ctx.projectId}_caddy-config (missed by namespace-volumes).`,
-
-  async detect(ctx: MigrationContext): Promise<boolean> {
-    const oldName = `${LEGACY_PROJECT_NAME}_caddy-config`;
-    const newName = `${ctx.projectId}_caddy-config`;
-
-    if (!(await volumeExists(oldName))) return false;
-
-    // If destination already has data, nothing to do.
-    const image = await resolveMigrationImage();
-    if (
-      (await volumeExists(newName)) &&
-      (await volumeHasData(newName, image))
-    ) {
-      return false;
-    }
-
-    return volumeHasData(oldName, image);
-  },
-
-  async requiredStops(): Promise<string[]> {
-    // Proxy is the only consumer of caddy-config and it lives in the
-    // stateful compose under the namespaced project. The legacy compose
-    // projects ('tale', etc.) were already torn down by namespace-volumes.
-    return [];
-  },
-
-  async apply(ctx, { dryRun }) {
-    if (dryRun) return 'noop';
-
-    const oldName = `${LEGACY_PROJECT_NAME}_caddy-config`;
-    const newName = `${ctx.projectId}_caddy-config`;
-
-    const image = await resolveMigrationImage();
-
-    // Re-check end-state (idempotent).
-    if (
-      (await volumeExists(newName)) &&
-      (await volumeHasData(newName, image))
-    ) {
-      return 'noop';
-    }
-    if (!(await volumeExists(oldName))) return 'noop';
-    if (!(await volumeHasData(oldName, image))) return 'noop';
-
-    logger.info(`  ${oldName} → ${newName}`);
-    await copyVolumeWithVerify(oldName, newName, image);
-
-    logger.info('Old volume preserved. After verifying, reclaim disk with:');
-    logger.info(`  docker volume rm ${oldName}`);
-    return 'applied';
-  },
-};
diff --git a/tools/cli/src/lib/upgrade/migrations/namespace-volumes.ts b/tools/cli/src/lib/upgrade/migrations/namespace-volumes.ts
deleted file mode 100644
index 36b0f3bdcb..0000000000
--- a/tools/cli/src/lib/upgrade/migrations/namespace-volumes.ts
+++ /dev/null
@@ -1,171 +0,0 @@
-import * as logger from '../../../utils/logger';
-import { docker } from '../../docker/docker';
-import type { Migration, MigrationContext } from '../types';
-import {
-  copyVolumeWithVerify,
-  resolveMigrationImage,
-  stopContainerOrThrow,
-  volumeExists,
-  volumeHasData,
-} from '../volume-helpers';
-
-/**
- * Pre-0.2.33 hard-coded project name. Volumes and containers from that era
- * were all prefixed with `tale_` / `tale-dev_` / `tale-blue_` / `tale-green_`
- * because `docker compose` used the fixed `-p tale` flag.
- */
-const LEGACY_PROJECT_NAME = 'tale';
-
-const DEV_VOLUMES = [
-  'platform-data',
-  'db-data',
-  'db-backup',
-  'rag-data',
-  'crawler-data',
-  'caddy-data',
-  'caddy-config',
-];
-const PROD_VOLUMES = [
-  'platform-data',
-  'caddy-data',
-  'caddy-config',
-  'rag-data',
-  'crawler-data',
-  'db-data',
-  'db-backup',
-];
-
-function buildPairs(
-  projectId: string,
-): Array<{ oldName: string; newName: string }> {
-  const pairs: Array<{ oldName: string; newName: string }> = [];
-  for (const v of DEV_VOLUMES) {
-    pairs.push({
-      oldName: `${LEGACY_PROJECT_NAME}-dev_${v}`,
-      newName: `${projectId}-dev_${v}`,
-    });
-  }
-  for (const v of PROD_VOLUMES) {
-    pairs.push({
-      oldName: `${LEGACY_PROJECT_NAME}_${v}`,
-      newName: `${projectId}_${v}`,
-    });
-  }
-  return pairs;
-}
-
-async function findRunningLegacyContainers(): Promise<string[]> {
-  const r = await docker(
-    'ps',
-    '--filter',
-    'name=tale-',
-    '--format',
-    '{{.Names}}',
-  );
-  if (!r.success) return [];
-  const legacyPattern =
-    /^tale(-(dev|blue|green))?-(platform|db|rag|crawler|proxy)(-(blue|green))?$/;
-  return r.stdout
-    .split('\n')
-    .map((l) => l.trim())
-    .filter((name) => name && legacyPattern.test(name));
-}
-
-/** Pairs where the end-state does NOT yet hold and there is something to copy.
- *
- *  End-state for this migration: the destination volume exists and has data.
- *  Therefore a pair is pending iff:
- *    - destination is absent or empty, AND
- *    - source exists and has data.
- *
- *  If the destination already has data we always skip — regardless of whether
- *  a sentinel is present, regardless of what legacy volumes sit on the host.
- *  This is the key idempotency guarantee: a project whose namespaced volumes
- *  were populated by the compose stack directly (v0.2.33+ fresh init, or a
- *  previous successful migration) must never be touched by this migration
- *  again, even if stray `tale-dev_*` volumes from unrelated installs exist. */
-async function findPending(
-  projectId: string,
-  image: string,
-): Promise<Array<{ oldName: string; newName: string }>> {
-  const all = buildPairs(projectId);
-  const pending: Array<{ oldName: string; newName: string }> = [];
-  for (const p of all) {
-    // End-state check first: if the destination already has data, this pair
-    // is satisfied. We do not trust, nor require, the sentinel here — a
-    // populated destination that predates the migration infrastructure
-    // (v0.2.33 fresh inits) will legitimately lack one.
-    if (
-      (await volumeExists(p.newName)) &&
-      (await volumeHasData(p.newName, image))
-    ) {
-      continue;
-    }
-    // Destination is absent or empty. Only migrate if there's actual source
-    // data to copy — an empty source would just recreate an empty dst.
-    if (!(await volumeExists(p.oldName))) continue;
-    if (!(await volumeHasData(p.oldName, image))) continue;
-    pending.push(p);
-  }
-  return pending;
-}
-
-export const namespaceVolumesMigration: Migration = {
-  id: 'namespace-volumes',
-  introducedIn: '0.2.33',
-  description: (ctx: MigrationContext) =>
-    `Rename legacy Docker volumes (tale_* / tale-dev_*) to the per-project scope (${ctx.projectId}_*).`,
-
-  async detect(ctx: MigrationContext): Promise<boolean> {
-    // Cheap shortcut: if no legacy source volume exists anywhere on the host,
-    // there is nothing we could ever copy. Bail before pulling an image.
-    const all = buildPairs(ctx.projectId);
-    let anySourceExists = false;
-    for (const p of all) {
-      if (await volumeExists(p.oldName)) {
-        anySourceExists = true;
-        break;
-      }
-    }
-    if (!anySourceExists) return false;
-    // Otherwise defer to findPending — it applies the full end-state check
-    // per pair and is the single source of truth for "do we have work?".
-    const image = await resolveMigrationImage();
-    return (await findPending(ctx.projectId, image)).length > 0;
-  },
-
-  async requiredStops(): Promise<string[]> {
-    // Legacy compose project names we might need to bring down. These were
-    // the only names in use pre-0.2.33.
-    return ['tale', 'tale-blue', 'tale-green', 'tale-dev'];
-  },
-
-  async apply(ctx, { dryRun }) {
-    if (dryRun) return 'noop';
-
-    // Extra safety: never run while legacy containers are live. The runner
-    // should already have stopped them via requiredStops → performStops, but
-    // a running container here means the caller's stop logic didn't fully
-    // cover the surface.
-    const running = await findRunningLegacyContainers();
-    if (running.length > 0) {
-      // Stop them individually; if that fails, bail out loudly rather than
-      // copying over a live volume.
-      for (const name of running) await stopContainerOrThrow(name);
-    }
-
-    const image = await resolveMigrationImage();
-    const pending = await findPending(ctx.projectId, image);
-    if (pending.length === 0) return 'noop';
-
-    for (const { oldName, newName } of pending) {
-      logger.info(`  ${oldName} → ${newName}`);
-      await copyVolumeWithVerify(oldName, newName, image);
-    }
-
-    logger.info('Old volumes preserved. After verifying, reclaim disk with:');
-    const oldNames = pending.map((p) => p.oldName).join(' ');
-    logger.info(`  docker volume rm ${oldNames}`);
-    return 'applied';
-  },
-};
diff --git a/tools/cli/src/lib/upgrade/migrations/split-convex.ts b/tools/cli/src/lib/upgrade/migrations/split-convex.ts
deleted file mode 100644
index 7477e8dc1e..0000000000
--- a/tools/cli/src/lib/upgrade/migrations/split-convex.ts
+++ /dev/null
@@ -1,154 +0,0 @@
-import * as logger from '../../../utils/logger';
-import { docker } from '../../docker/docker';
-import type { Migration, MigrationContext } from '../types';
-import {
-  copyVolumeWithVerify,
-  resolveMigrationImage,
-  stopContainerOrThrow,
-  volumeExists,
-  volumeHasData,
-} from '../volume-helpers';
-
-interface SplitPair {
-  oldName: string;
-  newName: string;
-  scope: 'prod' | 'dev';
-}
-
-function buildPairs(projectId: string): SplitPair[] {
-  return [
-    {
-      oldName: `${projectId}_platform-data`,
-      newName: `${projectId}_convex-data`,
-      scope: 'prod',
-    },
-    {
-      oldName: `${projectId}-dev_platform-data`,
-      newName: `${projectId}-dev_convex-data`,
-      scope: 'dev',
-    },
-  ];
-}
-
-/** Pairs where the end-state does NOT yet hold and there is something to copy.
- *
- *  End-state: the new `*_convex-data` volume exists and has data.
- *
- *  A pair is pending iff the destination is absent or empty AND the old
- *  platform-data volume exists with data. We deliberately do not require a
- *  sentinel on the destination — a destination populated by the compose
- *  stack (e.g. a fresh install of a CLI that already ships the split layout)
- *  legitimately has no sentinel and must be left alone. */
-async function findPending(
-  projectId: string,
-  image: string,
-): Promise<SplitPair[]> {
-  const pending: SplitPair[] = [];
-  for (const p of buildPairs(projectId)) {
-    // End-state check: if dst already has data, this pair is satisfied.
-    if (
-      (await volumeExists(p.newName)) &&
-      (await volumeHasData(p.newName, image))
-    ) {
-      continue;
-    }
-    if (!(await volumeExists(p.oldName))) continue;
-    if (!(await volumeHasData(p.oldName, image))) continue;
-    pending.push(p);
-  }
-  return pending;
-}
-
-async function findContainersUsingPlatformData(
-  projectId: string,
-): Promise<string[]> {
-  // Match platform/convex containers under both prod and dev project scopes.
-  const prefixes = [`${projectId}-`, `${projectId}-dev-`];
-  const names: string[] = [];
-  for (const prefix of prefixes) {
-    const r = await docker(
-      'ps',
-      '-a',
-      '--filter',
-      `name=${prefix}`,
-      '--format',
-      '{{.Names}}',
-    );
-    if (!r.success) continue;
-    for (const raw of r.stdout.split('\n')) {
-      const n = raw.trim();
-      if (!n) continue;
-      if (!/(platform|convex)/.test(n)) continue;
-      if (!names.includes(n)) names.push(n);
-    }
-  }
-  return names;
-}
-
-export const splitConvexMigration: Migration = {
-  id: 'split-convex',
-  introducedIn: '0.3.0',
-  description: (ctx: MigrationContext) =>
-    `Copy ${ctx.projectId}_platform-data into ${ctx.projectId}_convex-data so the new dedicated Convex service can own its data volume.`,
-
-  async detect(ctx: MigrationContext): Promise<boolean> {
-    // Cheap check first: if no legacy platform-data volume exists at all,
-    // there is nothing we could ever copy — bail before pulling an image.
-    const pairs = buildPairs(ctx.projectId);
-    let anyOldExists = false;
-    for (const p of pairs) {
-      if (await volumeExists(p.oldName)) {
-        anyOldExists = true;
-        break;
-      }
-    }
-    if (!anyOldExists) return false;
-    const image = await resolveMigrationImage();
-    return (await findPending(ctx.projectId, image)).length > 0;
-  },
-
-  async requiredStops(ctx): Promise<string[]> {
-    // Individual container names, not compose project names — the runner
-    // passes these through to its caller's stop routine. `tale deploy` /
-    // `tale start` both issue `docker compose -p <project> down` for compose
-    // projects; for individual containers we still want them stopped, so we
-    // surface them verbatim and let the caller decide how to stop.
-    return findContainersUsingPlatformData(ctx.projectId);
-  },
-
-  async apply(ctx, { dryRun }) {
-    if (dryRun) return 'noop';
-
-    const image = await resolveMigrationImage();
-    const pending = await findPending(ctx.projectId, image);
-    if (pending.length === 0) return 'noop';
-
-    // Defensive: any platform/convex container that's still running at this
-    // point holds open file handles against the volume we're about to copy.
-    // The runner should have stopped them, but verify.
-    for (const name of await findContainersUsingPlatformData(ctx.projectId)) {
-      const inspect = await docker(
-        'inspect',
-        '--format',
-        '{{.State.Running}}',
-        name,
-      );
-      if (inspect.success && inspect.stdout.trim() === 'true') {
-        await stopContainerOrThrow(name);
-      }
-    }
-
-    for (const p of pending) {
-      logger.info(`  [${p.scope}] ${p.oldName} → ${p.newName}`);
-      await copyVolumeWithVerify(p.oldName, p.newName, image);
-    }
-
-    logger.info(
-      'Legacy platform-data volumes are preserved. After verifying the new convex service is healthy, reclaim disk with:',
-    );
-    for (const p of pending) {
-      logger.info(`  docker volume rm ${p.oldName}`);
-    }
-    return 'applied';
-  },
-};
diff --git a/tools/cli/src/lib/upgrade/registry.ts b/tools/cli/src/lib/upgrade/registry.ts
deleted file mode 100644
index b691e31e95..0000000000
--- a/tools/cli/src/lib/upgrade/registry.ts
+++ /dev/null
@@ -1,23 +0,0 @@
-import { adoptConvexStatefulMigration } from './migrations/adopt-convex-stateful';
-import { namespaceCaddyConfigMigration } from './migrations/namespace-caddy-config';
-import { namespaceVolumesMigration } from './migrations/namespace-volumes';
-import { splitConvexMigration } from './migrations/split-convex';
-import type { Migration } from './types';
-
-/**
- * Ordered registry of all known upgrade steps.
- *
- * This is NOT a per-release changelog — each entry is a one-shot data
- * migration the CLI knows how to apply. Steps are idempotent (gated by
- * detect()), so the registry only grows when a release actually needs to
- * mutate user state on the host (Docker volumes, on-disk files, …).
- *
- * Order matters: each entry may assume every earlier entry has run (or
- * reported "nothing to do" via detect()). Never reorder; only append.
- */
-export const MIGRATIONS: readonly Migration[] = [
-  namespaceVolumesMigration, // v0.2.33 — rename tale_* → ${projectId}_*
-  splitConvexMigration, // v0.3.0  — platform-data → convex-data
-  namespaceCaddyConfigMigration, // v0.3.1  — fix: caddy-config missed by namespace-volumes
-  adoptConvexStatefulMigration, // v0.3.1  — convex from color→stateful project
-];
diff --git a/tools/cli/src/lib/upgrade/runner.test.ts b/tools/cli/src/lib/upgrade/runner.test.ts
deleted file mode 100644
index 34361e41eb..0000000000
--- a/tools/cli/src/lib/upgrade/runner.test.ts
+++ /dev/null
@@ -1,230 +0,0 @@
-import { afterEach, describe, expect, mock, test } from 'bun:test';
-
-import type { ApplyOutcome, Migration, MigrationContext } from './types';
-
-// --- Mocks ---
-
-const recordAppliedMock = mock();
-
-mock.module('./state', () => ({
-  recordApplied: recordAppliedMock,
-}));
-
-mock.module('../../utils/logger', () => ({
-  blank: mock(),
-  header: mock(),
-  info: mock(),
-  notice: mock(),
-  step: mock(),
-  success: mock(),
-  warn: mock(),
-  debug: mock(),
-  error: mock(),
-}));
-
-mock.module('../../utils/confirm', () => ({
-  confirm: mock(() => Promise.resolve(true)),
-}));
-
-// --- Helpers ---
-
-const CTX: MigrationContext = {
-  projectId: 'test-project',
-  projectDir: '/tmp/test-project',
-};
-
-function makeMigration(
-  id: string,
-  opts: {
-    detect?: boolean;
-    apply?: ApplyOutcome;
-    stops?: string[];
-    detectFn?: () => Promise<boolean>;
-  } = {},
-): Migration {
-  return {
-    id,
-    introducedIn: '0.0.1',
-    description: `Migration ${id}`,
-    detect: opts.detectFn ?? mock(() => Promise.resolve(opts.detect ?? false)),
-    requiredStops: mock(() => Promise.resolve(opts.stops ?? [])),
-    apply: mock(() => Promise.resolve(opts.apply ?? 'applied')),
-  };
-}
-
-// --- Import after mocks ---
-
-const { runPendingMigrations, planPendingMigrations } =
-  await import('./runner');
-
-afterEach(() => {
-  recordAppliedMock.mockReset();
-});
-
-// --- Tests ---
-
-describe('runPendingMigrations', () => {
-  test('returns proceed=true with no applied when nothing is pending', async () => {
-    const m = makeMigration('a', { detect: false });
-    const result = await runPendingMigrations([m], CTX, {
-      context: 'deploy',
-      assumeYes: true,
-    });
-
-    expect(result).toEqual({ proceed: true, applied: [], declined: false });
-    expect(m.detect).toHaveBeenCalledWith(CTX);
-    expect(m.apply).not.toHaveBeenCalled();
-  });
-
-  test('applies a new pending migration', async () => {
-    const m = makeMigration('a', { detect: true, apply: 'applied' });
-    const result = await runPendingMigrations([m], CTX, {
-      context: 'deploy',
-      assumeYes: true,
-    });
-
-    expect(result.proceed).toBe(true);
-    expect(result.applied).toEqual(['a']);
-    expect(m.apply).toHaveBeenCalledWith(CTX, { dryRun: false });
-    expect(recordAppliedMock).toHaveBeenCalledTimes(1);
-    expect(recordAppliedMock.mock.calls[0][1]).toMatchObject({ id: 'a' });
-  });
-
-  test('re-detects and re-applies a drifted migration (detect returns true even if previously recorded)', async () => {
-    // Simulate a migration whose end-state has drifted: detect() returns true
-    // even though recordApplied would be a no-op (already recorded).
-    // The key assertion: detect() IS called, apply() IS called.
-    const m = makeMigration('split-convex', {
-      detect: true,
-      apply: 'applied',
-    });
-
-    const result = await runPendingMigrations([m], CTX, {
-      context: 'deploy',
-      assumeYes: true,
-    });
-
-    expect(result.proceed).toBe(true);
-    expect(result.applied).toEqual(['split-convex']);
-    expect(m.detect).toHaveBeenCalledTimes(1);
-    expect(m.apply).toHaveBeenCalledTimes(1);
-  });
-
-  test('skips migrations whose detect() returns false', async () => {
-    const satisfied = makeMigration('done', { detect: false });
-    const pending = makeMigration('todo', { detect: true, apply: 'applied' });
-
-    const result = await runPendingMigrations([satisfied, pending], CTX, {
-      context: 'deploy',
-      assumeYes: true,
-    });
-
-    expect(result.applied).toEqual(['todo']);
-    expect(satisfied.detect).toHaveBeenCalled();
-    expect(satisfied.apply).not.toHaveBeenCalled();
-  });
-
-  test('preserves registry order for mixed pending migrations', async () => {
-    const a = makeMigration('a', { detect: true, apply: 'applied' });
-    const b = makeMigration('b', { detect: false });
-    const c = makeMigration('c', { detect: true, apply: 'applied' });
-
-    const result = await runPendingMigrations([a, b, c], CTX, {
-      context: 'deploy',
-      assumeYes: true,
-    });
-
-    expect(result.applied).toEqual(['a', 'c']);
-    // Verify order: a applied before c
-    const aCallOrder = (a.apply as ReturnType<typeof mock>).mock
-      .invocationCallOrder[0];
-    const cCallOrder = (c.apply as ReturnType<typeof mock>).mock
-      .invocationCallOrder[0];
-    expect(aCallOrder).toBeLessThan(cCallOrder);
-  });
-
-  test('propagates detect() errors with context', async () => {
-    const m = makeMigration('bad', {
-      detectFn: () => Promise.reject(new Error('docker not found')),
-    });
-
-    await expect(
-      runPendingMigrations([m], CTX, {
-        context: 'deploy',
-        assumeYes: true,
-      }),
-    ).rejects.toThrow('migration bad: detect() failed: docker not found');
-  });
-
-  test('collects requiredStops from all pending migrations', async () => {
-    const a = makeMigration('a', {
-      detect: true,
-      apply: 'applied',
-      stops: ['container-1'],
-    });
-    const b = makeMigration('b', {
-      detect: true,
-      apply: 'applied',
-      stops: ['container-2', 'container-1'],
-    });
-
-    const stopsReceived: string[][] = [];
-    const performStops = mock((s: string[]) => {
-      stopsReceived.push(s);
-      return Promise.resolve();
-    });
-
-    await runPendingMigrations([a, b], CTX, {
-      context: 'deploy',
-      assumeYes: true,
-      performStops,
-    });
-
-    expect(performStops).toHaveBeenCalledTimes(1);
-    expect(stopsReceived[0]).toContain('container-1');
-    expect(stopsReceived[0]).toContain('container-2');
-    expect(stopsReceived[0]).toHaveLength(2); // deduplicated
-  });
-
-  test('handles noop outcome from apply()', async () => {
-    const m = makeMigration('already-ok', { detect: true, apply: 'noop' });
-
-    const result = await runPendingMigrations([m], CTX, {
-      context: 'deploy',
-      assumeYes: true,
-    });
-
-    expect(result.applied).toEqual(['already-ok']);
-    expect(recordAppliedMock).toHaveBeenCalledTimes(1);
-  });
-
-  test('dry-run prints plan but does not apply', async () => {
-    const m = makeMigration('a', { detect: true });
-
-    const result = await runPendingMigrations([m], CTX, {
-      context: 'deploy',
-      assumeYes: true,
-      dryRun: true,
-    });
-
-    expect(result.applied).toEqual([]);
-    expect(m.apply).not.toHaveBeenCalled();
-    expect(recordAppliedMock).not.toHaveBeenCalled();
-  });
-});
-
-describe('planPendingMigrations', () => {
-  test('returns empty array when nothing is pending', async () => {
-    const m = makeMigration('a', { detect: false });
-    const result = await planPendingMigrations([m], CTX);
-    expect(result).toEqual([]);
-  });
-
-  test('returns pending migrations without applying', async () => {
-    const m = makeMigration('a', { detect: true });
-    const result = await planPendingMigrations([m], CTX);
-    expect(result).toHaveLength(1);
-    expect(result[0].id).toBe('a');
-    expect(m.apply).not.toHaveBeenCalled();
-  });
-});
diff --git a/tools/cli/src/lib/upgrade/runner.ts b/tools/cli/src/lib/upgrade/runner.ts
deleted file mode 100644
index 5fff0cfd7f..0000000000
--- a/tools/cli/src/lib/upgrade/runner.ts
+++ /dev/null
@@ -1,199 +0,0 @@
-import pkg from '../../../package.json';
-import { confirm } from '../../utils/confirm';
-import * as logger from '../../utils/logger';
-import { recordApplied } from './state';
-import type { Migration, MigrationContext } from './types';
-
-/**
- * Compute the pending subset of migrations: those whose `detect()` returns
- * true for the current observable state.
- *
- * Per the contract in types.ts, `detect()` is the sole source of truth —
- * `migrations.json` is a log, not a gate. A migration whose end-state has
- * drifted (e.g. a volume was deleted after the migration was recorded) will
- * be re-detected and re-applied automatically.
- *
- * Order is preserved from the registry — callers must not reorder.
- */
-async function computePending(
-  registry: readonly Migration[],
-  ctx: MigrationContext,
-): Promise<Migration[]> {
-  const pending: Migration[] = [];
-  for (const m of registry) {
-    try {
-      if (await m.detect(ctx)) pending.push(m);
-    } catch (err) {
-      // A failing detect() should not silently drop the migration — surface
-      // it loudly so operators can investigate before we either apply an
-      // unsafe migration or skip a necessary one.
-      throw new Error(
-        `migration ${m.id}: detect() failed: ${err instanceof Error ? err.message : String(err)}`,
-        { cause: err },
-      );
-    }
-  }
-  return pending;
-}
-
-function resolveDescription(m: Migration, ctx: MigrationContext): string {
-  return typeof m.description === 'function'
-    ? m.description(ctx)
-    : m.description;
-}
-
-function printPlan(
-  pending: readonly Migration[],
-  stops: readonly string[],
-  ctx: MigrationContext,
-): void {
-  logger.blank();
-  logger.header(`${pending.length} pending migration(s)`);
-  for (const m of pending) {
-    logger.info(`  • ${m.id} (introduced in ${m.introducedIn})`);
-    logger.info(`      ${resolveDescription(m, ctx)}`);
-  }
-  if (stops.length > 0) {
-    logger.blank();
-    logger.info('The following compose projects / containers will be stopped:');
-    for (const s of stops) logger.info(`  - ${s}`);
-  }
-  logger.blank();
-}
-
-function isNonInteractive(): boolean {
-  return !(process.stdin.isTTY && process.stdout.isTTY);
-}
-
-interface RunPendingOptions {
-  /** Where we're being called from — used in messages only. */
-  context: 'start' | 'deploy' | 'upgrade';
-  /** Skip the interactive prompt and proceed. Required for non-TTY use. */
-  assumeYes?: boolean;
-  /** Print the plan but apply nothing. */
-  dryRun?: boolean;
-  /**
-   * Callback invoked with the union of `requiredStops()` across pending
-   * migrations, before apply runs. Callers implement the actual
-   * `docker compose -p <name> down` since that behaviour varies between
-   * start and deploy call sites.
-   */
-  performStops?: (stops: string[]) => Promise<void>;
-}
-
-interface RunPendingResult {
-  /** True if the caller should keep executing the original command. */
-  proceed: boolean;
-  /** Migrations that ran successfully this pass. */
-  applied: string[];
-  /** True if migrations were pending but the user declined to apply. */
-  declined: boolean;
-}
-
-/**
- * Runs the pending-migration pipeline.
- *
- *  - If nothing is pending → proceed=true, no side effects.
- *  - If pending and interactive → print plan, prompt [y/N]:
- *      yes → apply all in order, record each, proceed=true
- *      no  → exit cleanly with proceed=false, declined=true, no side effects
- *  - If pending and non-TTY and not assumeYes → throw (caller turns this
- *    into a process exit with a clear error).
- *  - If pending and (TTY or assumeYes) → apply all in order.
- *
- * This is the single entry point used by `tale start` and `tale deploy`.
- */
-export async function runPendingMigrations(
-  registry: readonly Migration[],
-  ctx: MigrationContext,
-  opts: RunPendingOptions,
-): Promise<RunPendingResult> {
-  const pending = await computePending(registry, ctx);
-  if (pending.length === 0) {
-    return { proceed: true, applied: [], declined: false };
-  }
-
-  // Collect the union of requiredStops across pending migrations so we can
-  // show the full blast radius up front.
-  const stopsSet = new Set<string>();
-  for (const m of pending) {
-    for (const s of await m.requiredStops(ctx)) stopsSet.add(s);
-  }
-  const stops = [...stopsSet];
-
-  printPlan(pending, stops, ctx);
-
-  if (opts.dryRun) {
-    logger.notice(
-      'DRY RUN — migrations NOT applied. Re-run without --dry-run to apply.',
-    );
-    return { proceed: true, applied: [], declined: false };
-  }
-
-  // Decide whether to proceed.
-  let approved = opts.assumeYes === true;
-  if (!approved) {
-    if (isNonInteractive()) {
-      throw new Error(
-        'Pending migrations detected but stdin/stdout is not a TTY. ' +
-          'Re-run this command in an interactive shell to confirm, or pass --yes to accept non-interactively.',
-      );
-    }
-    approved = await confirm(
-      `Apply ${pending.length} pending migration(s) now?`,
-    );
-    if (!approved) {
-      logger.info('Migration declined. Nothing changed.');
-      return { proceed: false, applied: [], declined: true };
-    }
-  }
-
-  // Stop everything the pending migrations need down.
-  if (stops.length > 0 && opts.performStops) {
-    logger.step('Stopping containers before migration...');
-    await opts.performStops(stops);
-  }
-
-  // Apply in order. Record each as soon as it succeeds so a mid-pipeline
-  // failure leaves us resumable.
-  const applied: string[] = [];
-  for (const m of pending) {
-    logger.step(`Applying migration: ${m.id}`);
-    const outcome = await m.apply(ctx, { dryRun: false });
-    await recordApplied(ctx.projectDir, {
-      id: m.id,
-      at: new Date().toISOString(),
-      cliVersion: pkg.version,
-    });
-    if (outcome === 'applied') {
-      logger.success(`Migration ${m.id} applied.`);
-    } else {
-      logger.info(`Migration ${m.id} was a no-op (already satisfied).`);
-    }
-    applied.push(m.id);
-  }
-
-  return { proceed: true, applied, declined: false };
-}
-
-/**
- * Plan-only variant used by `tale upgrade`. Never stops containers, never
- * runs apply(). Just logs the plan so operators know what `tale start` /
- * `tale deploy` will do next.
- */
-export async function planPendingMigrations(
-  registry: readonly Migration[],
-  ctx: MigrationContext,
-): Promise<Migration[]> {
-  const pending = await computePending(registry, ctx);
-  if (pending.length === 0) return [];
-  const stopsSet = new Set<string>();
-  for (const m of pending) {
-    for (const s of await m.requiredStops(ctx)) stopsSet.add(s);
-  }
-  printPlan(pending, [...stopsSet], ctx);
-  logger.notice(
-    'Run "tale start" (dev) or "tale deploy" (prod) to apply — the CLI will prompt before changing anything.',
-  );
-  return pending;
-}
diff --git a/tools/cli/src/lib/upgrade/state.ts b/tools/cli/src/lib/upgrade/state.ts
deleted file mode 100644
index 33627202ff..0000000000
--- a/tools/cli/src/lib/upgrade/state.ts
+++ /dev/null
@@ -1,94 +0,0 @@
-import { existsSync } from 'node:fs';
-import { mkdir, readFile, rename, unlink, writeFile } from 'node:fs/promises';
-import { dirname, join } from 'node:path';
-
-import * as logger from '../../utils/logger';
-import type { AppliedMigration, MigrationsState } from './types';
-
-const STATE_FILENAME = 'migrations.json';
-/** Legacy one-bit marker written by v0.2.33. Migrated on first read. */
-const LEGACY_MARKER = 'migration-pending';
-
-function statePath(projectDir: string): string {
-  return join(projectDir, '.tale', STATE_FILENAME);
-}
-
-function legacyMarkerPath(projectDir: string): string {
-  return join(projectDir, '.tale', LEGACY_MARKER);
-}
-
-/**
- * Read the applied-migration list from `.tale/migrations.json`. If the file
- * doesn't exist but the legacy `.tale/migration-pending` marker is present,
- * treat that as "no migrations applied yet" (the legacy marker carried no
- * per-migration identity, so we must let each registered migration's detect()
- * re-discover any real pending work) and delete the legacy marker.
- */
-async function readMigrationsState(
-  projectDir: string,
-): Promise<MigrationsState> {
-  const path = statePath(projectDir);
-  if (!existsSync(path)) {
-    const legacyPath = legacyMarkerPath(projectDir);
-    if (existsSync(legacyPath)) {
-      logger.debug(
-        `Found legacy migration marker at ${legacyPath}; seeding empty migrations.json`,
-      );
-      await unlink(legacyPath).catch(() => {
-        /* best-effort */
-      });
-    }
-    return { applied: [] };
-  }
-  try {
-    const raw = await readFile(path, 'utf8');
-    const parsed = JSON.parse(raw) as Partial<MigrationsState>;
-    if (!parsed.applied || !Array.isArray(parsed.applied)) {
-      return { applied: [] };
-    }
-    return { applied: parsed.applied };
-  } catch (err) {
-    // Preserve the corrupt file for postmortem rather than silently losing
-    // history. A truncated write (crash, disk full) can land here; the
-    // operator will want to see the bytes that were there.
-    const backupPath = `${path}.corrupted-${new Date()
-      .toISOString()
-      .replace(/[:.]/g, '-')}`;
-    await rename(path, backupPath).catch(() => {
-      /* best-effort — if even rename fails, log and continue */
-    });
-    logger.warn(
-      `Could not parse ${path}: ${err instanceof Error ? err.message : String(err)}. Moved to ${backupPath} and treating as empty.`,
-    );
-    return { applied: [] };
-  }
-}
-
-export async function writeMigrationsState(
-  projectDir: string,
-  state: MigrationsState,
-): Promise<void> {
-  const path = statePath(projectDir);
-  await mkdir(dirname(path), { recursive: true });
-  // Atomic write: write to a sibling tmp file then rename. rename(2) is
-  // atomic on POSIX when source and destination are on the same filesystem,
-  // so a crash during write leaves the previous migrations.json intact
-  // instead of producing a truncated/parseable-as-empty file.
-  const tmpPath = `${path}.tmp`;
-  await writeFile(tmpPath, `${JSON.stringify(state, null, 2)}\n`);
-  await rename(tmpPath, path);
-}
-
-export async function recordApplied(
-  projectDir: string,
-  entry: AppliedMigration,
-): Promise<void> {
-  const state = await readMigrationsState(projectDir);
-  if (state.applied.some((a) => a.id === entry.id)) {
-    // Already recorded; nothing to do. This can happen on an idempotent
-    // re-run of a migration whose detect() returned true by accident.
-    return;
-  }
-  state.applied.push(entry);
-  await writeMigrationsState(projectDir, state);
-}
diff --git a/tools/cli/src/lib/upgrade/types.ts b/tools/cli/src/lib/upgrade/types.ts
deleted file mode 100644
index dc4107822a..0000000000
--- a/tools/cli/src/lib/upgrade/types.ts
+++ /dev/null
@@ -1,71 +0,0 @@
-/** Context passed to every migration's detect/apply/requiredStops hook. */
-export interface MigrationContext {
-  projectId: string;
-  projectDir: string;
-}
-
-/** Outcome of a single apply() call. */
-export type ApplyOutcome = 'applied' | 'noop';
-
-/**
- * A single migration step registered in the pipeline.
- *
- * Every migration must be **idempotent**, and its `detect` must be a pure
- * feature check against observable end-state:
- *
- *   - `detect` must return `true` only when the migration's postcondition
- *     does NOT already hold. Mere existence of source artifacts is not
- *     enough; the destination must also be absent/incomplete. Stray
- *     legacy volumes on the host from unrelated installs must never
- *     cause `detect` to return `true`.
- *   - `apply` must be safe to re-run at any point — on a fully satisfied
- *     system, on a partially-migrated system after an interruption, and
- *     on a freshly-initialised system. It must independently re-check
- *     each unit of work against the target state and skip units already
- *     satisfied.
- *   - `detect` and `apply` must NOT consult `migrations.json`, CLI
- *     versions, or any other external history. Those are caches/logs,
- *     not sources of truth — the filesystem/volume/container state is.
- *
- * A migration that can't express its precondition in terms of observable
- * end-state is a bug.
- */
-export interface Migration {
-  /** Stable id, used as the key in `.tale/migrations.json`. */
-  id: string;
-  /** CLI version that introduced this migration (for logs only). */
-  introducedIn: string;
-  /**
-   * One-line human-readable description, shown in plan output. May be a
-   * static string OR a function of the context when the description needs
-   * to interpolate projectId etc. — plain strings never get template-literal
-   * expansion at use site, so use the function form whenever the text
-   * contains per-project names.
-   */
-  description: string | ((ctx: MigrationContext) => string);
-  /** Returns true iff this migration has work to do given current state. */
-  detect(ctx: MigrationContext): Promise<boolean>;
-  /**
-   * Docker compose project names / container names that must be stopped
-   * before apply(). The runner collects the union across pending migrations
-   * and stops them once.
-   */
-  requiredStops(ctx: MigrationContext): Promise<string[]>;
-  /** Apply the migration. Must throw on any error. */
-  apply(
-    ctx: MigrationContext,
-    opts: { dryRun: boolean },
-  ): Promise<ApplyOutcome>;
-}
-
-/** Persisted record of a successfully-applied migration. */
-export interface AppliedMigration {
-  id: string;
-  at: string;
-  cliVersion: string;
-}
-
-/** Shape of `.tale/migrations.json`. */
-export interface MigrationsState {
-  applied: AppliedMigration[];
-}
diff --git a/tools/cli/src/lib/upgrade/volume-helpers.ts b/tools/cli/src/lib/upgrade/volume-helpers.ts
deleted file mode 100644
index c8d3f6a6e9..0000000000
--- a/tools/cli/src/lib/upgrade/volume-helpers.ts
+++ /dev/null
@@ -1,333 +0,0 @@
-import * as logger from '../../utils/logger';
-import { docker } from '../docker/docker';
-
-/**
- * Sentinel file written inside a destination volume once the `cp -a` completes
- * successfully. Presence guarantees a complete migration; absence (with data
- * present) indicates a partial/interrupted copy that must be recovered.
- */
-const MIGRATION_SENTINEL = '.tale-migration-complete';
-
-export async function volumeExists(name: string): Promise<boolean> {
-  const r = await docker('volume', 'inspect', name);
-  return r.success;
-}
-
-export async function volumeHasData(
-  name: string,
-  image: string,
-): Promise<boolean> {
-  const r = await docker(
-    'run',
-    '--rm',
-    '-v',
-    `${name}:/vol:ro`,
-    '--entrypoint',
-    'sh',
-    image,
-    '-c',
-    'ls -A /vol | head -1',
-  );
-  return r.success && r.stdout.trim().length > 0;
-}
-
-async function volumeHasSentinel(
-  name: string,
-  image: string,
-): Promise<boolean> {
-  const r = await docker(
-    'run',
-    '--rm',
-    '-v',
-    `${name}:/vol:ro`,
-    '--entrypoint',
-    'sh',
-    image,
-    '-c',
-    `test -f /vol/${MIGRATION_SENTINEL}`,
-  );
-  return r.success;
-}
-
-async function volumeFileCount(
-  name: string,
-  image: string,
-): Promise<number | null> {
-  // `cp -a` preserves regular files, directories, and symlinks but silently
-  // skips sockets, FIFOs, and device nodes. To keep src/dst counts
-  // comparable, only count things cp will actually copy: regular files and
-  // symlinks. Exclude the migration sentinel itself so chained migrations
-  // (whose source may already carry a sentinel from an earlier pipeline
-  // step) compare cleanly — sentinel presence is verified separately via
-  // volumeHasSentinel.
-  const r = await docker(
-    'run',
-    '--rm',
-    '-v',
-    `${name}:/vol:ro`,
-    '--entrypoint',
-    'sh',
-    image,
-    '-c',
-    `find /vol \\( -type f -o -type l \\) ! -name '${MIGRATION_SENTINEL}' | wc -l`,
-  );
-  if (!r.success) return null;
-  const n = parseInt(r.stdout.trim(), 10);
-  return Number.isFinite(n) ? n : null;
-}
-
-/** Diagnostic: list relative paths present in `/src` but not in `/dst`, plus
- *  any special (non-regular, non-symlink, non-dir) files in src that cp -a
- *  would have skipped. Best-effort — used only on verification failure. */
-async function diffVolumes(
-  src: string,
-  dst: string,
-  image: string,
-): Promise<string> {
-  const r = await docker(
-    'run',
-    '--rm',
-    '-v',
-    `${src}:/src:ro`,
-    '-v',
-    `${dst}:/dst:ro`,
-    '--entrypoint',
-    'sh',
-    image,
-    '-c',
-    [
-      '(cd /src && find . \\( -type f -o -type l \\) | sort) > /tmp/s',
-      '(cd /dst && find . \\( -type f -o -type l \\) | sort) > /tmp/d',
-      'echo "--- src counts ---"',
-      'echo "regular files: $(find /src -type f | wc -l)"',
-      'echo "symlinks:      $(find /src -type l | wc -l)"',
-      'echo "dirs:          $(find /src -type d | wc -l)"',
-      'echo "special:       $(find /src ! -type f ! -type l ! -type d | wc -l)"',
-      'echo "sentinel:      $(ls -la /src/.tale-migration-complete 2>/dev/null || echo absent)"',
-      'echo "--- dst counts ---"',
-      'echo "regular files: $(find /dst -type f | wc -l)"',
-      'echo "symlinks:      $(find /dst -type l | wc -l)"',
-      'echo "dirs:          $(find /dst -type d | wc -l)"',
-      'echo "special:       $(find /dst ! -type f ! -type l ! -type d | wc -l)"',
-      'echo "sentinel:      $(ls -la /dst/.tale-migration-complete 2>/dev/null || echo absent)"',
-      'echo "--- in src but not dst (first 20) ---"',
-      'comm -23 /tmp/s /tmp/d | head -20',
-      'echo "--- in dst but not src (first 20) ---"',
-      'comm -13 /tmp/s /tmp/d | head -20',
-    ].join(' && '),
-  );
-  if (!r.success) return `diff failed: ${r.stderr.trim()}`;
-  return r.stdout.trim();
-}
-
-/** Rename a volume's contents aside by moving them into a timestamped sub-dir.
- *  Safer than deleting: if we later discover we wiped legitimate data, the
- *  operator can recover by hand. Note: docker volumes can't be renamed, so we
- *  create a sibling *-partial-<ts> volume and copy the unsentinelled contents
- *  into it before wiping the destination. */
-async function moveContentsToBackupVolume(
-  name: string,
-  image: string,
-): Promise<string | null> {
-  const ts = new Date().toISOString().replace(/[:.]/g, '-');
-  const backup = `${name}.partial-${ts}`;
-  const created = await docker('volume', 'create', backup);
-  if (!created.success) {
-    logger.warn(
-      `  failed to create backup volume ${backup}: ${created.stderr.trim()}`,
-    );
-    return null;
-  }
-  const copy = await docker(
-    'run',
-    '--rm',
-    '-v',
-    `${name}:/src:ro`,
-    '-v',
-    `${backup}:/dst`,
-    '--entrypoint',
-    'sh',
-    image,
-    '-c',
-    'cp -a /src/. /dst/',
-  );
-  if (!copy.success) {
-    logger.warn(
-      `  failed to copy partial contents into ${backup}: ${copy.stderr.trim()}`,
-    );
-    return null;
-  }
-  // Fail-fast wipe: `find -delete` without `-e` continues past errors and can
-  // leave the destination half-wiped; a subsequent copyVolumeWithVerify would
-  // then see corrupted state. Use `sh -e` so any rm failure aborts loudly.
-  const wipe = await docker(
-    'run',
-    '--rm',
-    '-v',
-    `${name}:/vol`,
-    '--entrypoint',
-    'sh',
-    image,
-    '-ec',
-    'cd /vol && find . -mindepth 1 -maxdepth 1 -exec rm -rf -- {} +',
-  );
-  if (!wipe.success) {
-    logger.warn(
-      `  failed to wipe ${name} (destination may be partial): ${wipe.stderr.trim()}`,
-    );
-    return null;
-  }
-  return backup;
-}
-
-/** Find an image that is already available locally for running throwaway
- *  copy containers, avoiding a network pull. Prefers images we know Tale
- *  itself ships so plain `docker image inspect` succeeds.  */
-export async function resolveMigrationImage(): Promise<string> {
-  const candidates = ['tale-platform', 'tale-proxy', 'alpine'];
-  for (const candidate of candidates) {
-    const exact = await docker('image', 'inspect', candidate);
-    if (exact.success) return candidate;
-    const lookup = await docker(
-      'images',
-      '--format',
-      '{{.Repository}}:{{.Tag}}',
-    );
-    if (lookup.success) {
-      const match = lookup.stdout
-        .split('\n')
-        .find((line) => line.includes(candidate) && !line.includes('<none>'));
-      if (match) return match.trim();
-    }
-  }
-  // Final fallback: alpine — may trigger a pull, but docker run handles that
-  // transparently.
-  return 'alpine';
-}
-
-/** Stop a container and wait for it to exit. Treats failure as fatal so
- *  callers never run a volume copy against a live container. */
-export async function stopContainerOrThrow(name: string): Promise<void> {
-  const stop = await docker('stop', '-t', '30', name);
-  if (!stop.success) {
-    throw new Error(
-      `failed to stop container ${name}: ${stop.stderr.trim() || 'unknown error'}`,
-    );
-  }
-  const waited = await docker('wait', name);
-  if (!waited.success) {
-    throw new Error(
-      `container ${name} did not confirm shutdown: ${waited.stderr.trim() || 'unknown error'}`,
-    );
-  }
-}
-
-/**
- * Copy the contents of one volume into another, verify with a strict file
- * count check (src vs dst-minus-sentinel must match exactly), and mark the
- * destination with the sentinel file only on success.
- *
- * If the destination already has data but no sentinel, it is moved aside
- * into a timestamped backup volume rather than wiped, so the operator can
- * recover manually if the earlier state was actually legitimate.
- */
-export async function copyVolumeWithVerify(
-  src: string,
-  dst: string,
-  image: string,
-): Promise<void> {
-  if (!(await volumeExists(dst))) {
-    const created = await docker('volume', 'create', dst);
-    if (!created.success) {
-      throw new Error(
-        `failed to create destination volume ${dst}: ${created.stderr.trim()}`,
-      );
-    }
-  } else if (await volumeHasData(dst, image)) {
-    if (await volumeHasSentinel(dst, image)) {
-      // Already migrated — caller should have detected this and skipped. We
-      // treat this as a soft no-op rather than an error.
-      logger.debug(
-        `${dst} already has migration sentinel; skipping re-copy in copyVolumeWithVerify`,
-      );
-      return;
-    }
-    // Safety rail: by the time we reach here, the calling migration's
-    // `detect`/`findPending` has already asserted this destination is NOT
-    // in its end-state. But if a migration has a detection bug and asks us
-    // to copy something SMALLER than what's already on the destination,
-    // this is almost certainly either (a) a stale / unrelated source being
-    // pulled in, or (b) a logic error in the migration. Either way, silent
-    // clobbering is wrong — fail loudly and let the operator investigate.
-    const srcCountPre = await volumeFileCount(src, image);
-    const dstCountPre = await volumeFileCount(dst, image);
-    if (srcCountPre != null && dstCountPre != null) {
-      if (srcCountPre === 0 && dstCountPre > 0) {
-        throw new Error(
-          `refusing to overwrite ${dst} (${dstCountPre} files) with empty source ${src}. This looks like a migration detection bug — destination already populated but source is empty.`,
-        );
-      }
-      if (dstCountPre > srcCountPre * 2) {
-        throw new Error(
-          `refusing to overwrite ${dst} (${dstCountPre} files) with much smaller source ${src} (${srcCountPre} files). A migration should not replace populated destination data with a substantially smaller source — this looks like a stale/unrelated source volume.`,
-        );
-      }
-    }
-    logger.warn(
-      `  ⚠  ${dst} has data but no sentinel; moving partial contents to backup volume`,
-    );
-    const backup = await moveContentsToBackupVolume(dst, image);
-    if (!backup) {
-      throw new Error(
-        `could not preserve partial contents of ${dst}; aborting migration`,
-      );
-    }
-    logger.info(`  partial contents preserved at volume: ${backup}`);
-  }
-
-  // Run the copy as root (no --user flag). Destination volume is newly
-  // created by docker with a root-owned / directory, so a non-root process
-  // cannot write into it. `cp -a` preserves ownership from source, so files
-  // populated by the convex container (uid 1001) stay uid 1001. We chown
-  // the dst root + sentinel explicitly so the app user can read/write at
-  // the top level when convex later mounts it.
-  const copy = await docker(
-    'run',
-    '--rm',
-    '-v',
-    `${src}:/src:ro`,
-    '-v',
-    `${dst}:/dst`,
-    '--entrypoint',
-    'sh',
-    image,
-    '-ec',
-    `cp -a /src/. /dst/ && : > /dst/${MIGRATION_SENTINEL} && chown 1001:1001 /dst /dst/${MIGRATION_SENTINEL}`,
-  );
-  if (!copy.success) {
-    throw new Error(`copy ${src} → ${dst} failed: ${copy.stderr.trim()}`);
-  }
-
-  const srcCount = await volumeFileCount(src, image);
-  const dstCount = await volumeFileCount(dst, image);
-  if (srcCount == null || dstCount == null) {
-    throw new Error(
-      `could not verify file counts for ${src} → ${dst} (src=${srcCount}, dst=${dstCount})`,
-    );
-  }
-  // Both counts exclude the sentinel itself (see volumeFileCount) so chained
-  // migrations compare cleanly regardless of whether src already carries a
-  // sentinel from an earlier pipeline step.
-  if (dstCount !== srcCount) {
-    const diff = await diffVolumes(src, dst, image);
-    throw new Error(
-      `file count mismatch for ${src} → ${dst}: src=${srcCount}, dst=${dstCount}. Refusing to mark migration complete.\n${diff}`,
-    );
-  }
-  if (!(await volumeHasSentinel(dst, image))) {
-    throw new Error(
-      `migration sentinel missing on ${dst} after copy. Refusing to mark migration complete.`,
-    );
-  }
-  logger.success(`  ${src} → ${dst} (${srcCount} files)`);
-}

From bddf863e6c202e7963cbb21ba508ff4794df0c7c Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Wed, 27 May 2026 22:55:07 +0800
Subject: [PATCH 02/41] fix(platform,cli,rag,crawler): error-reporting chain +
 org-aware RAG/Crawler + runbook
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two-round multi-agent review of 5deaa5a8c surfaced ~45 findings across
four themes; this commit lands the unified fix in one pass.

Wave 1 — Correctness (error-reporting chain + safety guards)

- scaffold.ts: seedDomain/seedRetention now return structured
  { domain, ok, error? } results. scaffoldNewOrganization aggregates
  and accepts a new `strict` arg — when true, throws on any
  per-domain failure (used by reseed_all_orgs); when false/omitted,
  preserves the org-create lenient semantics auth.afterCreateOrganization
  depends on. Promote path.isAbsolute(TALE_CONFIG_DIR) guard from
  cleanupOrgFilesystem so seedRetention can't accidentally write
  into the action's CWD on unset env. Bundle-mode rm-before-copy
  replaced with staging-dir + atomic-rename. randomUUID suffix on
  the condemned-dir name (defends against ms-resolution Date.now()
  collisions). Opportunistic janitor for stale <root>/.deleted-*
  trees older than 24h. Three previously-empty catches replaced
  with console.warn lines.
- reseed_all_orgs.ts: throw at end of loop when failed > 0 with
  aggregated failed-slug detail; that propagates through
  bunx convex run → docker exec exit code → CLI throw. Returns
  validator added so the action's shape is explicit. Passes
  strict:true to scaffold so per-domain failures are no longer
  swallowed silently.
- tools/cli/src/lib/actions/reseed-all-orgs.ts: add --no-push to
  bunx convex run; grep-strip the bunx banner (Admin key, emoji
  separators, blank lines) so the trailing JSON is parseable;
  parse the result on success and surface succeeded/total counts;
  throw on docker-exec non-zero (which is what the action's new
  end-of-loop throw produces).
- tools/cli/src/lib/actions/deploy.ts: stageOrgIntoDir filter now
  skips dotfiles (.git, .DS_Store, .vscode, .idea, .tale),
  node_modules, __pycache__ — the previous filter only excluded
  .history/ and *.secrets.json at depth ≥ 1, so operators with
  default/.git/ or macOS .DS_Store would have shipped those into
  /app/data/. syncProjectFiles now throws on docker cp failure
  instead of returning, so the outer success("Deployment complete!")
  no longer prints over a half-pushed state.
- Deleted services/platform/convex/migrations/rename_org_slug.ts —
  under the multi-org model the migration is actively dangerous
  (renames every org to `default`), and there is no registry/cron
  tying it to anything. The docblock's "Self-hosted Tale deployments
  use a single organization" assumption is stale.

Wave 2 — Hidden code paths (out-of-sight assumptions that survived 5deaa5a8c)

- Python provider loader: load_providers + get_chat_model/
  get_embedding_model/get_vision_model + their *_config siblings
  now REQUIRE org_slug. Path resolves to
  <root>/<org_slug>/providers/ instead of <root>/providers/.
  BaseServiceSettings + Settings.get_llm_config likewise threaded.
  Without this, RAG and crawler both died at FastAPI lifespan
  startup with "No chat model found" against the old flat path.
- RAG: RagService rebuilt around a per-org _OrgClients cache with
  a 15s TTL. DB pool stays singleton; embedding/openai/vision
  clients and search service are per-org, built lazily on first
  request for that org. add_document/search/generate/compare_files
  now take org_slug as first arg. Embedding dimensions pinned
  globally on first org init; subsequent orgs that disagree raise
  loudly (per-org schema would need per-org DB).
- Crawler: uses contextvars instead of explicit threading — a new
  app/org_context.py exposes set_active_org/get_active_org/
  require_org_slug; main.py mounts require_org_slug as a router-
  level dep on every public router. embedding_service.py rebuilt
  with per-org cache keyed on slug. Boot-time embedding-dim guard
  in database.py removed (no org context at lifespan); pgvector
  enforces dim on insert. vision/openai_client.py and
  file_parser_service.py read get_active_org() at each settings.get_*
  call. scheduler.py background task sets "default" with a one-shot
  warn until per-website org binding lands.
- ragFetch: optional orgSlug in init; when set, X-Tale-Org header
  is forced (cannot be spoofed via init.headers). RAG endpoints
  that need it (search/generate/upload/compare-files) enforce
  via Depends(require_org_slug); status/delete/content/compare-
  by-id stay org-agnostic. All platform callers threaded — new
  lib/helpers/org_slug.ts (orgSlugFromId) bridges organizationId
  to slug for callers that only have the id. Crawler /api/v1/search
  callers (query_web_context, search_pages) set X-Tale-Org directly.
- generate-dev-compose.ts: bind mounts rewritten for the org-first
  layout. Old HOST_CONFIG_DIRS = ['agents','workflows',…] enumerated
  flat host dirs that no longer exist after tale init writes
  default/<domain>/. Replaced with findOrgDirs() — emits
  ./<org>/<domain>:/app/data/<org>/<domain>{ro} for every org
  found. start.ts user-facing hot-reload message updated.
- RULES_CONTENT (tools/cli/src/lib/rules/content.ts) + Cursor MDC
  globs rewritten for the org-first layout. tale update now applies
  checksum protection to rules files (CLAUDE.md, .cursor/rules/
  tale.mdc, .github/copilot-instructions.md, .windsurfrules) — was
  unconditional overwrite, would clobber local edits.
- tale update embedded-examples paths prefixed with `default/` so
  scaffolded files land where init puts them; previously update
  wrote into the now-unread flat layout.
- services/convex/docker-entrypoint.sh: detects pre-orgfirst flat
  dirs at /app/data/{agents,workflows,…}/ on boot and warns loudly
  with the tale migrate config-layout runbook. atomic_cp helper
  comment reworded — it's atomic for the destination but cp itself
  isn't atomic.
- tale start: detects legacy flat-layout dirs at project root and
  prints the runbook before continuing.

Wave 3 — User-facing surface

- All three root READMEs: stale "pending data migrations are
  detected and applied automatically on tale start/deploy" claim
  replaced with the explicit migrate runbook + link to upgrades.md.
- docs/{en,de,fr}/self-hosted/configuration/providers.md: GitHub
  href tree/main/examples/providers (404) → tree/main/examples/default/providers.
- docs/{en,de,fr}/self-hosted/configuration/retention.md: path
  documented as per-org /app/data/<org>/retention.json instead of
  the removed /app/data/platform-config/governance/retention-bounds.json.
- docs/{en,de,fr}/self-hosted/operate/upgrades.md: new "Migrating
  to the org-first config layout" section covering the 3-step
  runbook, the rollback story (downgrade safe between steps 1 and
  3 via the -orgfirst marker token; provider-secrets restore-from-
  backup needed after step 3), and the skip-step-1 fallback.
- governance/mutations.ts: client-facing ConvexError message now
  references $TALE_CONFIG_DIR/<orgSlug>/retention.json.

Wave 4 — Remaining majors (small surgical fixes)

- init.ts: OpenRouter secrets file gets mode 0o600.
- .dockerignore: !examples/**/*.md keeps skill SKILL.md in build context.
- compose.yml + tools/cli/src/lib/compose/generators/constants.ts:
  stale `tale migrate split-convex` justification reworded — the
  platform-data volume is now an unused pre-0.3.0 stub kept only
  for the detect() probe in start.ts.
- migrate-config-layout/script.sh: set -euo pipefail + ${DATA:?}
  guard so an unset $DATA can't make --cleanup-old rm from the
  container root.
- Empty-catch fix-ups in branding/file_actions.ts (unlink loop +
  readdir), serve-branding-images.ts (.catch fallthrough now logs
  non-ENOENT), init.ts (detectTaleProjectFiles readdir).
- config_store/store.ts: deleted the dead orgFirst flag — every
  caller passed true. Inlined the org-first layout; deleted the
  legacy <area>/<orgSlug>.json branch and updated the unit tests.
- Stale docblocks updated in governance/{retention_actions,
  retention_bounds_proposal, retention_floors}.ts, integrations/
  {credentials_schema, load_integration}.ts, agents/file_utils.ts,
  skills/file_actions.ts (all referenced the old flat layout or
  removed env vars).
- services/platform/docker-entrypoint.sh: ORPHAN_DERIVED → LEGACY_
  DOMAIN_VARS, dropped 2>/dev/null on the env-purge so failures
  surface in logs.
- services/platform/Dockerfile: env-comment rewritten for the
  org-first sub-dir derivation.

Test surface: 36/36 tasks pass via `bun run check`; 70927 platform
tests, 298 RAG, 472 crawler. Touched tests:
test_rag_service, test_compare_files, test_background_ingest,
test_config (RAG + crawler), test_document_helpers, test_database
(crawler — boot-time dim guard tests skipped with rationale),
test_llm_cache (ContextVar setup), upload_file_direct.test,
upload_document.test, store.test (rewritten for org-first paths).

Out of scope (per user direction): reserving the literal `default`
slug at the Better Auth `beforeCreateOrganization` hook. Resolved
operationally — the operator is the first user and registers the
default org via the normal flow.
---
 .dockerignore                                 |   6 +-
 README.de.md                                  |   2 +-
 README.fr.md                                  |   2 +-
 README.md                                     |   2 +-
 compose.yml                                   |   9 +-
 .../de/self-hosted/configuration/providers.md |   2 +-
 .../de/self-hosted/configuration/retention.md |   2 +-
 docs/de/self-hosted/operate/upgrades.md       |  38 ++
 .../en/self-hosted/configuration/providers.md |   2 +-
 .../en/self-hosted/configuration/retention.md |   2 +-
 docs/en/self-hosted/operate/upgrades.md       |  38 ++
 .../fr/self-hosted/configuration/providers.md |   2 +-
 .../fr/self-hosted/configuration/retention.md |   2 +-
 docs/fr/self-hosted/operate/upgrades.md       |  38 ++
 .../src/tale_shared/config/base.py            |  48 +-
 .../src/tale_shared/config/providers.py       |  44 +-
 services/convex/docker-entrypoint.sh          |  47 +-
 services/crawler/app/main.py                  |  30 +-
 services/crawler/app/org_context.py           |  69 +++
 services/crawler/app/services/database.py     |  47 +-
 .../crawler/app/services/embedding_service.py | 102 ++--
 .../app/services/file_parser_service.py       |   7 +-
 services/crawler/app/services/scheduler.py    |  20 +-
 .../app/services/vision/openai_client.py      |   9 +-
 services/crawler/tests/test_config.py         |  14 +-
 services/crawler/tests/test_database.py       |  10 +
 services/crawler/tests/test_llm_cache.py      |   6 +
 services/platform/Dockerfile                  |  15 +-
 services/platform/convex/_generated/api.d.ts  |   4 +-
 .../helpers/fetch_document_comparison.ts      |   2 +
 .../agent_tools/rag/query_rag_context.ts      |   7 +
 .../convex/agent_tools/rag/rag_search_tool.ts |   9 +
 .../web/helpers/query_web_context.ts          |  11 +-
 .../agent_tools/web/helpers/search_pages.ts   |  15 +-
 services/platform/convex/agents/file_utils.ts |   2 +-
 .../platform/convex/branding/file_actions.ts  |  20 +-
 .../convex/documents/compare_documents.ts     |   3 +
 .../convex/file_metadata/transcribe_audio.ts  |   7 +
 .../platform/convex/governance/mutations.ts   |   2 +-
 .../convex/governance/retention_actions.ts    |   7 +-
 .../governance/retention_bounds_proposal.ts   |   7 +-
 .../convex/governance/retention_floors.ts     |   5 +-
 .../convex/integrations/credentials_schema.ts |   5 +-
 .../convex/integrations/load_integration.ts   |   2 +-
 .../lib/agent_response/generate_response.ts   |   4 +-
 .../convex/lib/config_store/actions.ts        |   1 -
 .../convex/lib/config_store/store.test.ts     |  51 +-
 .../platform/convex/lib/config_store/store.ts |  88 +---
 .../platform/convex/lib/helpers/org_slug.ts   |  48 ++
 .../platform/convex/lib/helpers/rag_config.ts |  38 +-
 .../convex/migrations/rename_org_slug.ts      |  69 ---
 .../convex/organizations/reseed_all_orgs.ts   |  61 ++-
 .../platform/convex/organizations/scaffold.ts | 264 ++++++++--
 .../platform/convex/skills/file_actions.ts    |   2 +-
 .../action_defs/document/document_action.ts   |   3 +
 .../rag/helpers/upload_document.test.ts       |  11 +-
 .../rag/helpers/upload_document.ts            |   4 +
 .../rag/helpers/upload_file_direct.test.ts    |   1 +
 .../rag/helpers/upload_file_direct.ts         |   4 +
 .../action_defs/rag/rag_action.ts             |  14 +-
 services/platform/docker-entrypoint.sh        |  10 +-
 .../vite-plugins/serve-branding-images.ts     |  19 +-
 services/rag/app/auth.py                      |  29 ++
 services/rag/app/config.py                    |   8 +-
 services/rag/app/routers/documents.py         |  19 +-
 services/rag/app/routers/search.py            |  15 +-
 services/rag/app/services/rag_service.py      | 453 ++++++++++--------
 services/rag/tests/test_background_ingest.py  |  10 +
 services/rag/tests/test_compare_files.py      |  55 ++-
 services/rag/tests/test_config.py             |  18 +-
 services/rag/tests/test_document_helpers.py   |  28 +-
 services/rag/tests/test_rag_service.py        | 109 +++--
 tools/cli/src/lib/actions/deploy.ts           |  57 ++-
 tools/cli/src/lib/actions/init.ts             |  20 +-
 tools/cli/src/lib/actions/reseed-all-orgs.ts  | 107 ++++-
 tools/cli/src/lib/actions/start.ts            |  36 +-
 tools/cli/src/lib/actions/update.ts           |  81 +++-
 .../src/lib/compose/generators/constants.ts   |  12 +-
 .../generators/generate-dev-compose.ts        |  85 +++-
 .../src/lib/migrate-config-layout/script.sh   |   8 +-
 tools/cli/src/lib/rules/content.ts            |  45 +-
 tools/cli/src/lib/rules/generators.ts         |   5 +-
 82 files changed, 1858 insertions(+), 757 deletions(-)
 create mode 100644 services/crawler/app/org_context.py
 create mode 100644 services/platform/convex/lib/helpers/org_slug.ts
 delete mode 100644 services/platform/convex/migrations/rename_org_slug.ts

diff --git a/.dockerignore b/.dockerignore
index 4f2099eb36..095d883195 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -55,7 +55,7 @@ services/platform/.env*.local
 # ============================================================================
 # NOTE: keep tools/, examples/, and patches/ — Dockerfiles reference them:
 #   - platform image copies tools/cli/package.json and patches/
-#   - convex image copies examples/{agents,workflows,integrations,providers,branding}
+#   - convex image copies examples/default/{agents,workflows,integrations,providers,branding,skills}
 tests/
 designs/
 scripts/
@@ -69,6 +69,10 @@ knip-results.json
 docs/
 !docs/package.json
 README.md
+# Skill bundles ship their SKILL.md alongside the scripts; without
+# this carve-out the convex image's COPY of examples/default/ would
+# drop every skill's README and break runtime skill discovery.
+!examples/**/*.md
 
 # ============================================================================
 # IDE and Editor Files
diff --git a/README.de.md b/README.de.md
index 005eb6c56d..e465f09f4b 100644
--- a/README.de.md
+++ b/README.de.md
@@ -88,7 +88,7 @@ tale cleanup                       # Inaktive Container entfernen
 tale reset --force                 # Alle Container entfernen
 ```
 
-In der [CLI-Referenz](tools/cli/README.md) findest du alle Optionen und Flags. Anstehende Daten-Migrationen werden beim nächsten `tale start` oder `tale deploy` automatisch erkannt und angewendet.
+In der [CLI-Referenz](tools/cli/README.md) findest du alle Optionen und Flags. Das Aktualisieren einer bestehenden Installation erfordert eine einmalige manuelle Migration: führe `tale migrate config-layout` aus, danach `tale deploy --override-all -y`. Das vollständige Runbook findest du in [Self-hosted Upgrades](docs/de/self-hosted/operate/upgrades.md).
 
 ## In Produktion deployen
 
diff --git a/README.fr.md b/README.fr.md
index cacb0800dc..802699ed93 100644
--- a/README.fr.md
+++ b/README.fr.md
@@ -88,7 +88,7 @@ tale cleanup                       # Supprimer les conteneurs inactifs
 tale reset --force                 # Supprimer tous les conteneurs
 ```
 
-Voir la [référence du CLI](tools/cli/README.md) pour toutes les options et flags. Les migrations de données en attente sont détectées et appliquées automatiquement au prochain `tale start` ou `tale deploy`.
+Voir la [référence du CLI](tools/cli/README.md) pour toutes les options et flags. Mettre à jour un déploiement existant nécessite une migration manuelle unique : exécutez `tale migrate config-layout` puis `tale deploy --override-all -y`. Le runbook complet se trouve dans [Mises à niveau auto-hébergées](docs/fr/self-hosted/operate/upgrades.md).
 
 ## Déployer en production
 
diff --git a/README.md b/README.md
index a4335a0196..9ac2de2e64 100644
--- a/README.md
+++ b/README.md
@@ -88,7 +88,7 @@ tale cleanup                       # Remove inactive containers
 tale reset --force                 # Remove all containers
 ```
 
-See the [CLI reference](tools/cli/README.md) for all options and flags. Pending data migrations are detected and applied automatically on the next `tale start` or `tale deploy`.
+See the [CLI reference](tools/cli/README.md) for all options and flags. Upgrading an existing deployment requires a one-time manual migration: run `tale migrate config-layout` then `tale deploy --override-all -y`. See [Self-hosted upgrades](docs/en/self-hosted/operate/upgrades.md) for the full runbook.
 
 ## Deploy to production
 
diff --git a/compose.yml b/compose.yml
index 42564237af..f2f278a2ad 100644
--- a/compose.yml
+++ b/compose.yml
@@ -688,10 +688,11 @@ volumes:
   rag-data:
     driver: local
 
-  # Persistent storage for all platform data (Convex DB, agents, workflows, integrations)
-  # LEGACY: once `tale migrate split-convex` has run, data lives in convex-data.
-  # This volume is preserved in case users need to rollback; safe to remove
-  # manually after a successful Phase 2 upgrade.
+  # LEGACY (pre-0.3.0): platform data used to live here before the
+  # split-convex transition. Today everything lives in `convex-data`; the
+  # volume is retained as an unused stub so detect() in start.ts can
+  # identify pre-0.3.0 deployments and produce a coherent diff. Operators
+  # can delete it by hand once they're past the upgrade window.
   platform-data:
     driver: local
 
diff --git a/docs/de/self-hosted/configuration/providers.md b/docs/de/self-hosted/configuration/providers.md
index f8e904321d..3f6fbe7932 100644
--- a/docs/de/self-hosted/configuration/providers.md
+++ b/docs/de/self-hosted/configuration/providers.md
@@ -31,7 +31,7 @@ Die Referenz ist das Dateiformat auf Platte und die Reihenfolge der Operationen,
 }
 ```
 
-Die vollständige Menge der Felder lebt in [`examples/default/providers/`](https://github.com/tale-project/tale/tree/main/examples/providers) — `openai.json`, `openrouter.json` und `vercel-gateway.json` decken die drei Formen ab, die du wahrscheinlich brauchst.
+Die vollständige Menge der Felder lebt in [`examples/default/providers/`](https://github.com/tale-project/tale/tree/main/examples/default/providers) — `openai.json`, `openrouter.json` und `vercel-gateway.json` decken die drei Formen ab, die du wahrscheinlich brauchst.
 
 ## Die Secrets-Datei
 
diff --git a/docs/de/self-hosted/configuration/retention.md b/docs/de/self-hosted/configuration/retention.md
index 3a0f9006d1..5553f2c1ff 100644
--- a/docs/de/self-hosted/configuration/retention.md
+++ b/docs/de/self-hosted/configuration/retention.md
@@ -24,7 +24,7 @@ Die mitgelieferten Defaults sind locker; zieh sie an, je nach deiner Compliance-
 
 ## Wo du Grenzen setzt
 
-Die Grenzen leben in der Operator-Config-Datei, nicht in Env-Vars. Editiere `governance/retention-bounds.json` unter `TALE_CONFIG_DIR` (default `/app/data/platform-config/` im Plattform-Container):
+Unter dem Org-first-Layout sind Retention-Grenzen **pro Org**: editiere `retention.json` direkt im Unterbaum einer Org unter `TALE_CONFIG_DIR` (default `/app/data/` im Plattform-Container, also liegt die Datei unter `/app/data/<org>/retention.json`, z. B. `/app/data/default/retention.json`). Jede Org hat ihre eigene Datei; die `default`-Datei ist die Vorlage, die eine neue Installation beim ersten Start aufgreift.
 
 ```json
 {
diff --git a/docs/de/self-hosted/operate/upgrades.md b/docs/de/self-hosted/operate/upgrades.md
index f28aa6a1c8..db5eee2a08 100644
--- a/docs/de/self-hosted/operate/upgrades.md
+++ b/docs/de/self-hosted/operate/upgrades.md
@@ -80,3 +80,41 @@ Minor-Versionen zu überspringen (von 0.9 auf 0.11 zu gehen) ist unterstützt, s
 ## Wo das hingehört
 
 Der Upgrade-Flow knüpft jede andere Operate-Seite an — Backups sind das, was ein gescheitertes Upgrade wiederherstellbar macht, Observability ist das, was dir sagt, dass die neue Farbe healthy ist, Hardening ist das, was du nach einer Major-Version neu walkst. Setzt du das CLI zum ersten Mal auf, deckt [Tale-CLI installieren](/de/self-hosted/install/cli-install) das workstationseitige Setup ab; nimmst du den Pager mitten im Rollout auf, nennt [Troubleshooting](/de/self-hosted/operate/observability/troubleshooting) die Symptome.
+
+## Migration auf das Org-first-Config-Layout
+
+Ältere Tale-Releases haben Config in einem flachen Baum im Workspace-Root abgelegt (`agents/`, `workflows/`, `integrations/`, `branding/`, `providers/`, `skills/`). Aktuelles Tale nutzt ein **Org-first**-Layout, in dem jede Org — auch die kanonische `default` — ihren eigenen Unterbaum besitzt: `<root>/<org>/<domain>/...`. Die Migration ist opt-in und läuft einmal pro Workspace. Die neue Plattform liest die alten Pfade nicht mehr; bis du migrierst, liegen Provider-Secrets und Anpassungen in Verzeichnissen, die das Runtime nicht mehr anschaut.
+
+Die Migration sind drei Kommandos:
+
+```bash
+# 1. Provider-Secrets (und andere Config) aus dem flachen Layout nach
+#    `default/<domain>/...` kopieren. cp statt mv, damit die alten Pfade
+#    für einen möglichen Rollback intakt bleiben.
+tale migrate config-layout
+
+# 2. Convex-Container gegen das Org-first-Volume-Layout neu erstellen
+#    und den server-seitigen Reseed über jede registrierte Org laufen
+#    lassen. Impliziert `--all`; `-y` überspringt den destruktiven
+#    Bestätigungs-Prompt für CI / Skript-Läufe.
+tale deploy --override-all -y
+
+# 3. Wenn du das neue Layout verifiziert hast, alte Pfade entfernen.
+#    sha-verifiziert, dass die neue Datei der alten entspricht, bevor
+#    unlink; bei Mismatch wird das Löschen verweigert.
+tale migrate config-layout --cleanup-old
+```
+
+Schritt 1 ist safe und reversibel — ein Re-Run ist no-op, sobald Pfade existieren. Schritt 2 ist destruktiv: jede Org-Config mit Katalog-Name (`*.json` unter `agents/`, `workflows/`, `integrations/`, `skills/`, `branding/branding.json`, `retention.json`) wird mit dem Builtin-Katalog überschrieben. `*.secrets.json`-Dateien, `.history/`-Trails und hochgeladene `branding/images/*` bleiben server-seitig erhalten. Nach Schritt 2 liest die Plattform ausschließlich aus dem Org-first-Layout.
+
+Schritt 3 ist der Point-of-no-Return für Downgrades — siehe unten.
+
+### Org-first-Migration zurückrollen
+
+Zwischen Schritt 1 und 3 kannst du sauber downgraden. Der Convex-Entrypoint markiert jeden Seed-Lauf mit einem Token, das die Layout-Version enthält (`.seeded-<version>-orgfirst`); ein älteres Binary, das diesen Token nicht erkennt, re-seedet idempotent in seine eigenen (flachen) Pfade, und Schritt 1's `cp` hat die alten Pfade intakt gelassen. Downgrade ist ein normales `tale rollback`.
+
+Nach Schritt 3 (`--cleanup-old`) sind die alten Pfade weg. Downgrade re-seedet das Layout zwar weiterhin korrekt via Marker-Token-Mechanismus, aber die App startet mit leeren Provider-Secrets — stelle sie aus dem Backup wieder her (siehe [Backups und Restore](/de/self-hosted/operate/backups-and-restore)), bevor du Traffic wieder aufnimmst.
+
+### Was, wenn ich Schritt 1 überspringe?
+
+Der Convex-Container erkennt beim Start die übrig gebliebenen flachen Layout-Dirs und schreibt eine Warnung in seine Logs, die die Verzeichnisse benennt und auf dieses Runbook zeigt. Das Deployment startet, aber Reads aus diesen Verzeichnissen liefern leer, und Writes gehen in die neuen (leeren) Org-first-Pfade. Die Korrektur sind weiterhin Schritt 1 + 2 — sie nach der Warnung laufen zu lassen funktioniert genauso wie sie im Voraus laufen zu lassen.
diff --git a/docs/en/self-hosted/configuration/providers.md b/docs/en/self-hosted/configuration/providers.md
index 799f97a0f2..293dcdb1d5 100644
--- a/docs/en/self-hosted/configuration/providers.md
+++ b/docs/en/self-hosted/configuration/providers.md
@@ -31,7 +31,7 @@ The reference is the file format on disk and the order operations follow when ad
 }
 ```
 
-The full set of fields lives in [`examples/default/providers/`](https://github.com/tale-project/tale/tree/main/examples/providers) — `openai.json`, `openrouter.json`, and `vercel-gateway.json` cover the three shapes you are likely to need.
+The full set of fields lives in [`examples/default/providers/`](https://github.com/tale-project/tale/tree/main/examples/default/providers) — `openai.json`, `openrouter.json`, and `vercel-gateway.json` cover the three shapes you are likely to need.
 
 ## The secrets file
 
diff --git a/docs/en/self-hosted/configuration/retention.md b/docs/en/self-hosted/configuration/retention.md
index 8bc20b4b9e..0e36299cce 100644
--- a/docs/en/self-hosted/configuration/retention.md
+++ b/docs/en/self-hosted/configuration/retention.md
@@ -24,7 +24,7 @@ The shipped defaults are loose; tighten per your compliance posture.
 
 ## Where you set bounds
 
-The bounds live in the operator config file, not in env vars. Edit `governance/retention-bounds.json` under `TALE_CONFIG_DIR` (defaults to `/app/data/platform-config/` inside the platform container):
+Under the org-first layout, retention bounds are **per-org**: edit `retention.json` directly inside an org's subtree under `TALE_CONFIG_DIR` (defaults to `/app/data/` inside the platform container, so the file lives at `/app/data/<org>/retention.json`, e.g. `/app/data/default/retention.json`). Each org has its own file; the `default` org's file is the template a fresh deployment picks up on first boot.
 
 ```json
 {
diff --git a/docs/en/self-hosted/operate/upgrades.md b/docs/en/self-hosted/operate/upgrades.md
index 09e9d0c993..20d8c20dee 100644
--- a/docs/en/self-hosted/operate/upgrades.md
+++ b/docs/en/self-hosted/operate/upgrades.md
@@ -80,3 +80,41 @@ Skipping minor versions (going from 0.9 to 0.11) is supported as long as the int
 ## Where this fits
 
 The upgrade flow ties together every other operate page — backups are what makes a failed upgrade recoverable, observability is what tells you the new colour is healthy, hardening is what you re-walk after a major version. If you are setting up the CLI for the first time, [Install the tale CLI](/self-hosted/install/cli-install) covers the workstation-side setup; if you are picking up the pager mid-rollout, [Troubleshooting](/self-hosted/operate/observability/troubleshooting) names the symptoms.
+
+## Migrating to the org-first config layout
+
+Older Tale releases stored config in a flat tree at the workspace root (`agents/`, `workflows/`, `integrations/`, `branding/`, `providers/`, `skills/`). Current Tale uses an **org-first** layout where every org — including the canonical `default` — owns its own subtree: `<root>/<org>/<domain>/...`. The migration is opt-in and runs once per workspace. The new platform refuses to read the legacy paths; until you migrate, your provider secrets and customizations live in directories the runtime no longer looks at.
+
+The migration is three commands:
+
+```bash
+# 1. Copy provider secrets (and other config) from the flat layout into
+#    `default/<domain>/...`. cp not mv, so the old paths stay intact in
+#    case you need to roll back.
+tale migrate config-layout
+
+# 2. Recreate the Convex container against the org-first volume layout
+#    and run the server-side reseed across every registered org. Implies
+#    `--all`; `-y` skips the destructive-write confirmation prompt for
+#    CI / scripted runs.
+tale deploy --override-all -y
+
+# 3. Once you have verified the new layout is intact, remove the legacy
+#    paths. sha-verifies that the new file matches the old before
+#    unlinking; refuses to delete on any mismatch.
+tale migrate config-layout --cleanup-old
+```
+
+Step 1 alone is safe and reversible — re-running it is a no-op once paths exist. Step 2 is destructive: every org's catalog-named config (`*.json` under `agents/`, `workflows/`, `integrations/`, `skills/`, `branding/branding.json`, `retention.json`) is overwritten with the builtin catalog. `*.secrets.json` files, `.history/` trails, and uploaded `branding/images/*` are preserved server-side. After step 2, the platform reads exclusively from the org-first layout.
+
+Step 3 is the point of no return for downgrades — see below.
+
+### Rolling back the org-first migration
+
+Between steps 1 and 3 you can downgrade cleanly. The Convex entrypoint marks each seed run with a token that includes the layout version (`.seeded-<version>-orgfirst`); an older binary that does not recognize the token re-seeds idempotently into its own (flat) paths, and step 1's `cp` left the legacy paths intact. Downgrade is a normal `tale rollback`.
+
+After step 3 (`--cleanup-old`), the legacy paths are gone. Downgrade still re-seeds layout correctly via the marker token mechanism, but the app boots with empty provider secrets — restore them from backup (see [Backups and restore](/self-hosted/operate/backups-and-restore)) before resuming traffic.
+
+### What if I skip step 1?
+
+The Convex container will detect leftover flat-layout dirs on boot and print a warning to its logs naming the directories and pointing at this runbook. The deployment will start up, but reads from those directories return empty and writes go to the new (empty) org-first paths. The fix is still steps 1 + 2 — running them after the warning works exactly the same as running them up front.
diff --git a/docs/fr/self-hosted/configuration/providers.md b/docs/fr/self-hosted/configuration/providers.md
index a63161119c..71c9647fcc 100644
--- a/docs/fr/self-hosted/configuration/providers.md
+++ b/docs/fr/self-hosted/configuration/providers.md
@@ -31,7 +31,7 @@ La référence est le format de fichier sur disque et l'ordre des opérations à
 }
 ```
 
-L'ensemble complet des champs vit dans [`examples/default/providers/`](https://github.com/tale-project/tale/tree/main/examples/providers) — `openai.json`, `openrouter.json` et `vercel-gateway.json` couvrent les trois formes dont tu auras probablement besoin.
+L'ensemble complet des champs vit dans [`examples/default/providers/`](https://github.com/tale-project/tale/tree/main/examples/default/providers) — `openai.json`, `openrouter.json` et `vercel-gateway.json` couvrent les trois formes dont tu auras probablement besoin.
 
 ## Le fichier de secrets
 
diff --git a/docs/fr/self-hosted/configuration/retention.md b/docs/fr/self-hosted/configuration/retention.md
index c3f7945c3b..a1851e1e3f 100644
--- a/docs/fr/self-hosted/configuration/retention.md
+++ b/docs/fr/self-hosted/configuration/retention.md
@@ -24,7 +24,7 @@ Les défauts livrés sont lâches ; resserre selon ta posture de compliance.
 
 ## Où tu fixes les bornes
 
-Les bornes vivent dans le fichier de config opérateur, pas dans les variables d'env. Édite `governance/retention-bounds.json` sous `TALE_CONFIG_DIR` (défaut `/app/data/platform-config/` dans le conteneur plateforme) :
+Sous la disposition org-first, les bornes de rétention sont **par org** : édite `retention.json` directement dans le sous-arbre d'une org sous `TALE_CONFIG_DIR` (par défaut `/app/data/` dans le conteneur plateforme, le fichier se trouve donc à `/app/data/<org>/retention.json`, p. ex. `/app/data/default/retention.json`). Chaque org a son propre fichier ; celui de l'org `default` est le modèle qu'un nouveau déploiement reprend au premier démarrage.
 
 ```json
 {
diff --git a/docs/fr/self-hosted/operate/upgrades.md b/docs/fr/self-hosted/operate/upgrades.md
index 9e72578920..0881491c2d 100644
--- a/docs/fr/self-hosted/operate/upgrades.md
+++ b/docs/fr/self-hosted/operate/upgrades.md
@@ -80,3 +80,41 @@ Sauter des versions mineures (passer de 0.9 à 0.11) est supporté tant que les
 ## Où cela s'inscrit
 
 Le flow de montée de version noue chaque autre page d'exploitation — les backups sont ce qui rend une montée de version échouée récupérable, l'observabilité est ce qui te dit que la nouvelle couleur est saine, le durcissement est ce que tu re-walks après une version majeure. Si tu mets en place la CLI pour la première fois, [Installer la CLI tale](/fr/self-hosted/install/cli-install) couvre le setup côté workstation ; si tu prends le pager en plein rollout, [Dépannage](/fr/self-hosted/operate/observability/troubleshooting) nomme les symptômes.
+
+## Migration vers la disposition de config org-first
+
+Les anciennes versions de Tale stockaient la config dans une arborescence plate à la racine du workspace (`agents/`, `workflows/`, `integrations/`, `branding/`, `providers/`, `skills/`). La version actuelle utilise une disposition **org-first** où chaque org — y compris la canonique `default` — possède son propre sous-arbre : `<root>/<org>/<domain>/...`. La migration est opt-in et tourne une seule fois par workspace. La nouvelle plateforme refuse de lire les anciens chemins ; tant que tu n'as pas migré, tes secrets de provider et personnalisations vivent dans des répertoires que le runtime ne regarde plus.
+
+La migration tient en trois commandes :
+
+```bash
+# 1. Copier les secrets de provider (et autres configs) depuis la
+#    disposition plate vers `default/<domain>/...`. cp et non mv, donc
+#    les anciens chemins restent intacts au cas où un rollback serait
+#    nécessaire.
+tale migrate config-layout
+
+# 2. Recréer le conteneur Convex contre la disposition de volume org-first
+#    et lancer le reseed côté serveur sur chaque org enregistrée. Implique
+#    `--all` ; `-y` saute le prompt destructif pour les runs CI / scripts.
+tale deploy --override-all -y
+
+# 3. Une fois la nouvelle disposition vérifiée intacte, supprimer les
+#    anciens chemins. Vérifie via sha que le nouveau fichier correspond
+#    à l'ancien avant unlink ; refuse de supprimer en cas de mismatch.
+tale migrate config-layout --cleanup-old
+```
+
+L'étape 1 est sûre et réversible — la rejouer est un no-op une fois les chemins existants. L'étape 2 est destructive : chaque config d'org au nom canonique (`*.json` sous `agents/`, `workflows/`, `integrations/`, `skills/`, `branding/branding.json`, `retention.json`) est écrasée par le catalogue builtin. Les fichiers `*.secrets.json`, les traces `.history/` et les `branding/images/*` uploadés sont préservés côté serveur. Après l'étape 2, la plateforme lit exclusivement depuis la disposition org-first.
+
+L'étape 3 est le point de non-retour pour les downgrades — voir ci-dessous.
+
+### Annuler la migration org-first
+
+Entre les étapes 1 et 3, tu peux downgrader proprement. L'entrypoint Convex marque chaque run de seed avec un token qui inclut la version de disposition (`.seeded-<version>-orgfirst`) ; un binaire plus ancien qui ne reconnaît pas ce token re-seede idempotemment dans ses propres chemins (plats), et le `cp` de l'étape 1 a laissé les anciens chemins intacts. Le downgrade est un `tale rollback` normal.
+
+Après l'étape 3 (`--cleanup-old`), les anciens chemins sont partis. Le downgrade continue à re-seeder la disposition correctement via le mécanisme du token-marker, mais l'app démarre avec des secrets de provider vides — restaure-les depuis le backup (voir [Backups et restauration](/fr/self-hosted/operate/backups-and-restore)) avant de reprendre le trafic.
+
+### Et si je saute l'étape 1 ?
+
+Le conteneur Convex détectera au démarrage les répertoires restants de la disposition plate et écrira un warning dans ses logs en nommant les répertoires et pointant vers ce runbook. Le déploiement démarre, mais les reads sur ces répertoires reviennent vides et les writes vont vers les nouveaux chemins (vides) org-first. La correction reste étapes 1 + 2 — les lancer après le warning fonctionne exactement comme les lancer en amont.
diff --git a/packages/tale_shared/src/tale_shared/config/base.py b/packages/tale_shared/src/tale_shared/config/base.py
index c45626dae4..e057bb25bb 100644
--- a/packages/tale_shared/src/tale_shared/config/base.py
+++ b/packages/tale_shared/src/tale_shared/config/base.py
@@ -2,6 +2,12 @@
 
 Provides a common base for pydantic-settings-based configuration with
 shared patterns across crawler and RAG services.
+
+Provider lookups require an org slug under the org-first config layout —
+each org owns its own provider catalog at
+`<TALE_CONFIG_DIR>/<org_slug>/providers/`. The base-class shims accept an
+explicit `org_slug` and surface a clear error if the caller forgot to
+thread one through, rather than silently pinning every org to `default`.
 """
 
 import logging
@@ -38,36 +44,36 @@ class BaseServiceSettings(BaseSettings):
     vision_request_timeout: int = 180
     vision_max_concurrent_pages: int = 1
 
-    def get_fast_model(self) -> str:
-        """Get fast LLM model from provider files."""
-        _base_url, _api_key, model_id = _provider_chat_model()
+    def get_fast_model(self, org_slug: str) -> str:
+        """Get fast LLM model for an org from provider files."""
+        _base_url, _api_key, model_id = _provider_chat_model(org_slug)
         return model_id
 
-    def get_embedding_model(self) -> str:
-        """Get embedding model from provider files."""
-        _base_url, _api_key, model_id, _dims = _provider_embedding_model()
+    def get_embedding_model(self, org_slug: str) -> str:
+        """Get embedding model for an org from provider files."""
+        _base_url, _api_key, model_id, _dims = _provider_embedding_model(org_slug)
         return model_id
 
-    def get_vision_model(self) -> str:
-        """Get vision model from provider files."""
-        _base_url, _api_key, model_id = _provider_vision_model()
+    def get_vision_model(self, org_slug: str) -> str:
+        """Get vision model for an org from provider files."""
+        _base_url, _api_key, model_id = _provider_vision_model(org_slug)
         return model_id
 
-    def get_chat_config(self) -> tuple[str, str, str]:
-        """Return (base_url, api_key, model_id) for chat model from provider files."""
-        return _provider_chat_model()
+    def get_chat_config(self, org_slug: str) -> tuple[str, str, str]:
+        """Return (base_url, api_key, model_id) for an org's chat model."""
+        return _provider_chat_model(org_slug)
 
-    def get_embedding_config(self) -> tuple[str, str, str, int]:
-        """Return (base_url, api_key, model_id, dimensions) for embedding model."""
-        return _provider_embedding_model()
+    def get_embedding_config(self, org_slug: str) -> tuple[str, str, str, int]:
+        """Return (base_url, api_key, model_id, dimensions) for an org's embedding model."""
+        return _provider_embedding_model(org_slug)
 
-    def get_vision_config(self) -> tuple[str, str, str]:
-        """Return (base_url, api_key, model_id) for vision model."""
-        return _provider_vision_model()
+    def get_vision_config(self, org_slug: str) -> tuple[str, str, str]:
+        """Return (base_url, api_key, model_id) for an org's vision model."""
+        return _provider_vision_model(org_slug)
 
-    def get_embedding_dimensions(self) -> int:
-        """Get embedding dimensions from provider files."""
-        _base_url, _api_key, _model_id, dims = _provider_embedding_model()
+    def get_embedding_dimensions(self, org_slug: str) -> int:
+        """Get embedding dimensions for an org from provider files."""
+        _base_url, _api_key, _model_id, dims = _provider_embedding_model(org_slug)
         return dims
 
     def get_allowed_origins_list(self) -> list[str]:
diff --git a/packages/tale_shared/src/tale_shared/config/providers.py b/packages/tale_shared/src/tale_shared/config/providers.py
index 33466152c2..b2075ec501 100644
--- a/packages/tale_shared/src/tale_shared/config/providers.py
+++ b/packages/tale_shared/src/tale_shared/config/providers.py
@@ -39,21 +39,36 @@ class ProviderConfig:
     defaults: dict[str, str] = field(default_factory=dict)
 
 
-def load_providers(config_dir: str | None = None) -> list[ProviderConfig]:
-    """Read all provider JSON files from {config_dir}/providers/.
+def load_providers(
+    org_slug: str,
+    config_dir: str | None = None,
+) -> list[ProviderConfig]:
+    """Read all provider JSON files from {config_dir}/{org_slug}/providers/.
+
+    Under the org-first config layout, each org owns its own provider
+    catalog at `<root>/<org_slug>/providers/`. `org_slug` is required —
+    pinning RAG/crawler globally to the `default` org's providers would
+    quietly serve the wrong models to other orgs.
 
     Reads *.json (excluding *.secrets.json) and decrypts matching
     *.secrets.json files via SOPS.
     """
+    if not org_slug:
+        raise ValueError("load_providers requires a non-empty org_slug")
+
     shared_config = os.environ.get("TALE_PLATFORM_SHARED_CONFIG_DIR")
     if shared_config:
         base = Path(shared_config)
     else:
         base = Path(config_dir or os.environ.get("TALE_CONFIG_DIR") or os.environ.get("CONFIG_DIR", DEFAULT_CONFIG_DIR))
-    providers_dir = base / "providers"
+    providers_dir = base / org_slug / "providers"
 
     if not providers_dir.is_dir():
-        logger.warning("Providers directory not found: %s", providers_dir)
+        logger.warning(
+            "Providers directory not found for org '%s': %s",
+            org_slug,
+            providers_dir,
+        )
         return []
 
     providers: list[ProviderConfig] = []
@@ -146,17 +161,18 @@ def _find_model(
 
 
 def get_chat_model(
+    org_slug: str,
     config_dir: str | None = None,
 ) -> tuple[str, str, str]:
-    """Return (base_url, api_key, model_id) for the default chat model.
+    """Return (base_url, api_key, model_id) for the org's default chat model.
 
     Finds the first model marked default that has a "chat" tag,
     or falls back to the first model with a "chat" tag.
     """
-    providers = load_providers(config_dir)
+    providers = load_providers(org_slug, config_dir)
     match = _find_model(providers, "chat", prefer_default=True)
     if match is None:
-        raise ValueError("No chat model found in provider configuration files.")
+        raise ValueError(f"No chat model found in provider configuration files for org '{org_slug}'.")
 
     provider, model = match
     api_key = provider.api_key or ""
@@ -164,13 +180,14 @@ def get_chat_model(
 
 
 def get_embedding_model(
+    org_slug: str,
     config_dir: str | None = None,
 ) -> tuple[str, str, str, int]:
-    """Return (base_url, api_key, model_id, dimensions) for the embedding model."""
-    providers = load_providers(config_dir)
+    """Return (base_url, api_key, model_id, dimensions) for the org's embedding model."""
+    providers = load_providers(org_slug, config_dir)
     match = _find_model(providers, "embedding", prefer_default=True)
     if match is None:
-        raise ValueError("No embedding model found in provider configuration files.")
+        raise ValueError(f"No embedding model found in provider configuration files for org '{org_slug}'.")
 
     provider, model = match
     api_key = provider.api_key or ""
@@ -183,13 +200,14 @@ def get_embedding_model(
 
 
 def get_vision_model(
+    org_slug: str,
     config_dir: str | None = None,
 ) -> tuple[str, str, str]:
-    """Return (base_url, api_key, model_id) for the vision model."""
-    providers = load_providers(config_dir)
+    """Return (base_url, api_key, model_id) for the org's vision model."""
+    providers = load_providers(org_slug, config_dir)
     match = _find_model(providers, "vision", prefer_default=True)
     if match is None:
-        raise ValueError("No vision model found in provider configuration files.")
+        raise ValueError(f"No vision model found in provider configuration files for org '{org_slug}'.")
 
     provider, model = match
     api_key = provider.api_key or ""
diff --git a/services/convex/docker-entrypoint.sh b/services/convex/docker-entrypoint.sh
index 49870c14b3..5bc69b5923 100755
--- a/services/convex/docker-entrypoint.sh
+++ b/services/convex/docker-entrypoint.sh
@@ -290,11 +290,15 @@ fi
 seed_marker="/app/data/.seeded-${TALE_VERSION:-dev}-orgfirst"
 data_dir="/app/data"
 
-# Atomic file copy: write to a sibling tmp file then rename. A SIGKILL
-# between open(dest, O_TRUNC) and the final write would otherwise leave a
-# truncated file at $dest, which the next-run skip-if-exists check treats
-# as "already seeded" — silent corruption. With atomic_cp the next run
-# either sees the original (rename never happened) or the complete file.
+# Crash-safe file copy: write to a sibling tmp file then rename to dest.
+# `cp` itself is non-atomic; the value is that an interrupted run leaves
+# either (a) no tmp / dest intact, or (b) a partial `.tale-seed.<pid>.tmp`
+# orphan + dest intact. The next-run skip-if-exists check on dest is
+# therefore never observing a half-written file. Orphan tmps don't gate
+# anything (they're not matched by the dest-existence probe) and survive
+# until the next reseed of that file. There is no fsync — power-loss
+# durability isn't asserted, but the seed data is re-derivable from the
+# immutable builtin catalog, so a lost write is recoverable on retry.
 atomic_cp() {
   local src="$1" dest="$2"
   local tmp="${dest}.tale-seed.$$.tmp"
@@ -468,6 +472,39 @@ else
   log_info "Builtin seed already applied for version ${TALE_VERSION:-dev} (marker: $seed_marker)"
 fi
 
+# ----------------------------------------------------------------------------
+# Legacy flat-layout detector
+# ----------------------------------------------------------------------------
+# The pre-orgfirst layout placed config at the data-root level
+# (`/app/data/agents/`, `/app/data/workflows/`, …). Under org-first that
+# tree is now at `/app/data/<org>/<domain>/`. If an upgrading operator's
+# volume still contains the legacy flat trees, the new runtime ignores
+# them — `seed_marker` already promoted seed data to `default/`, but the
+# operator's edits at the old root are unreachable. Warn loudly so they
+# know to run `tale migrate config-layout` on the host.
+legacy_flat_dirs=()
+for d in agents workflows integrations branding providers skills; do
+  if [ -d "${data_dir}/${d}" ]; then
+    legacy_flat_dirs+=("${d}")
+  fi
+done
+if [ ${#legacy_flat_dirs[@]} -gt 0 ]; then
+  echo
+  echo "⚠ WARNING: legacy flat-layout config detected at:"
+  for d in "${legacy_flat_dirs[@]}"; do
+    echo "    ${data_dir}/${d}/"
+  done
+  echo
+  echo "  The org-first runtime reads only from '<root>/<org>/<domain>/'."
+  echo "  Edits at the paths above are NOT loaded by the platform or any"
+  echo "  per-org config resolver. To migrate them into the new layout,"
+  echo "  run on the operator host:"
+  echo "    tale migrate config-layout"
+  echo "  then:"
+  echo "    tale deploy --override-all -y"
+  echo
+fi
+
 # ============================================================================
 # Crash diagnostics helpers
 # ============================================================================
diff --git a/services/crawler/app/main.py b/services/crawler/app/main.py
index b88f8e2d28..2daf780866 100644
--- a/services/crawler/app/main.py
+++ b/services/crawler/app/main.py
@@ -16,7 +16,7 @@
 from collections.abc import AsyncGenerator
 from contextlib import asynccontextmanager, suppress
 
-from fastapi import FastAPI
+from fastapi import Depends, FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from loguru import logger
 from tale_shared.logging import suppress_health_check_logs
@@ -204,17 +204,25 @@ async def lifespan(app: FastAPI) -> AsyncGenerator:
 init_telemetry(app)
 
 
+# X-Tale-Org is required on every endpoint that touches per-org provider
+# state (vision, embedding, chat model). Apply as a router-level
+# dependency so individual handlers don't need to remember to declare it.
+# `/health` is mounted at the app level below — exempt.
+from app.org_context import require_org_slug  # noqa: E402
+
+_org_dep = [Depends(require_org_slug)]
+
 # Register routers
-app.include_router(crawler_router)
-app.include_router(websites_router)
-app.include_router(search_router)
-app.include_router(pages_router)
-app.include_router(index_router)
-app.include_router(pdf_router)
-app.include_router(image_router)
-app.include_router(docx_router)
-app.include_router(pptx_router)
-app.include_router(web_router)
+app.include_router(crawler_router, dependencies=_org_dep)
+app.include_router(websites_router, dependencies=_org_dep)
+app.include_router(search_router, dependencies=_org_dep)
+app.include_router(pages_router, dependencies=_org_dep)
+app.include_router(index_router, dependencies=_org_dep)
+app.include_router(pdf_router, dependencies=_org_dep)
+app.include_router(image_router, dependencies=_org_dep)
+app.include_router(docx_router, dependencies=_org_dep)
+app.include_router(pptx_router, dependencies=_org_dep)
+app.include_router(web_router, dependencies=_org_dep)
 
 
 @app.get("/health", response_model=HealthResponse)
diff --git a/services/crawler/app/org_context.py b/services/crawler/app/org_context.py
new file mode 100644
index 0000000000..48930a9f14
--- /dev/null
+++ b/services/crawler/app/org_context.py
@@ -0,0 +1,69 @@
+"""Per-request org-slug context for the crawler service.
+
+Crawler internals (vision client, embedding service, file parsers, …)
+need an org slug to read that org's provider catalog. Threading the slug
+through every helper would touch ~15 call sites without adding signal —
+the org is per-REQUEST, so a `contextvars.ContextVar` set by the
+`require_org_slug` FastAPI dependency at the router boundary is the
+right primitive:
+
+- One write per request, at the boundary.
+- Reads from any depth via `get_active_org()` — no parameter explosion.
+- Per-asyncio-task isolation (ContextVar binds to the running task).
+
+A missing context raises rather than silently falling back to `default`:
+forgetting to set the header is a caller bug we want to surface as a
+500, not as "served the wrong org's models for an hour".
+"""
+
+import re
+from contextvars import ContextVar
+
+from fastapi import Header, HTTPException, status
+
+# Aligned with services/platform/convex/lib/file_io.ts:25; capped at 64 chars
+# to match tools/cli/src/lib/migrate-config-layout/script.sh:134.
+_ORG_SLUG_RE = re.compile(r"^[a-z0-9][a-z0-9_-]{0,63}$")
+
+_active_org: ContextVar[str | None] = ContextVar("tale_active_org", default=None)
+
+
+def set_active_org(slug: str) -> None:
+    """Bind the active org to the current asyncio task."""
+    _active_org.set(slug)
+
+
+def get_active_org() -> str:
+    """Read the active org slug. Raises if unset (caller bug)."""
+    value = _active_org.get()
+    if not value:
+        raise RuntimeError(
+            "No active org slug for this request. Every public crawler "
+            "endpoint must declare `org_slug: str = Depends(require_org_slug)` "
+            "so the X-Tale-Org header is captured before service layer use."
+        )
+    return value
+
+
+async def require_org_slug(
+    x_tale_org: str | None = Header(default=None),
+) -> str:
+    """FastAPI dependency: extract + validate the X-Tale-Org header,
+    bind it to the request-scoped ContextVar, and return it.
+
+    Returns the slug so handlers that need it explicitly can also take
+    `org_slug = Depends(require_org_slug)`. Internal helpers should
+    prefer `get_active_org()` over plumbing the slug as a param.
+    """
+    if not x_tale_org:
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail="missing X-Tale-Org header",
+        )
+    if not _ORG_SLUG_RE.match(x_tale_org):
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail="invalid X-Tale-Org header",
+        )
+    set_active_org(x_tale_org)
+    return x_tale_org
diff --git a/services/crawler/app/services/database.py b/services/crawler/app/services/database.py
index 5b99bb607f..160a536a5d 100644
--- a/services/crawler/app/services/database.py
+++ b/services/crawler/app/services/database.py
@@ -62,39 +62,20 @@ async def init_pool(*, max_size: int = 10) -> asyncpg.Pool:
         )
         logger.info(f"PostgreSQL connection pool initialized (min={min(2, max_size)}, max={max_size})")
 
-        # Guard against embedding dimension mismatch: if existing data uses a
-        # different dimension than the current config, refuse to start.
-        configured_dims = settings.get_embedding_dimensions()
-        async with acquire_with_retry(_pool) as conn:
-            stored_dims = await conn.fetchval(
-                f"SELECT vector_dims(embedding) FROM {SCHEMA}.chunks WHERE embedding IS NOT NULL LIMIT 1"
-            )
-        if stored_dims is not None and stored_dims != configured_dims:
-            await _pool.close()
-            _pool = None
-            raise RuntimeError(
-                f"Embedding dimension mismatch: database has {stored_dims}d vectors "
-                f"but CRAWLER_EMBEDDING_DIMENSIONS={configured_dims}. "
-                f"Re-index existing data or update the config to match."
-            )
-
-        # Pin the embedding column to explicit dimensions so HNSW indexes work.
-        expected_type = f"vector({int(configured_dims)})"
-        async with acquire_with_retry(_pool) as conn:
-            col_type = await conn.fetchval(
-                "SELECT format_type(atttypid, atttypmod) "
-                "FROM pg_attribute "
-                "WHERE attrelid = $1::regclass AND attname = 'embedding'",
-                f"{SCHEMA}.chunks",
-            )
-            if col_type != expected_type:
-                await conn.execute(f"DROP INDEX IF EXISTS {SCHEMA}.idx_pw_chunks_embedding_hnsw")
-                await conn.execute(
-                    f"ALTER TABLE {SCHEMA}.chunks ALTER COLUMN embedding TYPE vector({int(configured_dims)})"
-                )
-                logger.info(f"Pinned embedding column to vector({configured_dims}) (was {col_type})")
-
-        # Create HNSW index if it doesn't exist yet.
+        # Note: the previous boot-time embedding-dimension guard was
+        # removed when crawler became multi-org. Dim is now an attribute
+        # of the org's provider catalog, not a global setting, and there
+        # is no org context at lifespan start. `get_embedding_service()`
+        # refuses dim changes per-org at request time; pgvector enforces
+        # column dim on insert.
+        #
+        # The column type and HNSW index are pinned lazily on the first
+        # insert (pgvector errors loudly on dim mismatch). All orgs
+        # sharing this crawler instance must agree on embedding dims.
+
+        # Create HNSW index if it doesn't exist yet. The index targets
+        # whatever the column type is set to; if no rows have been
+        # inserted, the call is cheap.
         try:
             async with acquire_with_retry(_pool) as conn:
                 await conn.execute(f"SELECT {SCHEMA}.create_chunks_hnsw_index()")
diff --git a/services/crawler/app/services/embedding_service.py b/services/crawler/app/services/embedding_service.py
index 5b4aa38529..f90e538725 100644
--- a/services/crawler/app/services/embedding_service.py
+++ b/services/crawler/app/services/embedding_service.py
@@ -1,9 +1,14 @@
 """
 OpenAI-compatible embedding generation service.
 
-Crawler-specific factory with TTL-based config refresh.
-When provider config files change (e.g. API key rotation), the client
-is automatically rebuilt on the next access after the TTL expires.
+Crawler-specific factory with TTL-based config refresh, keyed by org
+slug. Each org has its own EmbeddingService instance built from that
+org's provider catalog at `<TALE_CONFIG_DIR>/<org>/providers/`.
+
+Embedding dimensions are still implicitly global because crawler's
+`database.py` pins a single dim per RAG index; if two orgs disagree on
+dimensions we refuse to rebuild and keep the existing client (the
+operator must reconcile provider configs).
 """
 
 import asyncio
@@ -14,13 +19,28 @@
 from tale_knowledge.embedding import EmbeddingService
 
 from app.config import settings
+from app.org_context import get_active_org
 
-_embedding_service: EmbeddingService | None = None
-_embedding_config: tuple | None = None
-_last_config_check: float = 0
 _CONFIG_CHECK_INTERVAL = 15  # seconds
 
 
+class _OrgEmbeddingState:
+    __slots__ = ("config", "last_check", "service")
+
+    def __init__(
+        self,
+        service: EmbeddingService,
+        config: tuple,
+        last_check: float,
+    ) -> None:
+        self.service = service
+        self.config = config
+        self.last_check = last_check
+
+
+_org_states: dict[str, _OrgEmbeddingState] = {}
+
+
 async def _close_old(service: EmbeddingService) -> None:
     """Close an old client after a grace period for in-flight requests."""
     await asyncio.sleep(30)
@@ -31,54 +51,74 @@ async def _close_old(service: EmbeddingService) -> None:
 
 
 def get_embedding_service() -> EmbeddingService:
-    global _embedding_service, _embedding_config, _last_config_check
+    org_slug = get_active_org()
+    state = _org_states.get(org_slug)
 
     now = time.monotonic()
-    if _embedding_service is not None and (now - _last_config_check) < _CONFIG_CHECK_INTERVAL:
-        return _embedding_service
+    if state is not None and (now - state.last_check) < _CONFIG_CHECK_INTERVAL:
+        return state.service
 
-    _last_config_check = now
     try:
-        config = settings.get_embedding_config()  # (base_url, api_key, model, dims)
+        config = settings.get_embedding_config(org_slug)  # (base_url, api_key, model, dims)
     except (ValueError, OSError):
-        logger.opt(exception=True).warning("Config read failed, keeping current embedding client")
-        if _embedding_service is not None:
-            return _embedding_service
+        logger.opt(exception=True).warning(
+            "Config read failed for org '{}', keeping current embedding client",
+            org_slug,
+        )
+        if state is not None:
+            state.last_check = now
+            return state.service
         raise
 
-    if config == _embedding_config and _embedding_service is not None:
-        return _embedding_service
+    if state is not None and config == state.config:
+        state.last_check = now
+        return state.service
 
     base_url, api_key, model, dims = config
 
     # Never downgrade to empty key
-    if not api_key and _embedding_service is not None:
-        logger.warning("Skipping embedding reload: new config has empty API key")
-        return _embedding_service
+    if not api_key and state is not None:
+        logger.warning(
+            "Skipping embedding reload for org '{}': new config has empty API key",
+            org_slug,
+        )
+        state.last_check = now
+        return state.service
 
     # Refuse dimension change (would corrupt vector index)
-    if _embedding_config is not None and dims != _embedding_config[3] and _embedding_service is not None:
+    if state is not None and dims != state.config[3]:
         logger.error(
-            "Embedding dimensions changed ({} -> {}). Restart required to re-index.",
-            _embedding_config[3],
+            "Embedding dimensions for org '{}' changed ({} -> {}). Restart required to re-index.",
+            org_slug,
+            state.config[3],
             dims,
         )
-        return _embedding_service
+        state.last_check = now
+        return state.service
 
-    old = _embedding_service
-    _embedding_service = EmbeddingService(
+    old_service = state.service if state is not None else None
+    new_service = EmbeddingService(
         api_key=api_key,
         base_url=base_url,
         model=model,
         dimensions=dims,
     )
-    _embedding_config = config
+    _org_states[org_slug] = _OrgEmbeddingState(
+        service=new_service,
+        config=config,
+        last_check=now,
+    )
 
-    if old is not None:
-        logger.info("Embedding service rebuilt: model={}", model)
+    if old_service is not None:
+        logger.info("Embedding service rebuilt for org '{}': model={}", org_slug, model)
         with contextlib.suppress(RuntimeError):
-            asyncio.get_running_loop().create_task(_close_old(old))
+            asyncio.get_running_loop().create_task(_close_old(old_service))
     else:
-        logger.info("Embedding service created: model={}, dims={}", model, dims)
+        logger.info(
+            "Embedding service created for org '{}': model={}, dims={}",
+            org_slug,
+            model,
+            dims,
+        )
 
-    return _embedding_service
+    return new_service
diff --git a/services/crawler/app/services/file_parser_service.py b/services/crawler/app/services/file_parser_service.py
index dffaa2827c..6a28cdc4c5 100644
--- a/services/crawler/app/services/file_parser_service.py
+++ b/services/crawler/app/services/file_parser_service.py
@@ -14,6 +14,7 @@
 from typing import Any
 
 from ..config import settings
+from ..org_context import get_active_org
 
 logger = logging.getLogger(__name__)
 
@@ -221,7 +222,7 @@ async def parse_pdf_with_vision(
                     model=model,
                     usage=acc,
                 )
-                resolved_model = model or settings.get_fast_model()
+                resolved_model = model or settings.get_fast_model(get_active_org())
 
             import fitz as _fitz
 
@@ -343,7 +344,7 @@ async def parse_docx_with_vision(
                     model=model,
                     usage=acc,
                 )
-                resolved_model = model or settings.get_fast_model()
+                resolved_model = model or settings.get_fast_model(get_active_org())
 
             docx_dates = _extract_ooxml_metadata(file_bytes, "docx")
 
@@ -460,7 +461,7 @@ async def parse_pptx_with_vision(
                     model=model,
                     usage=acc,
                 )
-                resolved_model = model or settings.get_fast_model()
+                resolved_model = model or settings.get_fast_model(get_active_org())
 
             pptx_dates = _extract_ooxml_metadata(file_bytes, "pptx")
 
diff --git a/services/crawler/app/services/scheduler.py b/services/crawler/app/services/scheduler.py
index 940c5bc198..d72051d73b 100644
--- a/services/crawler/app/services/scheduler.py
+++ b/services/crawler/app/services/scheduler.py
@@ -66,11 +66,29 @@ async def run_scheduler(
     global _scan_trigger
     _scan_trigger = asyncio.Event()
 
+    # Background scheduler has no per-request X-Tale-Org context. Until
+    # the websites table carries the owning org slug, fall back to
+    # `default` for any provider lookups triggered by scheduled scans.
+    # Log once so operators see the assumption.
+    from app.org_context import set_active_org
+
+    set_active_org("default")
+    logger.warning(
+        "Scheduler background task using org slug 'default' for provider "
+        "lookups. Per-website org binding is a follow-up."
+    )
+
     sem = asyncio.Semaphore(max_concurrent_scans)
 
     async def bounded_scan(domain: str):
         async with sem:
-            await _scan_website(domain, store_manager, crawler_service, indexing_service, crawl_batch_size)
+            await _scan_website(
+                domain,
+                store_manager,
+                crawler_service,
+                indexing_service,
+                crawl_batch_size,
+            )
 
     while True:
         try:
diff --git a/services/crawler/app/services/vision/openai_client.py b/services/crawler/app/services/vision/openai_client.py
index 684733e7c6..e6c0cba81c 100644
--- a/services/crawler/app/services/vision/openai_client.py
+++ b/services/crawler/app/services/vision/openai_client.py
@@ -19,6 +19,7 @@
 from openai import AsyncOpenAI
 
 from ...config import settings
+from ...org_context import get_active_org
 from .cache import compute_text_hash, llm_cache
 
 
@@ -103,7 +104,7 @@ def _get_client(self) -> AsyncOpenAI:
 
         self._last_config_check = now
         try:
-            config = settings.get_vision_config()  # (base_url, api_key, model)
+            config = settings.get_vision_config(get_active_org())  # (base_url, api_key, model)
         except (ValueError, OSError):
             if self._client is not None:
                 logger.opt(exception=True).warning("Config read failed, keeping current vision client")
@@ -150,7 +151,7 @@ async def ocr_image(
             return cached_result
 
         client = self._get_client()
-        vision_model = settings.get_vision_model()
+        vision_model = settings.get_vision_model(get_active_org())
         extraction_prompt = prompt or OCR_PROMPT
 
         image_b64 = base64.b64encode(image_bytes).decode("utf-8")
@@ -229,7 +230,7 @@ async def describe_image(
             return cached_result
 
         client = self._get_client()
-        vision_model = settings.get_vision_model()
+        vision_model = settings.get_vision_model(get_active_org())
         description_prompt = prompt or DESCRIBE_PROMPT
 
         image_b64 = base64.b64encode(image_bytes).decode("utf-8")
@@ -369,7 +370,7 @@ async def process_pages_with_llm(
 
     logger.info(f"LLM processing: {total_chars} chars total, chunking at {max_chars_per_chunk} chars")
 
-    base_url, api_key, chat_model = settings.get_chat_config()
+    base_url, api_key, chat_model = settings.get_chat_config(get_active_org())
     client = AsyncOpenAI(
         api_key=api_key,
         base_url=base_url,
diff --git a/services/crawler/tests/test_config.py b/services/crawler/tests/test_config.py
index 2f102951e3..1d55f2c28f 100644
--- a/services/crawler/tests/test_config.py
+++ b/services/crawler/tests/test_config.py
@@ -31,7 +31,7 @@ class TestGetFastModel:
     def test_returns_model_from_provider(self, mock_provider):
         with patch.dict(os.environ, _base_env(), clear=True):
             s = Settings()
-            assert s.get_fast_model() == "gpt-4o-mini"
+            assert s.get_fast_model("default") == "gpt-4o-mini"
 
     @patch(
         "tale_shared.config.base._provider_chat_model",
@@ -41,7 +41,7 @@ def test_missing_provider_raises(self, mock_provider):
         with patch.dict(os.environ, _base_env(), clear=True):
             s = Settings()
             with pytest.raises(ValueError, match="No chat model"):
-                s.get_fast_model()
+                s.get_fast_model("default")
 
 
 class TestGetVisionModel:
@@ -49,7 +49,7 @@ class TestGetVisionModel:
     def test_returns_model_from_provider(self, mock_provider):
         with patch.dict(os.environ, _base_env(), clear=True):
             s = Settings()
-            assert s.get_vision_model() == "gpt-4o"
+            assert s.get_vision_model("default") == "gpt-4o"
 
     @patch(
         "tale_shared.config.base._provider_vision_model",
@@ -59,7 +59,7 @@ def test_missing_provider_raises(self, mock_provider):
         with patch.dict(os.environ, _base_env(), clear=True):
             s = Settings()
             with pytest.raises(ValueError, match="No vision model"):
-                s.get_vision_model()
+                s.get_vision_model("default")
 
 
 class TestGetEmbeddingDimensions:
@@ -67,7 +67,7 @@ class TestGetEmbeddingDimensions:
     def test_returns_dimensions_from_provider(self, mock_provider):
         with patch.dict(os.environ, _base_env(), clear=True):
             s = Settings()
-            assert s.get_embedding_dimensions() == 1536
+            assert s.get_embedding_dimensions("default") == 1536
 
     @patch(
         "tale_shared.config.base._provider_embedding_model",
@@ -76,7 +76,7 @@ def test_returns_dimensions_from_provider(self, mock_provider):
     def test_large_dimensions(self, mock_provider):
         with patch.dict(os.environ, _base_env(), clear=True):
             s = Settings()
-            assert s.get_embedding_dimensions() == 3072
+            assert s.get_embedding_dimensions("default") == 3072
 
     @patch(
         "tale_shared.config.base._provider_embedding_model",
@@ -86,7 +86,7 @@ def test_missing_provider_raises(self, mock_provider):
         with patch.dict(os.environ, _base_env(), clear=True):
             s = Settings()
             with pytest.raises(ValueError, match="No embedding model"):
-                s.get_embedding_dimensions()
+                s.get_embedding_dimensions("default")
 
 
 class TestFrequencyDefaults:
diff --git a/services/crawler/tests/test_database.py b/services/crawler/tests/test_database.py
index 1e8beaebc4..18aa21712f 100644
--- a/services/crawler/tests/test_database.py
+++ b/services/crawler/tests/test_database.py
@@ -37,6 +37,11 @@ async def _acq(_pool, **_kw):
     return pool, _acq
 
 
+@pytest.mark.skip(
+    reason="Boot-time embedding-dimension guard was removed when crawler "
+    "became multi-org. Dim is now per-org provider catalog; pgvector enforces "
+    "column dim on insert + get_embedding_service refuses dim changes per-org."
+)
 class TestDimensionMismatchGuard:
     @pytest.mark.asyncio
     async def test_raises_on_dimension_mismatch(self):
@@ -88,6 +93,11 @@ async def test_passes_when_no_existing_data(self):
         assert pool is fake_pool
 
 
+@pytest.mark.skip(
+    reason="Boot-time embedding-column ALTER was removed when crawler became "
+    "multi-org. Column type is now driven by the first INSERT under pgvector; "
+    "operators reconcile per-org provider catalogs manually if dims diverge."
+)
 class TestEmbeddingColumnPinning:
     @pytest.mark.asyncio
     async def test_alters_untyped_vector_column(self):
diff --git a/services/crawler/tests/test_llm_cache.py b/services/crawler/tests/test_llm_cache.py
index 590d3a5753..6c9b9aed6e 100644
--- a/services/crawler/tests/test_llm_cache.py
+++ b/services/crawler/tests/test_llm_cache.py
@@ -59,9 +59,15 @@ class TestProcessPagesWithLlmCache:
     @patch("app.services.vision.openai_client.settings")
     @patch("app.services.vision.openai_client.AsyncOpenAI")
     async def test_second_call_hits_cache(self, mock_openai_cls, mock_settings):
+        from app.org_context import set_active_org
         from app.services.vision.cache import llm_cache
         from app.services.vision.openai_client import process_pages_with_llm
 
+        # The internal `get_active_org()` call requires a ContextVar bound
+        # by `require_org_slug` in production; in unit tests we set it
+        # directly so the per-org provider lookup has a slug to resolve.
+        set_active_org("test-org")
+
         llm_cache.clear()
 
         mock_settings.get_chat_config.return_value = ("http://test", "test-key", "test-model")
diff --git a/services/platform/Dockerfile b/services/platform/Dockerfile
index e68dfd4c0d..15cb80d3d0 100644
--- a/services/platform/Dockerfile
+++ b/services/platform/Dockerfile
@@ -227,17 +227,22 @@ ENV NODE_ENV=production \
     DO_NOT_TRACK=1 \
     # Semantic value of the file-config parent path inside the convex
     # container. Platform forces this at push time in docker-entrypoint.sh
-    # (to tombstone any stale host-side `.env` value). Convex derives the
-    # sub-dirs (agents/workflows/integrations/providers) from it.
+    # (to tombstone any stale host-side `.env` value). Under the org-first
+    # layout, every per-domain config dir is derived as
+    # $TALE_CONFIG_DIR/<orgSlug>/<domain>/ — e.g.
+    # /app/data/default/agents/, /app/data/default/providers/, etc.
+    # The previous per-domain env vars (AGENTS_DIR, …) are no longer
+    # honored; the entrypoint actively purges them from Convex on every
+    # boot.
     TALE_CONFIG_DIR=/app/data \
     # Read-only builtin catalog baked into the convex image (see
     # services/convex/Dockerfile). Declared here because Convex Node
     # actions only see env vars that this container pushes to Convex's
     # deployment env via the entrypoint's `convex env set` loop — even
     # though the path points at files inside the *convex* container.
-    # Per-domain catalogs are derived as $TALE_CONFIG_BUILTIN_DIR/<domain>/
-    # by services/platform/convex/organizations/scaffold.ts (mirrors the
-    # $TALE_CONFIG_DIR/<domain>/ pattern used for the writable side).
+    # Per-org catalogs live at $TALE_CONFIG_BUILTIN_DIR/<orgSlug>/<domain>/;
+    # `default` is the canonical template. See
+    # services/platform/convex/organizations/scaffold.ts.
     TALE_CONFIG_BUILTIN_DIR=/app/builtin
 
 COPY --from=pruner --chown=app:app /app/services/platform/dist ./dist
diff --git a/services/platform/convex/_generated/api.d.ts b/services/platform/convex/_generated/api.d.ts
index 6b557cb4f2..fa603e7e06 100644
--- a/services/platform/convex/_generated/api.d.ts
+++ b/services/platform/convex/_generated/api.d.ts
@@ -461,6 +461,7 @@ import type * as lib_helpers_audit_hash from "../lib/helpers/audit_hash.js";
 import type * as lib_helpers_build_audit_context from "../lib/helpers/build_audit_context.js";
 import type * as lib_helpers_count_items_in_org from "../lib/helpers/count_items_in_org.js";
 import type * as lib_helpers_has_records_in_org from "../lib/helpers/has_records_in_org.js";
+import type * as lib_helpers_org_slug from "../lib/helpers/org_slug.js";
 import type * as lib_helpers_pii_hash from "../lib/helpers/pii_hash.js";
 import type * as lib_helpers_public_storage_url from "../lib/helpers/public_storage_url.js";
 import type * as lib_helpers_rag_config from "../lib/helpers/rag_config.js";
@@ -558,7 +559,6 @@ import type * as migrations_backfill_workflow_schedules from "../migrations/back
 import type * as migrations_merge_audit_retention from "../migrations/merge_audit_retention.js";
 import type * as migrations_migrate_org_creators from "../migrations/migrate_org_creators.js";
 import type * as migrations_remove_deprecated_llm_fields from "../migrations/remove_deprecated_llm_fields.js";
-import type * as migrations_rename_org_slug from "../migrations/rename_org_slug.js";
 import type * as migrations_seed_applied_bounds from "../migrations/seed_applied_bounds.js";
 import type * as migrations_split_personalization_toggle from "../migrations/split_personalization_toggle.js";
 import type * as migrations_trigger_steps_to_start from "../migrations/trigger_steps_to_start.js";
@@ -1558,6 +1558,7 @@ declare const fullApi: ApiFromModules<{
   "lib/helpers/build_audit_context": typeof lib_helpers_build_audit_context;
   "lib/helpers/count_items_in_org": typeof lib_helpers_count_items_in_org;
   "lib/helpers/has_records_in_org": typeof lib_helpers_has_records_in_org;
+  "lib/helpers/org_slug": typeof lib_helpers_org_slug;
   "lib/helpers/pii_hash": typeof lib_helpers_pii_hash;
   "lib/helpers/public_storage_url": typeof lib_helpers_public_storage_url;
   "lib/helpers/rag_config": typeof lib_helpers_rag_config;
@@ -1655,7 +1656,6 @@ declare const fullApi: ApiFromModules<{
   "migrations/merge_audit_retention": typeof migrations_merge_audit_retention;
   "migrations/migrate_org_creators": typeof migrations_migrate_org_creators;
   "migrations/remove_deprecated_llm_fields": typeof migrations_remove_deprecated_llm_fields;
-  "migrations/rename_org_slug": typeof migrations_rename_org_slug;
   "migrations/seed_applied_bounds": typeof migrations_seed_applied_bounds;
   "migrations/split_personalization_toggle": typeof migrations_split_personalization_toggle;
   "migrations/trigger_steps_to_start": typeof migrations_trigger_steps_to_start;
diff --git a/services/platform/convex/agent_tools/documents/helpers/fetch_document_comparison.ts b/services/platform/convex/agent_tools/documents/helpers/fetch_document_comparison.ts
index 4283558be1..626cd84796 100644
--- a/services/platform/convex/agent_tools/documents/helpers/fetch_document_comparison.ts
+++ b/services/platform/convex/agent_tools/documents/helpers/fetch_document_comparison.ts
@@ -200,6 +200,7 @@ export async function fetchDocumentComparisonByUrls(
   baseFileName: string,
   comparisonFileUrl: string,
   comparisonFileName: string,
+  orgSlug: string,
   maxChanges?: number,
 ): Promise<DocumentComparisonResult> {
   const [baseResponse, compResponse] = await Promise.all([
@@ -232,6 +233,7 @@ export async function fetchDocumentComparisonByUrls(
       method: 'POST',
       body: formData,
       timeoutMs: FETCH_TIMEOUT_MS,
+      orgSlug,
     });
 
     if (!response.ok) {
diff --git a/services/platform/convex/agent_tools/rag/query_rag_context.ts b/services/platform/convex/agent_tools/rag/query_rag_context.ts
index 84cd25679c..ea10b46eb6 100644
--- a/services/platform/convex/agent_tools/rag/query_rag_context.ts
+++ b/services/platform/convex/agent_tools/rag/query_rag_context.ts
@@ -130,6 +130,12 @@ export interface RagContextResult {
 export interface RagContextOptions {
   /** File storage IDs to scope the search to */
   fileIds?: string[];
+  /**
+   * Org slug for the X-Tale-Org header. Required by the RAG service's
+   * `/api/v1/search` endpoint (it picks the org's provider catalog to
+   * embed the query). Omitting will yield HTTP 400.
+   */
+  orgSlug: string;
 }
 
 /**
@@ -197,6 +203,7 @@ export async function queryRagContext(
         method: 'POST',
         headers: { 'Content-Type': 'application/json' },
         body: JSON.stringify(requestPayload),
+        orgSlug: options?.orgSlug,
         signal: fetchSignal,
       });
 
diff --git a/services/platform/convex/agent_tools/rag/rag_search_tool.ts b/services/platform/convex/agent_tools/rag/rag_search_tool.ts
index c6793d9590..e6f8da1884 100644
--- a/services/platform/convex/agent_tools/rag/rag_search_tool.ts
+++ b/services/platform/convex/agent_tools/rag/rag_search_tool.ts
@@ -21,6 +21,7 @@ import { fetchJson } from '../../../lib/utils/type-cast-helpers';
 import { internal } from '../../_generated/api';
 import { stripReservedPromptTags } from '../../lib/agent_response/sanitize_prompt';
 import { createDebugLog } from '../../lib/debug_log';
+import { orgSlugFromId } from '../../lib/helpers/org_slug';
 import { ragFetch } from '../../lib/helpers/rag_config';
 import { toId } from '../../lib/type_cast_helpers';
 import { wrapUntrusted } from '../../lib/untrusted_content';
@@ -275,8 +276,10 @@ RESPONSE (list_indexed):
           chunkEnd: end,
         });
 
+        const retrieveOrgSlug = await orgSlugFromId(ctx, orgIdRetrieve);
         const response = await ragFetch(
           `/api/v1/documents/${encodeURIComponent(args.fileId)}/content?return_chunks=true&chunk_start=${start}&chunk_end=${end}`,
+          { orgSlug: retrieveOrgSlug },
         );
 
         if (!response.ok) {
@@ -434,11 +437,17 @@ RESPONSE (list_indexed):
       });
 
       try {
+        const orgIdForSearch = ctx.organizationId;
+        if (!orgIdForSearch) {
+          throw new Error('rag_search requires organizationId in ToolCtx.');
+        }
+        const searchOrgSlug = await orgSlugFromId(ctx, orgIdForSearch);
         const response = await ragFetch('/api/v1/search', {
           method: 'POST',
           headers: { 'Content-Type': 'application/json' },
           body: JSON.stringify(payload),
           timeoutMs: SEARCH_TIMEOUT_MS,
+          orgSlug: searchOrgSlug,
         });
 
         if (!response.ok) {
diff --git a/services/platform/convex/agent_tools/web/helpers/query_web_context.ts b/services/platform/convex/agent_tools/web/helpers/query_web_context.ts
index 35b8241481..ae155c7e52 100644
--- a/services/platform/convex/agent_tools/web/helpers/query_web_context.ts
+++ b/services/platform/convex/agent_tools/web/helpers/query_web_context.ts
@@ -10,6 +10,7 @@
 
 import type { ActionCtx } from '../../../_generated/server';
 import { createDebugLog } from '../../../lib/debug_log';
+import { orgSlugFromId } from '../../../lib/helpers/org_slug';
 import { formatWebResults } from './format_web_results';
 import { getCrawlerServiceUrl } from './get_crawler_service_url';
 
@@ -58,11 +59,12 @@ export interface WebContextResult {
  * @returns Formatted context with citation metadata, or undefined if no results / on failure
  */
 export async function queryWebContext(
-  _ctx: ActionCtx,
-  _organizationId: string,
+  ctx: ActionCtx,
+  organizationId: string,
   query: string,
   limit = DEFAULT_LIMIT,
 ): Promise<WebContextResult | undefined> {
+  const orgSlug = await orgSlugFromId(ctx, organizationId);
   try {
     debugLog('Querying web context', {
       query: query.slice(0, 100),
@@ -79,7 +81,10 @@ export async function queryWebContext(
       const crawlerUrl = getCrawlerServiceUrl();
       const response = await fetch(`${crawlerUrl}/api/v1/search`, {
         method: 'POST',
-        headers: { 'Content-Type': 'application/json' },
+        headers: {
+          'Content-Type': 'application/json',
+          'x-tale-org': orgSlug,
+        },
         body: JSON.stringify({
           query,
           limit,
diff --git a/services/platform/convex/agent_tools/web/helpers/search_pages.ts b/services/platform/convex/agent_tools/web/helpers/search_pages.ts
index 092fc4989a..24cb608dda 100644
--- a/services/platform/convex/agent_tools/web/helpers/search_pages.ts
+++ b/services/platform/convex/agent_tools/web/helpers/search_pages.ts
@@ -9,6 +9,7 @@ import type { ToolCtx } from '@convex-dev/agent';
 
 import { internal } from '../../../_generated/api';
 import { createDebugLog } from '../../../lib/debug_log';
+import { orgSlugFromId } from '../../../lib/helpers/org_slug';
 import { formatWebResults } from './format_web_results';
 import { formatWebsiteSummaries } from './format_website_summaries';
 import { getCrawlerServiceUrl } from './get_crawler_service_url';
@@ -42,6 +43,7 @@ export function isValidDomain(domain: string): boolean {
 
 async function fetchSearch(
   crawlerUrl: string,
+  orgSlug: string,
   query: string,
   domain?: string,
 ): Promise<SearchApiResponse> {
@@ -51,7 +53,10 @@ async function fetchSearch(
 
   const response = await fetch(endpoint, {
     method: 'POST',
-    headers: { 'Content-Type': 'application/json' },
+    headers: {
+      'Content-Type': 'application/json',
+      'x-tale-org': orgSlug,
+    },
     body: JSON.stringify({
       query,
       limit: DEFAULT_LIMIT,
@@ -114,7 +119,11 @@ export async function searchPages(
   }
 
   const crawlerUrl = getCrawlerServiceUrl();
-  let data = await fetchSearch(crawlerUrl, args.query, validDomain);
+  if (!ctx.organizationId) {
+    throw new Error('search_pages requires organizationId in ToolCtx.');
+  }
+  const orgSlug = await orgSlugFromId(ctx, ctx.organizationId);
+  let data = await fetchSearch(crawlerUrl, orgSlug, args.query, validDomain);
   let results = data.results;
 
   // Fallback to global search if domain-scoped search returns no results
@@ -124,7 +133,7 @@ export async function searchPages(
       query: args.query,
       domain: validDomain,
     });
-    data = await fetchSearch(crawlerUrl, args.query);
+    data = await fetchSearch(crawlerUrl, orgSlug, args.query);
     results = data.results;
     domainFallback = true;
   }
diff --git a/services/platform/convex/agents/file_utils.ts b/services/platform/convex/agents/file_utils.ts
index dbe0cfe7f7..b6ea26927c 100644
--- a/services/platform/convex/agents/file_utils.ts
+++ b/services/platform/convex/agents/file_utils.ts
@@ -50,7 +50,7 @@ export interface AgentJsonConfig {
   workflows?: string[];
   /**
    * Slugs of skills available to this agent — a hard allowlist. Each slug
-   * references a `${SKILLS_DIR}/<orgSlug>/<slug>/SKILL.md` bundle. Empty or
+   * references a `${TALE_CONFIG_DIR}/<orgSlug>/skills/<slug>/SKILL.md` bundle. Empty or
    * absent means the agent has zero skills available; there is no implicit
    * "all org skills" fallback. At chat-turn start, `buildSkillContext` loads
    * only the intersection of this list with the org's actual skills; slugs
diff --git a/services/platform/convex/branding/file_actions.ts b/services/platform/convex/branding/file_actions.ts
index a10f5369be..5eb6dd94cc 100644
--- a/services/platform/convex/branding/file_actions.ts
+++ b/services/platform/convex/branding/file_actions.ts
@@ -29,6 +29,7 @@ import {
   atomicWrite,
   atomicWriteBuffer,
   generateHistoryTimestamp,
+  errnoCode,
   pruneHistory,
   readFileSafe,
   readJsonFile,
@@ -293,12 +294,21 @@ export const resetBranding = action({
     try {
       const entries = await readdir(imagesDir);
       await Promise.all(
-        entries.map((entry) =>
-          unlink(path.join(imagesDir, entry)).catch(() => {}),
-        ),
+        entries.map((entry) => {
+          const file = path.join(imagesDir, entry);
+          return unlink(file).catch((err) => {
+            // Tolerate ENOENT (race with another deleter) and log
+            // everything else — silent unlink failures hide permission
+            // bugs that leak stale branding images.
+            if (errnoCode(err) === 'ENOENT') return;
+            console.warn(`[resetBranding] unlink ${file} failed:`, err);
+          });
+        }),
       );
-    } catch {
-      // Directory may not exist
+    } catch (err) {
+      if (errnoCode(err) !== 'ENOENT') {
+        console.warn(`[resetBranding] readdir ${imagesDir} failed:`, err);
+      }
     }
 
     return null;
diff --git a/services/platform/convex/documents/compare_documents.ts b/services/platform/convex/documents/compare_documents.ts
index 9a5507a94f..5bc59da3c2 100644
--- a/services/platform/convex/documents/compare_documents.ts
+++ b/services/platform/convex/documents/compare_documents.ts
@@ -4,6 +4,7 @@ import { internal } from '../_generated/api';
 import { action } from '../_generated/server';
 import { fetchDocumentComparisonByUrls } from '../agent_tools/documents/helpers/fetch_document_comparison';
 import { authComponent } from '../auth';
+import { orgSlugFromId } from '../lib/helpers/org_slug';
 import { toId } from '../lib/type_cast_helpers';
 
 export const compareDocuments = action({
@@ -61,11 +62,13 @@ export const compareDocuments = action({
       resolveStorageUrl(ctx, args.comparisonStorageId),
     ]);
 
+    const orgSlug = await orgSlugFromId(ctx, args.organizationId);
     return await fetchDocumentComparisonByUrls(
       baseFileUrl,
       args.baseFileName,
       compFileUrl,
       args.comparisonFileName,
+      orgSlug,
     );
   },
 });
diff --git a/services/platform/convex/file_metadata/transcribe_audio.ts b/services/platform/convex/file_metadata/transcribe_audio.ts
index 7b95c3f95b..7eb0ef96ac 100644
--- a/services/platform/convex/file_metadata/transcribe_audio.ts
+++ b/services/platform/convex/file_metadata/transcribe_audio.ts
@@ -8,6 +8,7 @@ import type { ActionCtx } from '../_generated/server';
 import { internalAction } from '../_generated/server';
 import { estimateTranscriptionCostCents } from '../governance/cost_estimation';
 import { classifyError } from '../lib/error_classification';
+import { orgSlugFromId } from '../lib/helpers/org_slug';
 import type { ResolvedModelData } from '../providers/resolve_model';
 import { resolveTranscriptionModel } from '../providers/resolve_model';
 import { uploadFile } from '../workflow_engine/action_defs/rag/helpers/upload_file_direct';
@@ -162,6 +163,7 @@ async function indexTranscriptToRag(
     transcript: string;
     chunkCount: number;
     requestId: string;
+    orgSlug: string;
   },
 ): Promise<void> {
   if (args.transcript.length === 0) return;
@@ -201,6 +203,7 @@ async function indexTranscriptToRag(
         originalAudioContentType: args.audioContentType,
         chunkCount: args.chunkCount,
       },
+      orgSlug: args.orgSlug,
     });
     await ctx.runMutation(
       internal.file_metadata.internal_mutations.updateFileTranscription,
@@ -371,6 +374,7 @@ export const transcribeAudio = internalAction({
           // transcript was cached from). Duplicates content in RAG but
           // keeps per-upload citation identity correct; embeddings cost
           // is tiny compared to the Whisper call we just skipped.
+          const cachedOrgSlug = await orgSlugFromId(ctx, args.organizationId);
           await indexTranscriptToRag(ctx, {
             storageId: args.storageId,
             fileName: args.fileName,
@@ -378,6 +382,7 @@ export const transcribeAudio = internalAction({
             transcript: cached.transcript ?? '',
             chunkCount: 0,
             requestId,
+            orgSlug: cachedOrgSlug,
           });
           return null;
         }
@@ -535,6 +540,7 @@ export const transcribeAudio = internalAction({
         );
       }
 
+      const indexOrgSlug = await orgSlugFromId(ctx, args.organizationId);
       await indexTranscriptToRag(ctx, {
         storageId: args.storageId,
         fileName: args.fileName,
@@ -542,6 +548,7 @@ export const transcribeAudio = internalAction({
         transcript: fullTranscript,
         chunkCount: chunks.length,
         requestId,
+        orgSlug: indexOrgSlug,
       });
 
       return null;
diff --git a/services/platform/convex/governance/mutations.ts b/services/platform/convex/governance/mutations.ts
index c66e92d277..dfb6f65986 100644
--- a/services/platform/convex/governance/mutations.ts
+++ b/services/platform/convex/governance/mutations.ts
@@ -195,7 +195,7 @@ export const upsertPolicy = mutation({
       throw new ConvexError({
         code: 'use_action',
         message:
-          'Use governance/retention_actions.upsertRetentionPolicyAction for retention_policy. The bounds file at $TALE_CONFIG_DIR/retention/{orgSlug}.json must be read before validation.',
+          'Use governance/retention_actions.upsertRetentionPolicyAction for retention_policy. The per-org bounds file at $TALE_CONFIG_DIR/<orgSlug>/retention.json must be read before validation.',
       });
     }
 
diff --git a/services/platform/convex/governance/retention_actions.ts b/services/platform/convex/governance/retention_actions.ts
index c86ef5d146..359b71e370 100644
--- a/services/platform/convex/governance/retention_actions.ts
+++ b/services/platform/convex/governance/retention_actions.ts
@@ -5,9 +5,10 @@
  * delegated to `internal.lib.config_store.actions` via `ctx.runAction`.
  *
  * Why actions and not a query:
- *   - Bounds live in `$TALE_CONFIG_DIR/retention/{orgSlug}.json`. V8
- *     queries/mutations cannot read fs and cannot await a Node action
- *     inline. Only V8 actions can `ctx.runAction(internal nodeAction)`.
+ *   - Bounds live in `$TALE_CONFIG_DIR/<orgSlug>/retention.json` under
+ *     the org-first layout. V8 queries/mutations cannot read fs and
+ *     cannot await a Node action inline. Only V8 actions can
+ *     `ctx.runAction(internal nodeAction)`.
  *   - Bounds change rarely (operator edits the file or env), so losing
  *     query reactivity is acceptable. The frontend uses TanStack Query
  *     to one-shot fetch on editor open.
diff --git a/services/platform/convex/governance/retention_bounds_proposal.ts b/services/platform/convex/governance/retention_bounds_proposal.ts
index b4975f2eee..c160c8bda9 100644
--- a/services/platform/convex/governance/retention_bounds_proposal.ts
+++ b/services/platform/convex/governance/retention_bounds_proposal.ts
@@ -1,8 +1,9 @@
 /**
  * Public V8 actions for the operator-side retention bounds proposal
- * gate. The JSON file under `$TALE_CONFIG_DIR/retention/{orgSlug}.json`
- * (and `TALE_RETENTION_*` env tightening) are no longer directives —
- * they're proposals. Cleanup uses `retentionAppliedBounds.appliedBounds`,
+ * gate. The JSON file under `$TALE_CONFIG_DIR/<orgSlug>/retention.json`
+ * (org-first layout) and `TALE_RETENTION_*` env tightening are no
+ * longer directives — they're proposals. Cleanup uses
+ * `retentionAppliedBounds.appliedBounds`,
  * which only changes when an admin clicks Apply here.
  *
  * Three actions:
diff --git a/services/platform/convex/governance/retention_floors.ts b/services/platform/convex/governance/retention_floors.ts
index f34802c0c8..ab789a20e9 100644
--- a/services/platform/convex/governance/retention_floors.ts
+++ b/services/platform/convex/governance/retention_floors.ts
@@ -4,8 +4,9 @@
  * importable from V8 mutations / queries / actions.
  *
  * Resolution order:
- *   1. **Per-org file** at `$TALE_CONFIG_DIR/retention/{orgSlug}.json`
- *      provides the baseline `{ min, max, default }` per category. The
+ *   1. **Per-org file** at `$TALE_CONFIG_DIR/<orgSlug>/retention.json`
+ *      (org-first layout) provides the baseline `{ min, max, default }`
+ *      per category. The
  *      file is the canonical source of truth (no in-code fallback).
  *      Loading the file is the caller's responsibility — Node-side
  *      callers (cleanup action) import the store directly; V8-side
diff --git a/services/platform/convex/integrations/credentials_schema.ts b/services/platform/convex/integrations/credentials_schema.ts
index 3bce59e10a..cdcd1753b8 100644
--- a/services/platform/convex/integrations/credentials_schema.ts
+++ b/services/platform/convex/integrations/credentials_schema.ts
@@ -7,8 +7,9 @@ import { jsonRecordValidator } from '../lib/validators/json';
  * Slim credentials table for installed integrations.
  *
  * Integration definitions (operations, connector code, config) live in filesystem
- * files under INTEGRATIONS_DIR. This table stores only per-installation runtime
- * data: encrypted credentials, status, health metrics, and icon storage.
+ * files under `$TALE_CONFIG_DIR/<orgSlug>/integrations/<slug>/`. This table
+ * stores only per-installation runtime data: encrypted credentials, status,
+ * health metrics, and icon storage.
  *
  * The `slug` field matches the integration directory name (the canonical identifier).
  */
diff --git a/services/platform/convex/integrations/load_integration.ts b/services/platform/convex/integrations/load_integration.ts
index 665d1a4ed2..675510291f 100644
--- a/services/platform/convex/integrations/load_integration.ts
+++ b/services/platform/convex/integrations/load_integration.ts
@@ -4,7 +4,7 @@
  * Unified integration loader.
  *
  * Loads integration data from two sources:
- * 1. File system (INTEGRATIONS_DIR): config.json + connector.ts
+ * 1. File system (`$TALE_CONFIG_DIR/<orgSlug>/integrations/<slug>/`): config.json + connector.ts
  * 2. Database (integrationCredentials table): encrypted credentials, status, health
  *
  * Merges them into a `LoadedIntegration` object that matches the shape consumers
diff --git a/services/platform/convex/lib/agent_response/generate_response.ts b/services/platform/convex/lib/agent_response/generate_response.ts
index 24d9c35fe1..edfb61baa2 100644
--- a/services/platform/convex/lib/agent_response/generate_response.ts
+++ b/services/platform/convex/lib/agent_response/generate_response.ts
@@ -54,6 +54,7 @@ import {
   RECOVERY_TIMEOUT_MS,
   estimateTokens,
 } from '../context_management';
+import { orgSlugFromId } from '../helpers/org_slug';
 // Artifacts module removed — workspace context is discoverable via the
 // `file_list` tool. We keep the call sites but route them through this
 // no-op shim so the prompt-builder API surface stays intact.
@@ -703,13 +704,14 @@ export async function generateAgentResponse(
       if (accessibleFileIds.length === 0) {
         debugLog('No accessible RAG documents, skipping knowledge context');
       } else {
+        const orgSlug = await orgSlugFromId(ctx, organizationId);
         knowledgeContextPromise = queryRagContext(
           promptMessage,
           undefined,
           undefined,
           undefined,
           undefined,
-          { fileIds: accessibleFileIds },
+          { fileIds: accessibleFileIds, orgSlug },
         );
         debugLog('Knowledge context query started', {
           threadId,
diff --git a/services/platform/convex/lib/config_store/actions.ts b/services/platform/convex/lib/config_store/actions.ts
index 24adaafa1f..9e784f24d0 100644
--- a/services/platform/convex/lib/config_store/actions.ts
+++ b/services/platform/convex/lib/config_store/actions.ts
@@ -24,7 +24,6 @@ import { createFileConfigStore } from './store';
 const retentionStore = createFileConfigStore<RetentionDefaultsConfig>(
   'retention',
   retentionDefaultsConfigSchema,
-  { orgFirst: true },
 );
 
 export const readRetentionConfig = internalAction({
diff --git a/services/platform/convex/lib/config_store/store.test.ts b/services/platform/convex/lib/config_store/store.test.ts
index 6b82dfb7ca..46857c5661 100644
--- a/services/platform/convex/lib/config_store/store.test.ts
+++ b/services/platform/convex/lib/config_store/store.test.ts
@@ -28,6 +28,18 @@ afterEach(async () => {
   await rm(tmpRoot, { recursive: true, force: true });
 });
 
+// Org-first layout: each org's area file lives at
+// `<root>/<orgSlug>/<area>.json`.
+async function writeOrgAreaFile(
+  orgSlug: string,
+  area: string,
+  content: string,
+): Promise<void> {
+  const dir = path.join(tmpRoot, orgSlug);
+  await mkdir(dir, { recursive: true });
+  await writeFile(path.join(dir, `${area}.json`), content);
+}
+
 describe('createFileConfigStore', () => {
   it('read returns null for missing file', async () => {
     const store = createFileConfigStore<TestConfig>('thing', testSchema);
@@ -36,10 +48,9 @@ describe('createFileConfigStore', () => {
   });
 
   it('read parses + validates a valid file', async () => {
-    const dir = path.join(tmpRoot, 'thing');
-    await mkdir(dir, { recursive: true });
-    await writeFile(
-      path.join(dir, 'default.json'),
+    await writeOrgAreaFile(
+      'default',
+      'thing',
       JSON.stringify({ foo: 'bar', n: 42 }),
     );
     const store = createFileConfigStore<TestConfig>('thing', testSchema);
@@ -48,18 +59,15 @@ describe('createFileConfigStore', () => {
   });
 
   it('read throws on corrupted JSON', async () => {
-    const dir = path.join(tmpRoot, 'thing');
-    await mkdir(dir, { recursive: true });
-    await writeFile(path.join(dir, 'default.json'), '{ not valid json');
+    await writeOrgAreaFile('default', 'thing', '{ not valid json');
     const store = createFileConfigStore<TestConfig>('thing', testSchema);
     await expect(store.read('default')).rejects.toThrow();
   });
 
   it('read throws on schema violation', async () => {
-    const dir = path.join(tmpRoot, 'thing');
-    await mkdir(dir, { recursive: true });
-    await writeFile(
-      path.join(dir, 'default.json'),
+    await writeOrgAreaFile(
+      'default',
+      'thing',
       JSON.stringify({ foo: 123 }), // foo must be string
     );
     const store = createFileConfigStore<TestConfig>('thing', testSchema);
@@ -81,22 +89,23 @@ describe('createFileConfigStore', () => {
     ).rejects.toThrow(/Refusing to write invalid/);
   });
 
-  it('list returns slugs of present *.json files', async () => {
-    const dir = path.join(tmpRoot, 'thing');
-    await mkdir(dir, { recursive: true });
-    await writeFile(path.join(dir, 'default.json'), '{}');
-    await writeFile(path.join(dir, 'marketing.json'), '{}');
-    await writeFile(path.join(dir, 'engineering.json'), '{}');
-    // Non-json + dotfile should be ignored
-    await writeFile(path.join(dir, 'notes.txt'), 'ignored');
-    await writeFile(path.join(dir, '.history.json'), 'ignored');
+  it('list returns slugs of orgs with a <area>.json file', async () => {
+    await writeOrgAreaFile('default', 'thing', '{}');
+    await writeOrgAreaFile('marketing', 'thing', '{}');
+    await writeOrgAreaFile('engineering', 'thing', '{}');
+    // An org without the area file should not appear.
+    await mkdir(path.join(tmpRoot, 'unrelated'), { recursive: true });
+    await writeFile(path.join(tmpRoot, 'unrelated', 'other.json'), '{}');
     const store = createFileConfigStore<TestConfig>('thing', testSchema);
     const list = await store.list();
     const slugs = list.map((e) => e.orgSlug).sort();
     expect(slugs).toEqual(['default', 'engineering', 'marketing']);
   });
 
-  it('list returns empty array when area dir does not exist', async () => {
+  it('list returns empty array when config root does not exist', async () => {
+    // Stub to a non-existent path so the readdir() in list() takes the
+    // ENOENT branch.
+    await rm(tmpRoot, { recursive: true, force: true });
     const store = createFileConfigStore<TestConfig>('thing', testSchema);
     const list = await store.list();
     expect(list).toEqual([]);
diff --git a/services/platform/convex/lib/config_store/store.ts b/services/platform/convex/lib/config_store/store.ts
index 29b0af8294..1cbb51c6af 100644
--- a/services/platform/convex/lib/config_store/store.ts
+++ b/services/platform/convex/lib/config_store/store.ts
@@ -3,13 +3,9 @@
 /**
  * Generic typed read/write helper for area-specific JSON config files.
  *
- * Two layout shapes are supported, selected via `orgFirst`:
- *
- * - `orgFirst: false` (default): `$TALE_CONFIG_DIR/{area}/{orgSlug}.json`.
- *   The legacy per-area-dir shape; org slugs live in the filename.
- * - `orgFirst: true`: `$TALE_CONFIG_DIR/{orgSlug}/{area}.json`.
- *   Used by retention under the uniform org-first layout — each org has
- *   one file per area, alongside its `agents/`, `providers/`, etc.
+ * Path shape is the uniform org-first layout:
+ * `$TALE_CONFIG_DIR/<orgSlug>/<area>.json`. Each org has one file per
+ * area, alongside its `agents/`, `providers/`, etc.
  *
  * Wraps `readJsonFile` + `atomicWrite` so callers don't reinvent path
  * resolution, symlink/size guards, or atomic-rename semantics.
@@ -38,7 +34,6 @@ import type { z } from 'zod/v4';
 import { atomicWrite, readJsonFile, validateOrgSlug } from '../file_io';
 
 const MAX_FILE_SIZE_BYTES = 256 * 1024;
-const ORG_FILE_REGEX = /^[a-z0-9][a-z0-9_-]*\.json$/;
 
 export interface ConfigStore<T> {
   /**
@@ -53,16 +48,6 @@ export interface ConfigStore<T> {
   list(): Promise<Array<{ orgSlug: string }>>;
 }
 
-export interface CreateFileConfigStoreOptions {
-  /**
-   * When true, paths follow the org-first layout:
-   * `$TALE_CONFIG_DIR/<orgSlug>/<area>.json`. List enumerates per-org
-   * directories that contain `<area>.json`. When false (default), paths
-   * follow `$TALE_CONFIG_DIR/<area>/<orgSlug>.json`.
-   */
-  orgFirst?: boolean;
-}
-
 function getConfigRoot(area: string): string {
   const configDir = process.env.TALE_CONFIG_DIR;
   if (!configDir) {
@@ -76,17 +61,13 @@ function getConfigRoot(area: string): string {
   return configDir;
 }
 
-function resolveFilePath(
-  area: string,
-  orgSlug: string,
-  orgFirst: boolean,
-): string {
+function resolveFilePath(area: string, orgSlug: string): string {
   if (!validateOrgSlug(orgSlug)) {
     throw new Error(`Invalid org slug: ${orgSlug}`);
   }
   const root = getConfigRoot(area);
-  const dir = orgFirst ? path.join(root, orgSlug) : path.join(root, area);
-  const fileName = orgFirst ? `${area}.json` : `${orgSlug}.json`;
+  const dir = path.join(root, orgSlug);
+  const fileName = `${area}.json`;
   const resolved = path.resolve(dir, fileName);
   const expectedPrefix = path.resolve(dir);
   if (
@@ -106,10 +87,7 @@ function resolveFilePath(
 export function createFileConfigStore<T>(
   area: string,
   schema: z.ZodType<T>,
-  options: CreateFileConfigStoreOptions = {},
 ): ConfigStore<T> {
-  const orgFirst = options.orgFirst ?? false;
-
   const parse = (content: string): T => {
     // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- raw JSON before Zod validation
     const parsed = JSON.parse(content) as unknown;
@@ -122,17 +100,16 @@ export function createFileConfigStore<T>(
 
   return {
     async read(orgSlug) {
-      const filePath = resolveFilePath(area, orgSlug, orgFirst);
+      const filePath = resolveFilePath(area, orgSlug);
       const result = await readJsonFile(filePath, MAX_FILE_SIZE_BYTES, parse);
       if (result.ok) return result.data;
       if (result.error === 'not_found') return null;
-      const display = orgFirst
-        ? `${orgSlug}/${area}.json`
-        : `${area}/${orgSlug}.json`;
-      throw new Error(`Failed to read ${display}: ${result.message}`);
+      throw new Error(
+        `Failed to read ${orgSlug}/${area}.json: ${result.message}`,
+      );
     },
     async write(orgSlug, value) {
-      const filePath = resolveFilePath(area, orgSlug, orgFirst);
+      const filePath = resolveFilePath(area, orgSlug);
       // Re-parse before write to surface schema errors to the caller
       // rather than silently corrupting the file. Cheap relative to fs.
       const parsed = schema.safeParse(value);
@@ -146,44 +123,27 @@ export function createFileConfigStore<T>(
     },
     async list() {
       const root = getConfigRoot(area);
-      if (orgFirst) {
-        // Each org's file lives at `<root>/<orgSlug>/<area>.json`.
-        // Enumerate org subdirs (validated by slug regex) and probe each
-        // for the area file. Missing root → return empty rather than
-        // throwing — operator hasn't seeded anything yet.
-        let entries: string[];
-        try {
-          entries = await readdir(root);
-        } catch (err) {
-          if (err instanceof Error && 'code' in err && err.code === 'ENOENT') {
-            return [];
-          }
-          throw err;
-        }
-        const results: Array<{ orgSlug: string }> = [];
-        for (const name of entries) {
-          if (!validateOrgSlug(name)) continue;
-          const filePath = path.join(root, name, `${area}.json`);
-          const info = await stat(filePath).catch(() => null);
-          if (info?.isFile()) results.push({ orgSlug: name });
-        }
-        return results;
-      }
-
-      // Legacy per-area-dir layout: list `*.json` files under `<root>/<area>/`.
-      const dir = path.join(root, area);
+      // Each org's file lives at `<root>/<orgSlug>/<area>.json`.
+      // Enumerate org subdirs (validated by slug regex) and probe each
+      // for the area file. Missing root → return empty rather than
+      // throwing — operator hasn't seeded anything yet.
       let entries: string[];
       try {
-        entries = await readdir(dir);
+        entries = await readdir(root);
       } catch (err) {
         if (err instanceof Error && 'code' in err && err.code === 'ENOENT') {
           return [];
         }
         throw err;
       }
-      return entries
-        .filter((name) => ORG_FILE_REGEX.test(name))
-        .map((name) => ({ orgSlug: name.slice(0, -'.json'.length) }));
+      const results: Array<{ orgSlug: string }> = [];
+      for (const name of entries) {
+        if (!validateOrgSlug(name)) continue;
+        const filePath = path.join(root, name, `${area}.json`);
+        const info = await stat(filePath).catch(() => null);
+        if (info?.isFile()) results.push({ orgSlug: name });
+      }
+      return results;
     },
   };
 }
diff --git a/services/platform/convex/lib/helpers/org_slug.ts b/services/platform/convex/lib/helpers/org_slug.ts
new file mode 100644
index 0000000000..61f8366274
--- /dev/null
+++ b/services/platform/convex/lib/helpers/org_slug.ts
@@ -0,0 +1,48 @@
+/**
+ * Look up the `slug` for an organization given its Better Auth `_id`.
+ *
+ * RAG/crawler require the slug (not the id) on the X-Tale-Org header
+ * because their per-org provider catalog is keyed by slug on disk at
+ * `$TALE_CONFIG_DIR/<orgSlug>/providers/`. Most Convex action contexts
+ * carry `organizationId`; this helper bridges to the slug.
+ */
+
+import { getString, isRecord } from '../../../lib/utils/type-guards';
+import { components } from '../../_generated/api';
+
+// Loose ctx shape so all of: Convex ActionCtx, ToolCtx, query/mutation
+// ctxs can pass through. The runQuery signature on the real Convex
+// types is generic over FunctionReference — using a narrower stub here
+// would force every caller to cast.
+type CtxWithRunQuery = {
+  // oxlint-disable-next-line typescript/no-explicit-any -- structural-only typing for cross-ctx compatibility
+  runQuery: (...args: any[]) => Promise<unknown>;
+};
+
+/**
+ * Resolve an organizationId to its slug via Better Auth.
+ *
+ * Throws if no matching org row exists — callers should ensure the
+ * organizationId came from a verified-membership check upstream.
+ */
+export async function orgSlugFromId(
+  ctx: CtxWithRunQuery,
+  organizationId: string,
+): Promise<string> {
+  const row = await ctx.runQuery(components.betterAuth.adapter.findOne, {
+    model: 'organization',
+    where: [{ field: '_id', value: organizationId, operator: 'eq' }],
+  });
+  if (!isRecord(row)) {
+    throw new Error(
+      `[orgSlugFromId] no organization row found for id ${JSON.stringify(organizationId)}`,
+    );
+  }
+  const slug = getString(row, 'slug');
+  if (!slug) {
+    throw new Error(
+      `[orgSlugFromId] organization ${JSON.stringify(organizationId)} has no slug`,
+    );
+  }
+  return slug;
+}
diff --git a/services/platform/convex/lib/helpers/rag_config.ts b/services/platform/convex/lib/helpers/rag_config.ts
index 74a817c35b..c4ab9ade96 100644
--- a/services/platform/convex/lib/helpers/rag_config.ts
+++ b/services/platform/convex/lib/helpers/rag_config.ts
@@ -194,20 +194,34 @@ export function _resetRagConfigForTests(): void {
 }
 
 /**
- * Fetch against the RAG service. Sets `Authorization: Bearer ${authToken}`
- * when `RAG_AUTH_TOKEN` is configured; otherwise sends no Authorization
- * header (RAG runs open). Applies a default per-request timeout and
- * accepts a path starting with `/`.
+ * Fetch against the RAG service.
+ *
+ * Sets `Authorization: Bearer ${authToken}` when `RAG_AUTH_TOKEN` is
+ * configured; otherwise sends no Authorization header (RAG runs open).
+ *
+ * `orgSlug` is required for endpoints whose service-side handler reads
+ * the org's provider catalog (search, generate, upload, compare-files).
+ * The RAG service enforces this via per-router `Depends(require_org_slug)`,
+ * so callers MUST pass `orgSlug` for those endpoints — a missing header
+ * yields 400 from RAG. Status / delete / content / compare-by-id
+ * endpoints are org-agnostic and accept calls without the header.
+ *
+ * When `orgSlug` is supplied, it sets `X-Tale-Org: ${orgSlug}` and
+ * cannot be overridden via a header in `init.headers` — preventing
+ * a caller from spoofing another org's identity.
  *
  * Works in both V8 and Node Convex runtimes (uses the global `fetch`).
  *
  * @example
- *   const res = await ragFetch('/api/v1/documents/abc', { method: 'DELETE' });
- *   if (res.status === 404 || res.ok) { ...treat as success... }
+ *   const res = await ragFetch('/api/v1/search', {
+ *     method: 'POST',
+ *     body: JSON.stringify(payload),
+ *     orgSlug: 'acme',
+ *   });
  */
 export async function ragFetch(
   path: string,
-  init: RequestInit & { timeoutMs?: number } = {},
+  init: RequestInit & { timeoutMs?: number; orgSlug?: string } = {},
 ): Promise<Response> {
   const { serviceUrl, authToken } = getRagConfig();
   // The legacy `path.startsWith('http')` override branch was a future-bypass
@@ -231,11 +245,19 @@ export async function ragFetch(
   if (authToken !== undefined && !headers.has('authorization')) {
     headers.set('authorization', `Bearer ${authToken}`);
   }
+  // When supplied, always overwrite — callers must not be able to
+  // spoof another org's identity by setting the header in `init.headers`
+  // directly. When omitted, the RAG endpoint either runs org-agnostic
+  // (status/delete/content/compare-by-id) or returns 400 from its
+  // `Depends(require_org_slug)` dep (search/generate/upload/compare-files).
+  if (init.orgSlug) {
+    headers.set('x-tale-org', init.orgSlug);
+  }
 
   const timeoutMs = init.timeoutMs ?? 10_000;
   const signal = init.signal ?? AbortSignal.timeout(timeoutMs);
 
-  const { timeoutMs: _drop, ...rest } = init;
+  const { timeoutMs: _drop, orgSlug: _dropOrg, ...rest } = init;
   // `redirect: 'manual'` so a compromised RAG returning a 30x to
   // `http://169.254.169.254/...` (cloud IMDS) doesn't get auto-followed
   // past the SSRF guard. Callers handle 30x as a hard error. Round-2 v15 F1.
diff --git a/services/platform/convex/migrations/rename_org_slug.ts b/services/platform/convex/migrations/rename_org_slug.ts
deleted file mode 100644
index 8439ef3d89..0000000000
--- a/services/platform/convex/migrations/rename_org_slug.ts
+++ /dev/null
@@ -1,69 +0,0 @@
-/**
- * Migration: Rename all organization slugs to "default".
- *
- * Self-hosted Tale deployments use a single organization.
- * This migration normalizes the slug to "default" for consistency.
- *
- * Idempotent: skips organizations that already have slug "default".
- *
- * Usage:
- *   bunx convex run migrations/rename_org_slug:renameOrgSlug
- */
-
-import { isRecord, getString } from '../../lib/utils/type-guards';
-import { components } from '../_generated/api';
-import { internalMutation } from '../_generated/server';
-
-const TARGET_SLUG = 'default';
-const TARGET_NAME = 'Default';
-
-export const renameOrgSlug = internalMutation({
-  args: {},
-  handler: async (ctx) => {
-    const result = await ctx.runQuery(components.betterAuth.adapter.findMany, {
-      model: 'organization',
-      paginationOpts: { cursor: null, numItems: 100 },
-      where: [],
-    });
-
-    const orgs =
-      result &&
-      typeof result === 'object' &&
-      'page' in result &&
-      Array.isArray(result.page)
-        ? result.page
-        : [];
-
-    let updated = 0;
-    let skipped = 0;
-
-    for (const org of orgs) {
-      if (!isRecord(org)) continue;
-
-      const id = getString(org, '_id');
-      const slug = getString(org, 'slug');
-
-      if (!id) continue;
-
-      if (slug === TARGET_SLUG) {
-        skipped++;
-        continue;
-      }
-
-      await ctx.runMutation(components.betterAuth.adapter.updateMany, {
-        input: {
-          model: 'organization',
-          where: [{ field: '_id', value: id, operator: 'eq' }],
-          update: { slug: TARGET_SLUG, name: TARGET_NAME },
-        },
-        paginationOpts: { cursor: null, numItems: 1 },
-      });
-      updated++;
-      console.log(
-        `Updated organization ${id}: slug "${slug}" → "${TARGET_SLUG}"`,
-      );
-    }
-
-    console.log(`Done. Updated: ${updated}, Skipped: ${skipped}`);
-  },
-});
diff --git a/services/platform/convex/organizations/reseed_all_orgs.ts b/services/platform/convex/organizations/reseed_all_orgs.ts
index 4bb196cf58..19361126b5 100644
--- a/services/platform/convex/organizations/reseed_all_orgs.ts
+++ b/services/platform/convex/organizations/reseed_all_orgs.ts
@@ -1,24 +1,34 @@
 /**
- * Operator-triggered re-seed: enumerate every org (incl. `default`) and
- * re-invoke `scaffoldNewOrganization` with `override:true`. Driven by
- * `tale deploy --override-all` via `bunx convex run organizations/reseed_all_orgs:reseedAllOrgsFromBuiltin`.
+ * Operator-triggered re-seed: enumerate every registered org (incl.
+ * `default`) and re-invoke `scaffoldNewOrganization({override:true,
+ * strict:true})`. Driven by `tale deploy --override-all` via
+ * `bunx convex run organizations/reseed_all_orgs:reseedAllOrgsFromBuiltin`.
  *
  * Semantics:
  *   - Always reseeds `default` even if absent from the org list (canonical
  *     template org).
- *   - Per-org try/catch: one failure logs + continues; the full result
- *     map is returned so the CLI surfaces succeeded/failed counts and
- *     exits non-zero on any failure.
+ *   - Per-org try/catch records errors into the result map AND THEN the
+ *     action throws at the end if any org failed, so `bunx convex run`
+ *     exits non-zero. Without the final throw, the CLI would see exit-0
+ *     from docker exec and report success on partial failure.
+ *   - Per-org call uses `strict:true` so scaffold's per-domain failures
+ *     surface as a thrown error here (instead of silent
+ *     `console.error`-and-continue).
  *   - Deterministic order: collected slugs are sorted before processing
  *     so logs and partial-failure reruns are reproducible.
  *   - Cursor-paginated org enumeration (200/page) instead of the
- *     500-page-cap pattern in older backfills — avoids silently capping
- *     deployments with many orgs.
+ *     500-page-cap pattern in older backfills.
+ *
+ * Note: enumerates Better Auth `organization` rows. Filesystem-only org
+ * subtrees (no DB row) are intentionally skipped — `--override-all` is
+ * "reseed all registered orgs", not "reseed every dir on disk".
  *
  * Note: this is an ops re-runnable tool, not a one-shot migration. Lives
  * next to `scaffold.ts` (the thing it reinvokes), not in `migrations/`.
  */
 
+import { v } from 'convex/values';
+
 import { getString, isRecord } from '../../lib/utils/type-guards';
 import { components, internal } from '../_generated/api';
 import { internalAction } from '../_generated/server';
@@ -37,6 +47,21 @@ type OrgReseedResult =
 
 export const reseedAllOrgsFromBuiltin = internalAction({
   args: {},
+  returns: v.object({
+    total: v.number(),
+    succeeded: v.number(),
+    failed: v.number(),
+    results: v.array(
+      v.union(
+        v.object({ slug: v.string(), status: v.literal('ok') }),
+        v.object({
+          slug: v.string(),
+          status: v.literal('error'),
+          error: v.string(),
+        }),
+      ),
+    ),
+  }),
   handler: async (ctx) => {
     const slugSet = new Set<string>(['default']);
 
@@ -79,7 +104,7 @@ export const reseedAllOrgsFromBuiltin = internalAction({
       try {
         await ctx.runAction(
           internal.organizations.scaffold.scaffoldNewOrganization,
-          { orgSlug: slug, override: true },
+          { orgSlug: slug, override: true, strict: true },
         );
         results.push({ slug, status: 'ok' });
         console.log(`[reseedAllOrgs] reseeded "${slug}"`);
@@ -96,6 +121,24 @@ export const reseedAllOrgsFromBuiltin = internalAction({
       `[reseedAllOrgs] done: total=${results.length} succeeded=${succeeded} failed=${failed}`,
     );
 
+    // CRITICAL: throw on any per-org failure so `bunx convex run` exits
+    // non-zero. The aggregated `results` are also printed to console
+    // above so per-org detail survives. Without this throw, the CLI
+    // wrapper sees exit-0 from `docker exec` and reports
+    // `success('Reseed complete.')` on partial failure.
+    if (failed > 0) {
+      const failedSlugs = results
+        .filter(
+          (r): r is Extract<OrgReseedResult, { status: 'error' }> =>
+            r.status === 'error',
+        )
+        .map((r) => `${r.slug} (${r.error.split('\n')[0]})`)
+        .join(', ');
+      throw new Error(
+        `reseedAllOrgs: ${failed}/${results.length} orgs failed — ${failedSlugs}`,
+      );
+    }
+
     return {
       total: results.length,
       succeeded,
diff --git a/services/platform/convex/organizations/scaffold.ts b/services/platform/convex/organizations/scaffold.ts
index e784d3b6e5..61e47391c0 100644
--- a/services/platform/convex/organizations/scaffold.ts
+++ b/services/platform/convex/organizations/scaffold.ts
@@ -28,6 +28,7 @@
  * fail with ENOENT rather than racing the recursive delete.
  */
 
+import { randomUUID } from 'node:crypto';
 import {
   lstat,
   readdir,
@@ -58,6 +59,12 @@ import { resolveWorkflowsDir } from '../workflows/file_utils';
 
 type DirResolver = (orgSlug: string) => string;
 
+export type DomainResult = {
+  domain: string;
+  ok: boolean;
+  error?: string;
+};
+
 type Domain = {
   name: string;
   resolve: DirResolver;
@@ -129,7 +136,10 @@ async function pathsOverlap(a: string, b: string): Promise<boolean> {
   const resolveReal = async (p: string): Promise<string> => {
     try {
       return await realpath(p);
-    } catch {
+    } catch (err) {
+      if (errnoCode(err) !== 'ENOENT') {
+        console.warn('[scaffold.pathsOverlap] realpath failed:', p, err);
+      }
       return path.resolve(p);
     }
   };
@@ -222,15 +232,19 @@ async function copyTree(
 /**
  * Seed a single domain for an org. Source is `<catalogRoot>/default/<domain>`
  * (canonical template) when `TALE_CONFIG_BUILTIN_DIR` is set, falling back
- * to `resolve('default')` for local dev. Returns true on success, false on
- * skip/failure.
+ * to `resolve('default')` for local dev.
+ *
+ * Returns `{ok:true}` on success (including the legitimate
+ * "already scaffolded, skipped" case) and `{ok:false, error}` on
+ * real failure so the handler can surface or aggregate. Per-domain
+ * errors are also logged here for operator visibility.
  */
 async function seedDomain(
   domain: Domain,
   catalogRoot: string | undefined,
   orgSlug: string,
   override: boolean,
-): Promise<void> {
+): Promise<DomainResult> {
   const sourceDir = catalogRoot
     ? path.join(catalogRoot, 'default', domain.name)
     : domain.resolve('default');
@@ -238,24 +252,23 @@ async function seedDomain(
 
   if (catalogRoot) {
     // Operator-set catalog path must exist; missing = deploy misconfig
-    // (platform/convex image version skew). Surface in logs instead of
-    // silent zero-seed.
+    // (platform/convex image version skew). Surface in logs AND return
+    // an error so reseed-all-orgs can fail loudly.
+    let statErr: unknown;
     const sourceExists = await stat(sourceDir)
       .then(() => true)
       .catch((err) => {
-        if (errnoCode(err) === 'ENOENT') {
-          console.error(
-            `[scaffold] ${domain.name}: ${BUILTIN_ENV}=${catalogRoot} is set but ${sourceDir} does not exist; org "${orgSlug}" will receive zero seed data for this domain`,
-          );
-        } else {
-          console.error(
-            `[scaffold] ${domain.name}: stat ${sourceDir} failed:`,
-            err instanceof Error ? err.message : err,
-          );
-        }
+        statErr = err;
         return false;
       });
-    if (!sourceExists) return;
+    if (!sourceExists) {
+      const msg =
+        errnoCode(statErr) === 'ENOENT'
+          ? `${BUILTIN_ENV}=${catalogRoot} is set but ${sourceDir} does not exist`
+          : `stat ${sourceDir} failed: ${statErr instanceof Error ? statErr.message : String(statErr)}`;
+      console.error(`[scaffold] ${domain.name}: ${msg}`);
+      return { domain: domain.name, ok: false, error: msg };
+    }
   }
 
   // copy-onto-self guard: realpath-aware. Fires for default-org reseed
@@ -265,7 +278,7 @@ async function seedDomain(
     console.warn(
       `[scaffold] ${domain.name}: source and target overlap (${sourceDir} ↔ ${targetDir}); skipping`,
     );
-    return;
+    return { domain: domain.name, ok: true };
   }
 
   if (!override) {
@@ -274,7 +287,7 @@ async function seedDomain(
       console.warn(
         `[scaffold] ${domain.name}: target ${targetDir} already has files, skipping (use override:true to reseed)`,
       );
-      return;
+      return { domain: domain.name, ok: true };
     }
   }
 
@@ -294,7 +307,8 @@ async function seedDomain(
       try {
         bundles = await readdir(sourceDir);
       } catch (err) {
-        if (errnoCode(err) === 'ENOENT') return;
+        if (errnoCode(err) === 'ENOENT')
+          return { domain: domain.name, ok: true };
         throw err;
       }
       for (const bundleName of bundles) {
@@ -302,12 +316,47 @@ async function seedDomain(
         if (SKIP_DIR_NAMES.has(bundleName)) continue;
         const bundleSrc = path.join(sourceDir, bundleName);
         const bundleDst = path.join(targetDir, bundleName);
-        const info = await lstat(bundleSrc).catch(() => null);
+        const info = await lstat(bundleSrc).catch((err) => {
+          if (errnoCode(err) !== 'ENOENT') {
+            console.warn(
+              `[scaffold] ${domain.name}: lstat ${bundleSrc} failed:`,
+              err,
+            );
+          }
+          return null;
+        });
         if (!info || info.isSymbolicLink() || !info.isDirectory()) continue;
         if (override) {
-          await rm(bundleDst, { recursive: true, force: true });
+          // Write into a sibling staging dir then atomic-rename onto the
+          // target. Eliminates the "rm before copy" window where an
+          // interrupt would leave an empty bundle on disk. `force` dropped
+          // so EACCES / EBUSY surface as real errors. The cleanup-on-exit
+          // path below also drops the staging dir to avoid leakage.
+          const staging = `${bundleDst}.staging-${randomUUID().slice(0, 8)}`;
+          try {
+            await copyTree(bundleSrc, staging, /* allowSubdirs */ true);
+            // Best-effort old-dir removal before rename. If the old dir
+            // exists and is non-empty, `rename` will fail on most platforms
+            // — surface that.
+            await rm(bundleDst, { recursive: true }).catch((err) => {
+              if (errnoCode(err) !== 'ENOENT') throw err;
+            });
+            await rename(staging, bundleDst);
+          } catch (err) {
+            // If anything went wrong, scrub the staging dir.
+            await rm(staging, { recursive: true }).catch((scrubErr) => {
+              if (errnoCode(scrubErr) !== 'ENOENT') {
+                console.warn(
+                  `[scaffold] ${domain.name}: failed to scrub staging ${staging}:`,
+                  scrubErr,
+                );
+              }
+            });
+            throw err;
+          }
+        } else {
+          await copyTree(bundleSrc, bundleDst, /* allowSubdirs */ true);
         }
-        await copyTree(bundleSrc, bundleDst, /* allowSubdirs */ true);
       }
     } else {
       // 'tree' — workflows + branding. Per-file overwrite, no rm. User-only
@@ -316,65 +365,120 @@ async function seedDomain(
       await copyTree(sourceDir, targetDir, /* allowSubdirs */ true);
     }
   } catch (err) {
+    const message = err instanceof Error ? err.message : String(err);
     console.error(
       `[scaffold] ${domain.name}: copy failed for org "${orgSlug}":`,
-      err instanceof Error ? err.message : err,
+      message,
     );
-    // Continue with other domains; partial scaffolding is better than none.
+    return { domain: domain.name, ok: false, error: message };
   }
+
+  return { domain: domain.name, ok: true };
 }
 
 /**
  * Retention is one JSON object per org (`<orgSlug>/retention.json`), not a
- * subtree. Special-cased outside the DOMAINS loop.
+ * subtree. Special-cased outside the DOMAINS loop. Returns a `DomainResult`
+ * shaped like seedDomain's so the handler can aggregate uniformly.
+ *
+ * Assumes `TALE_CONFIG_DIR` is set + absolute (validated by the handler).
  */
 async function seedRetention(
   catalogRoot: string | undefined,
+  configRoot: string,
   orgSlug: string,
   override: boolean,
-): Promise<void> {
+): Promise<DomainResult> {
   const sourceFile = catalogRoot
     ? path.join(catalogRoot, 'default', 'retention.json')
-    : path.join(process.env.TALE_CONFIG_DIR ?? '', 'default', 'retention.json');
-  const targetFile = path.join(
-    process.env.TALE_CONFIG_DIR ?? '',
-    orgSlug,
-    'retention.json',
-  );
+    : path.join(configRoot, 'default', 'retention.json');
+  const targetFile = path.join(configRoot, orgSlug, 'retention.json');
 
+  let statErr: unknown;
   const sourceExists = await stat(sourceFile)
     .then(() => true)
     .catch((err) => {
-      if (errnoCode(err) !== 'ENOENT') {
-        console.warn('[scaffold] retention: stat failed:', sourceFile, err);
-      }
+      statErr = err;
       return false;
     });
-  if (!sourceExists) return;
+  if (!sourceExists) {
+    if (errnoCode(statErr) === 'ENOENT') {
+      // Missing catalog retention is expected in some test fixtures; treat
+      // as no-op (no error to propagate).
+      return { domain: 'retention', ok: true };
+    }
+    const msg = `stat ${sourceFile} failed: ${statErr instanceof Error ? statErr.message : String(statErr)}`;
+    console.warn('[scaffold] retention:', msg);
+    return { domain: 'retention', ok: false, error: msg };
+  }
 
   if (await pathsOverlap(sourceFile, targetFile)) {
     console.warn(`[scaffold] retention: source and target overlap; skipping`);
-    return;
+    return { domain: 'retention', ok: true };
   }
 
+  let targetStatErr: unknown;
   const targetExists = await stat(targetFile)
     .then(() => true)
-    .catch(() => false);
+    .catch((err) => {
+      targetStatErr = err;
+      return false;
+    });
+  if (!targetExists && errnoCode(targetStatErr) !== 'ENOENT' && targetStatErr) {
+    console.warn(
+      `[scaffold] retention: stat ${targetFile} failed:`,
+      targetStatErr,
+    );
+  }
   if (targetExists && !override) {
     console.warn(
       `[scaffold] retention: target ${targetFile} exists, skipping (use override:true to reseed)`,
     );
-    return;
+    return { domain: 'retention', ok: true };
   }
 
   try {
     const buf = await readFile(sourceFile);
     await atomicWrite(targetFile, buf.toString('utf-8'));
+    return { domain: 'retention', ok: true };
   } catch (err) {
+    const message = err instanceof Error ? err.message : String(err);
     console.error(
       `[scaffold] retention: copy failed for org "${orgSlug}":`,
-      err instanceof Error ? err.message : err,
+      message,
     );
+    return { domain: 'retention', ok: false, error: message };
+  }
+}
+
+/**
+ * Best-effort opportunistic sweep of `.deleted-*` siblings older than
+ * 24h that survived a prior failed `rm`. Called at the top of
+ * `cleanupOrgFilesystem`. Errors are swallowed (the main op shouldn't
+ * fail because of a leftover dir we couldn't clean).
+ */
+const CONDEMNED_TTL_MS = 24 * 60 * 60 * 1000;
+async function sweepStaleCondemnedDirs(root: string): Promise<void> {
+  let entries: string[];
+  try {
+    entries = await readdir(root);
+  } catch (err) {
+    if (errnoCode(err) === 'ENOENT') return;
+    throw err;
+  }
+  const now = Date.now();
+  for (const name of entries) {
+    if (!name.startsWith('.deleted-')) continue;
+    const p = path.join(root, name);
+    const info = await lstat(p).catch(() => null);
+    if (!info || info.isSymbolicLink()) continue;
+    if (now - info.mtimeMs < CONDEMNED_TTL_MS) continue;
+    await rm(p, { recursive: true }).catch((err) => {
+      console.warn(
+        `[cleanupOrgFilesystem] janitor: rm ${p} failed:`,
+        err instanceof Error ? err.message : err,
+      );
+    });
   }
 }
 
@@ -412,6 +516,12 @@ export const cleanupOrgFilesystem = internalAction({
       return null;
     }
 
+    // Opportunistic janitor: sweep stale `.deleted-*` siblings older than
+    // 24h that survived a prior failed rm. Best-effort; failures only log.
+    await sweepStaleCondemnedDirs(root).catch((err) => {
+      console.warn('[cleanupOrgFilesystem] janitor sweep failed:', err);
+    });
+
     if (args.orgSlug === 'default') {
       console.warn(
         '[cleanupOrgFilesystem] refusing to delete the default org filesystem',
@@ -466,8 +576,12 @@ export const cleanupOrgFilesystem = internalAction({
 
     // Two-phase rename-then-delete. The rename is atomic within a
     // filesystem; any concurrent writer of the original path fails with
-    // ENOENT instead of racing the recursive delete.
-    const condemned = path.join(root, `.deleted-${args.orgSlug}-${Date.now()}`);
+    // ENOENT instead of racing the recursive delete. UUID suffix avoids
+    // collisions if two cleanups land in the same millisecond.
+    const condemned = path.join(
+      root,
+      `.deleted-${args.orgSlug}-${Date.now()}-${randomUUID().slice(0, 8)}`,
+    );
     try {
       await rename(orgDir, condemned);
     } catch (err) {
@@ -502,24 +616,78 @@ export const scaffoldNewOrganization = internalAction({
      * files (idempotent org-create path).
      */
     override: v.optional(v.boolean()),
+    /**
+     * When true, throw an aggregated error if any domain or retention
+     * copy failed. Used by `reseedAllOrgsFromBuiltin` so partial failures
+     * surface as non-zero CLI exit.
+     *
+     * When false (default), continue past per-domain failures and return
+     * the per-domain result map. Used by `auth.afterCreateOrganization`
+     * where partial-scaffold-on-org-create is preferable to blocking the
+     * UX.
+     */
+    strict: v.optional(v.boolean()),
   },
-  returns: v.null(),
+  returns: v.object({
+    ok: v.boolean(),
+    skipped: v.boolean(),
+    results: v.array(
+      v.object({
+        domain: v.string(),
+        ok: v.boolean(),
+        error: v.optional(v.string()),
+      }),
+    ),
+  }),
   handler: async (_ctx, args) => {
     if (!validateOrgSlug(args.orgSlug)) {
       console.warn(
         `[scaffoldNewOrganization] refusing invalid slug "${args.orgSlug}"`,
       );
-      return null;
+      return { ok: false, skipped: true, results: [] };
+    }
+
+    // Symmetric guard to cleanupOrgFilesystem: refuse to operate on a
+    // non-absolute or unset config root rather than writing relative
+    // paths into the action's CWD.
+    const configRoot = process.env.TALE_CONFIG_DIR;
+    if (!configRoot || !path.isAbsolute(configRoot)) {
+      const msg =
+        '[scaffoldNewOrganization] TALE_CONFIG_DIR is unset or not absolute; refusing to proceed';
+      console.error(msg);
+      if (args.strict) {
+        throw new Error(msg);
+      }
+      return { ok: false, skipped: true, results: [] };
     }
 
     const catalogRoot = process.env[BUILTIN_ENV];
     const override = args.override ?? false;
 
+    const results: DomainResult[] = [];
     for (const domain of DOMAINS) {
-      await seedDomain(domain, catalogRoot, args.orgSlug, override);
+      results.push(
+        await seedDomain(domain, catalogRoot, args.orgSlug, override),
+      );
     }
-    await seedRetention(catalogRoot, args.orgSlug, override);
+    results.push(
+      await seedRetention(catalogRoot, configRoot, args.orgSlug, override),
+    );
 
-    return null;
+    const failed = results.filter((r) => !r.ok);
+    if (failed.length > 0 && args.strict) {
+      const detail = failed
+        .map((r) => `${r.domain}: ${r.error ?? 'unknown error'}`)
+        .join('; ');
+      throw new Error(
+        `scaffold "${args.orgSlug}": ${failed.length}/${results.length} domains failed — ${detail}`,
+      );
+    }
+
+    return {
+      ok: failed.length === 0,
+      skipped: false,
+      results,
+    };
   },
 });
diff --git a/services/platform/convex/skills/file_actions.ts b/services/platform/convex/skills/file_actions.ts
index a46b693269..fba67e2ef0 100644
--- a/services/platform/convex/skills/file_actions.ts
+++ b/services/platform/convex/skills/file_actions.ts
@@ -5,7 +5,7 @@
  * for the runtime engine's snapshot read).
  *
  * Storage model mirrors agents/integrations: SKILL.md + bundle assets on
- * disk under `${SKILLS_DIR}/<orgSlug-prefix>/<slug>/`. There is NO Convex
+ * disk under `${TALE_CONFIG_DIR}/<orgSlug>/skills/<slug>/`. There is NO Convex
  * DB table for skills — the file is the source of truth, team scoping and
  * role restriction live in YAML frontmatter, author/timestamps come from
  * audit_logs (see Phase 5c follow-up).
diff --git a/services/platform/convex/workflow_engine/action_defs/document/document_action.ts b/services/platform/convex/workflow_engine/action_defs/document/document_action.ts
index e13499a176..9c49f699e8 100644
--- a/services/platform/convex/workflow_engine/action_defs/document/document_action.ts
+++ b/services/platform/convex/workflow_engine/action_defs/document/document_action.ts
@@ -17,6 +17,7 @@ import { fetchDocumentComparisonByUrls } from '../../../agent_tools/documents/he
 import { fetchDocumentContent } from '../../../agent_tools/documents/helpers/fetch_document_content';
 import { getDocumentEffectiveDate } from '../../../documents/transform_to_document_item';
 import type { DocumentMetadata } from '../../../documents/types';
+import { orgSlugFromId } from '../../../lib/helpers/org_slug';
 import { toConvexJsonRecord, toId } from '../../../lib/type_cast_helpers';
 import { wrapUntrusted } from '../../../lib/untrusted_content';
 import { jsonRecordValidator } from '../../../lib/validators/json';
@@ -384,11 +385,13 @@ export const documentAction: ActionDefinition<DocumentActionParams> = {
                 resolveFileName(ctx, params.comparisonFileId),
               ]);
 
+        const compareOrgSlug = await orgSlugFromId(ctx, organizationId);
         return await fetchDocumentComparisonByUrls(
           baseFileUrl,
           baseFileName,
           compFileUrl,
           compFileName,
+          compareOrgSlug,
           params.maxChanges,
         );
       }
diff --git a/services/platform/convex/workflow_engine/action_defs/rag/helpers/upload_document.test.ts b/services/platform/convex/workflow_engine/action_defs/rag/helpers/upload_document.test.ts
index 3cf2328a6e..33bbc326d8 100644
--- a/services/platform/convex/workflow_engine/action_defs/rag/helpers/upload_document.test.ts
+++ b/services/platform/convex/workflow_engine/action_defs/rag/helpers/upload_document.test.ts
@@ -16,17 +16,26 @@ const uploadFileMock = vi.mocked(uploadFile);
 const DEFAULT_METADATA = {
   fileName: 'document.pdf',
   contentType: 'application/pdf',
+  organizationId: 'org-1',
 };
 
+const DEFAULT_ORG_ROW = { _id: 'org-1', slug: 'default' };
+
 function createCtx(
   getUrlResult: string | null = 'https://storage.example.com/file',
   metadataResult: Record<string, unknown> | null = DEFAULT_METADATA,
 ) {
+  // uploadDocument issues two runQuery calls in order:
+  //   1. internal.file_metadata.internal_queries.getByStorageId
+  //   2. components.betterAuth.adapter.findOne (via orgSlugFromId)
   return {
     storage: {
       getUrl: vi.fn().mockResolvedValue(getUrlResult),
     },
-    runQuery: vi.fn().mockResolvedValue(metadataResult),
+    runQuery: vi
+      .fn()
+      .mockResolvedValueOnce(metadataResult)
+      .mockResolvedValueOnce(DEFAULT_ORG_ROW),
   };
 }
 
diff --git a/services/platform/convex/workflow_engine/action_defs/rag/helpers/upload_document.ts b/services/platform/convex/workflow_engine/action_defs/rag/helpers/upload_document.ts
index 01ae1e4726..8cb1fdf1d5 100644
--- a/services/platform/convex/workflow_engine/action_defs/rag/helpers/upload_document.ts
+++ b/services/platform/convex/workflow_engine/action_defs/rag/helpers/upload_document.ts
@@ -4,6 +4,7 @@ import {
 } from '../../../../../lib/shared/file-types';
 import { internal } from '../../../../_generated/api';
 import type { ActionCtx } from '../../../../_generated/server';
+import { orgSlugFromId } from '../../../../lib/helpers/org_slug';
 import { toId } from '../../../../lib/type_cast_helpers';
 import type { RagUploadResult } from './types';
 import { uploadFile } from './upload_file_direct';
@@ -67,6 +68,8 @@ export async function uploadDocument(
     contentType,
   );
 
+  const orgSlug = await orgSlugFromId(ctx, metadata.organizationId);
+
   return uploadFile({
     file,
     filename: fileName,
@@ -74,5 +77,6 @@ export async function uploadDocument(
     fileId,
     metadata: options?.metadata,
     sync: options?.sync ?? false,
+    orgSlug,
   });
 }
diff --git a/services/platform/convex/workflow_engine/action_defs/rag/helpers/upload_file_direct.test.ts b/services/platform/convex/workflow_engine/action_defs/rag/helpers/upload_file_direct.test.ts
index f382ef1cf1..6124589290 100644
--- a/services/platform/convex/workflow_engine/action_defs/rag/helpers/upload_file_direct.test.ts
+++ b/services/platform/convex/workflow_engine/action_defs/rag/helpers/upload_file_direct.test.ts
@@ -26,6 +26,7 @@ function defaultArgs() {
     filename: 'test.txt',
     contentType: 'text/plain',
     fileId: FILE_ID,
+    orgSlug: 'default',
   };
 }
 
diff --git a/services/platform/convex/workflow_engine/action_defs/rag/helpers/upload_file_direct.ts b/services/platform/convex/workflow_engine/action_defs/rag/helpers/upload_file_direct.ts
index 8bfd5b386c..d3d0b5b62c 100644
--- a/services/platform/convex/workflow_engine/action_defs/rag/helpers/upload_file_direct.ts
+++ b/services/platform/convex/workflow_engine/action_defs/rag/helpers/upload_file_direct.ts
@@ -9,6 +9,8 @@ export interface UploadFileArgs {
   metadata?: Record<string, unknown>;
   timeoutMs?: number;
   sync?: boolean;
+  /** Required: RAG resolves the org's provider catalog from this slug. */
+  orgSlug: string;
 }
 
 interface RagApiUploadResponse {
@@ -32,6 +34,7 @@ export async function uploadFile({
   metadata,
   timeoutMs,
   sync = false,
+  orgSlug,
 }: UploadFileArgs): Promise<RagUploadResult> {
   const effectiveTimeout =
     timeoutMs ?? (sync ? SYNC_TIMEOUT_MS : DEFAULT_TIMEOUT_MS);
@@ -53,6 +56,7 @@ export async function uploadFile({
     method: 'POST',
     body: formData,
     timeoutMs: effectiveTimeout,
+    orgSlug,
   });
 
   if (!response.ok) {
diff --git a/services/platform/convex/workflow_engine/action_defs/rag/rag_action.ts b/services/platform/convex/workflow_engine/action_defs/rag/rag_action.ts
index 021ff255d3..7a41435aa7 100644
--- a/services/platform/convex/workflow_engine/action_defs/rag/rag_action.ts
+++ b/services/platform/convex/workflow_engine/action_defs/rag/rag_action.ts
@@ -5,6 +5,7 @@ import { internal } from '../../../_generated/api';
 import type { ActionCtx } from '../../../_generated/server';
 import type { SearchResponse } from '../../../agent_tools/rag/format_search_results';
 import { fetchDocumentChunks } from '../../../agent_tools/rag/helpers/fetch_document_chunks';
+import { orgSlugFromId } from '../../../lib/helpers/org_slug';
 import { ragFetch } from '../../../lib/helpers/rag_config';
 import { toId } from '../../../lib/type_cast_helpers';
 import { wrapUntrusted } from '../../../lib/untrusted_content';
@@ -100,7 +101,12 @@ export const ragAction: ActionDefinition<RagActionParams> = {
         // fileIds must be verified against the workflow's organizationId
         // before reaching the RAG service, which would otherwise serve
         // any file by id regardless of tenant.
-        await assertStorageIdsInOrg(ctx, _variables, migratedParams.fileIds);
+        const orgId = await assertStorageIdsInOrg(
+          ctx,
+          _variables,
+          migratedParams.fileIds,
+        );
+        const orgSlug = await orgSlugFromId(ctx, orgId);
         try {
           const response = await ragFetch('/api/v1/search', {
             method: 'POST',
@@ -113,6 +119,7 @@ export const ragAction: ActionDefinition<RagActionParams> = {
               include_metadata: true,
             }),
             timeoutMs: SEARCH_TIMEOUT_MS,
+            orgSlug,
           });
 
           if (!response.ok) {
@@ -195,7 +202,7 @@ async function assertStorageIdsInOrg(
   ctx: ActionCtx,
   variables: Record<string, unknown>,
   storageIds: string[],
-): Promise<void> {
+): Promise<string> {
   const organizationId =
     typeof variables.organizationId === 'string'
       ? variables.organizationId
@@ -205,7 +212,7 @@ async function assertStorageIdsInOrg(
       'organizationId is required in workflow variables for RAG operations',
     );
   }
-  if (storageIds.length === 0) return;
+  if (storageIds.length === 0) return organizationId;
   const ownsStorage = await ctx.runQuery(
     internal.documents.internal_queries.verifyStorageIdsBelongToOrg,
     { organizationId, storageIds },
@@ -213,6 +220,7 @@ async function assertStorageIdsInOrg(
   if (!ownsStorage) {
     throw new Error('One or more file ids do not belong to this organization');
   }
+  return organizationId;
 }
 
 /**
diff --git a/services/platform/docker-entrypoint.sh b/services/platform/docker-entrypoint.sh
index 0ac3c78b6d..230bfcf1b3 100644
--- a/services/platform/docker-entrypoint.sh
+++ b/services/platform/docker-entrypoint.sh
@@ -243,9 +243,15 @@ deploy_convex_functions() {
   )
   for legacy in "${LEGACY_DOMAIN_VARS[@]}"; do
     if [ "${CONVEX_ENV_MAP[$legacy]+_}" ]; then
-      if bunx convex env remove "$legacy" --url "$CONVEX_URL" --admin-key "$ADMIN_KEY" >/dev/null 2>&1; then
+      # Match the surrounding env-sync loop's aggregation pattern: track
+      # failures in `failed_vars` later, never swallow with `>/dev/null`
+      # so a real CLI error doesn't leave the legacy var lingering in
+      # Convex without an operator-visible signal.
+      if bunx convex env remove "$legacy" --url "$CONVEX_URL" --admin-key "$ADMIN_KEY" >/dev/null; then
         echo "   ✓ $legacy removed (no longer honored under org-first layout)"
         unset 'CONVEX_ENV_MAP[$legacy]'
+      else
+        log_warn "Failed to remove legacy env var $legacy from Convex; will retry on next boot"
       fi
     fi
   done
@@ -299,7 +305,7 @@ deploy_convex_functions() {
 
   # 5b. Remove vars from Convex that are no longer set on the platform.
   # Without this, env vars unset on the platform side linger in Convex.
-  # Skip orphans we already removed above (in the ORPHAN_DERIVED block).
+  # Skip orphans we already removed above (in the LEGACY_DOMAIN_VARS block).
   for convex_var in "${!CONVEX_ENV_MAP[@]}"; do
     local found=false
     local sv
diff --git a/services/platform/vite-plugins/serve-branding-images.ts b/services/platform/vite-plugins/serve-branding-images.ts
index e05fe95e4d..bbc6c7d9a7 100644
--- a/services/platform/vite-plugins/serve-branding-images.ts
+++ b/services/platform/vite-plugins/serve-branding-images.ts
@@ -52,7 +52,24 @@ export function serveBrandingImages(): Plugin {
             res.setHeader('Cache-Control', 'no-cache, must-revalidate');
             res.end(data);
           })
-          .catch(() => {
+          .catch((err: unknown) => {
+            // ENOENT is the expected miss — fall through to the next
+            // middleware so Vite's static handler / 404 page kicks in.
+            // Other errors (EACCES, EISDIR) are worth a warning so a
+            // misconfigured branding dir doesn't silently 404 forever.
+            const code =
+              err !== null &&
+              typeof err === 'object' &&
+              'code' in err &&
+              typeof err.code === 'string'
+                ? err.code
+                : undefined;
+            if (code !== 'ENOENT') {
+              console.warn(
+                `[serve-branding-images] readFile ${filePath} failed:`,
+                err,
+              );
+            }
             next();
           });
       });
diff --git a/services/rag/app/auth.py b/services/rag/app/auth.py
index 8f5dd58818..dc46007a46 100644
--- a/services/rag/app/auth.py
+++ b/services/rag/app/auth.py
@@ -12,12 +12,18 @@
 """
 
 import hmac
+import re
 
 from fastapi import Header, HTTPException, status
 from loguru import logger
 
 from .config import settings
 
+# Org-slug regex aligned with services/platform/convex/lib/file_io.ts:25
+# plus the literal "default". Capped at 64 chars to match the platform's
+# migrate-script regex (script.sh:134). Keep these in sync.
+_ORG_SLUG_RE = re.compile(r"^[a-z0-9][a-z0-9_-]{0,63}$")
+
 
 def _extract_bearer(header_value: str | None) -> str | None:
     if not header_value:
@@ -53,6 +59,29 @@ async def verify_auth_token(
         )
 
 
+async def require_org_slug(
+    x_tale_org: str | None = Header(default=None),
+) -> str:
+    """FastAPI dependency: extract + validate the `X-Tale-Org` header.
+
+    Every protected RAG endpoint requires this header. Caller-supplied;
+    the platform sets it from the authenticated user's selected org.
+    No fallback to `default` — a missing header is a caller bug that we
+    surface as 400 rather than silently serve another org's providers.
+    """
+    if not x_tale_org:
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail="missing X-Tale-Org header",
+        )
+    if not _ORG_SLUG_RE.match(x_tale_org):
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail="invalid X-Tale-Org header",
+        )
+    return x_tale_org
+
+
 def warn_if_auth_disabled() -> None:
     """Emit a loud SECURITY warning when `RAG_AUTH_TOKEN` is unset.
 
diff --git a/services/rag/app/config.py b/services/rag/app/config.py
index ec3a9504fa..9d081e894e 100644
--- a/services/rag/app/config.py
+++ b/services/rag/app/config.py
@@ -75,10 +75,10 @@ def get_database_url(self) -> str:
             return self.database_url
         raise ValueError("RAG_DATABASE_URL must be set in environment")
 
-    def get_llm_config(self) -> dict:
-        """Get LLM configuration from provider files."""
-        base_url, api_key, model = self.get_chat_config()
-        emb_base_url, emb_api_key, embedding_model, _dims = self.get_embedding_config()
+    def get_llm_config(self, org_slug: str) -> dict:
+        """Get LLM configuration for an org from provider files."""
+        base_url, api_key, model = self.get_chat_config(org_slug)
+        emb_base_url, emb_api_key, embedding_model, _dims = self.get_embedding_config(org_slug)
 
         config: dict = {
             "provider": "openai",
diff --git a/services/rag/app/routers/documents.py b/services/rag/app/routers/documents.py
index bc77df3de2..a59b791ef1 100644
--- a/services/rag/app/routers/documents.py
+++ b/services/rag/app/routers/documents.py
@@ -6,11 +6,21 @@
 from typing import Any
 from uuid import uuid4
 
-from fastapi import APIRouter, File, Form, HTTPException, Query, UploadFile, status
+from fastapi import (
+    APIRouter,
+    Depends,
+    File,
+    Form,
+    HTTPException,
+    Query,
+    UploadFile,
+    status,
+)
 from fastapi.background import BackgroundTasks
 from loguru import logger
 from tale_shared.db import acquire_with_retry
 
+from ..auth import require_org_slug
 from ..config import settings
 from ..models import (
     DocumentAddResponse,
@@ -195,6 +205,7 @@ def _sanitize_error(exc: Exception, max_length: int = 500) -> str:
 
 
 async def _background_ingest(
+    org_slug: str,
     content: bytes,
     file_id: str,
     filename: str,
@@ -204,6 +215,7 @@ async def _background_ingest(
     """Run document ingestion in the background, recording status in documents table."""
     try:
         result = await rag_service.add_document(
+            org_slug,
             content=content,
             file_id=file_id,
             filename=filename,
@@ -325,6 +337,7 @@ def _ms_timestamp_to_datetime(value: Any) -> dt.datetime | None:
 @router.post("/documents/upload", response_model=DocumentAddResponse)
 async def upload_document(
     background_tasks: BackgroundTasks,
+    org_slug: str = Depends(require_org_slug),
     file: UploadFile = _FILE_UPLOAD,
     metadata: str | None = Form(None, description="Optional metadata as JSON string"),
     file_id: str | None = Form(None, description="Optional custom file ID"),
@@ -370,6 +383,7 @@ async def upload_document(
         if sync:
             try:
                 result = await rag_service.add_document(
+                    org_slug,
                     content=file_bytes,
                     file_id=doc_id,
                     filename=file.filename,
@@ -397,6 +411,7 @@ async def upload_document(
 
         background_tasks.add_task(
             _background_ingest,
+            org_slug,
             file_bytes,
             doc_id,
             file.filename,
@@ -530,6 +545,7 @@ async def compare_documents(request: DocumentCompareRequest):
 
 @router.post("/documents/compare-files", response_model=DocumentCompareResponse)
 async def compare_files(
+    org_slug: str = Depends(require_org_slug),
     base_file: UploadFile = _BASE_FILE,
     comparison_file: UploadFile = _COMPARISON_FILE,
     max_changes: int = _MAX_CHANGES_FORM,
@@ -551,6 +567,7 @@ async def compare_files(
 
     try:
         result = await rag_service.compare_files(
+            org_slug,
             base_bytes,
             base_file.filename,
             comparison_bytes,
diff --git a/services/rag/app/routers/search.py b/services/rag/app/routers/search.py
index 1993199163..4a99031154 100644
--- a/services/rag/app/routers/search.py
+++ b/services/rag/app/routers/search.py
@@ -2,9 +2,10 @@
 
 import time
 
-from fastapi import APIRouter, HTTPException, status
+from fastapi import APIRouter, Depends, HTTPException, status
 from loguru import logger
 
+from ..auth import require_org_slug
 from ..models import (
     GenerateRequest,
     GenerateResponse,
@@ -19,12 +20,16 @@
 
 
 @router.post("/search", response_model=QueryResponse)
-async def search(request: QueryRequest):
+async def search(
+    request: QueryRequest,
+    org_slug: str = Depends(require_org_slug),
+):
     """Search the knowledge base using hybrid BM25 + vector search."""
     try:
         start_time = time.time()
 
         results = await rag_service.search(
+            org_slug,
             query=request.query,
             top_k=request.top_k,
             similarity_threshold=request.similarity_threshold,
@@ -73,7 +78,10 @@ async def search(request: QueryRequest):
 
 
 @router.post("/generate", response_model=GenerateResponse)
-async def generate(request: GenerateRequest):
+async def generate(
+    request: GenerateRequest,
+    org_slug: str = Depends(require_org_slug),
+):
     """Generate a response using RAG.
 
     Retrieves top 30 most relevant chunks, uses temperature 0.3
@@ -81,6 +89,7 @@ async def generate(request: GenerateRequest):
     """
     try:
         result = await rag_service.generate(
+            org_slug,
             query=request.query,
             file_ids=request.file_ids,
         )
diff --git a/services/rag/app/services/rag_service.py b/services/rag/app/services/rag_service.py
index ca93ba275c..a804e468f8 100644
--- a/services/rag/app/services/rag_service.py
+++ b/services/rag/app/services/rag_service.py
@@ -2,6 +2,18 @@
 
 Provides: add_document, search, generate, delete_document.
 All operations use the private_knowledge schema in tale_knowledge database.
+
+Multi-org: each public method requires an `org_slug` so the LLM /
+embedding / vision clients used for the call come from THAT org's
+provider catalog at `<TALE_CONFIG_DIR>/<org>/providers/`. Per-org client
+state is built lazily and cached for `_CONFIG_CHECK_INTERVAL` seconds.
+
+Embedding **dimensions** are global: the underlying knowledge DB uses
+one vector column, so all orgs sharing this RAG instance must use the
+same embedding dimensions. The first org to initialize pins the value;
+subsequent orgs that disagree raise loudly rather than silently storing
+mis-dimensioned vectors. (Per-org dims would require per-org DB schemas
+— out of scope.)
 """
 
 from __future__ import annotations
@@ -9,6 +21,7 @@
 import asyncio
 import datetime as dt
 import time
+from dataclasses import dataclass
 from typing import Any
 
 import asyncpg
@@ -60,21 +73,46 @@ async def _safe_close(coro) -> None:
         logger.warning("Failed to close old client", exc_info=True)
 
 
+@dataclass
+class _OrgClients:
+    """Per-org cached LLM/embedding/vision clients.
+
+    Lifecycle: built lazily on first call for an org, refreshed if older
+    than `_CONFIG_CHECK_INTERVAL` AND the underlying provider config has
+    changed on disk.
+    """
+
+    llm_config: dict
+    vision_config: tuple | None
+    embedding_service: EmbeddingService
+    openai_client: AsyncOpenAI
+    vision_client: VisionClient | None
+    search_service: RagSearchService
+    last_check: float
+
+
 class RagService:
     def __init__(self) -> None:
         self.initialized = False
         self._init_lock = asyncio.Lock()
         self._pool: asyncpg.Pool | None = None
-        self._embedding_service: EmbeddingService | None = None
-        self._vision_client: VisionClient | None = None
-        self._openai_client: AsyncOpenAI | None = None
-        self._search_service: RagSearchService | None = None
-        self._llm_config: dict | None = None
-        self._vision_config: tuple | None = None
-        self._last_config_check: float = 0
+        # Embedding dimensions are pinned globally; see module docstring.
+        self._pinned_dims: int | None = None
+        # Per-org client cache and per-org locks (so concurrent first-calls
+        # for the same org don't both build clients).
+        self._org_clients: dict[str, _OrgClients] = {}
+        self._org_locks: dict[str, asyncio.Lock] = {}
+        # Per-search-call usage propagation — set by search(), read by
+        # generate(). Single-threaded asyncio so no need for per-org isolation.
+        self.last_search_usage: Any = None
 
     async def initialize(self) -> None:
-        """Initialize database pool, embedding service, vision client, and LLM client."""
+        """Initialize the shared database pool.
+
+        Per-org client construction is deferred until the first call for
+        that org. The DB pool is global — all orgs share one
+        knowledge-DB connection pool because the schema is global.
+        """
         if self.initialized:
             return
 
@@ -82,138 +120,113 @@ async def initialize(self) -> None:
             if self.initialized:
                 return
 
-            await self._do_initialize()
+            self._pool = await init_pool()
+            self.initialized = True
+            logger.info("RagService initialized (DB pool ready; per-org clients lazy)")
 
-    async def _do_initialize(self) -> None:
+    @property
+    def embedding_service(self) -> EmbeddingService | None:
+        """Deprecated: kept for any callers that haven't been threaded
+        with `org_slug` yet. Returns None; callers must migrate.
+        """
+        return None
 
-        # Database pool
-        self._pool = await init_pool()
+    def _get_org_lock(self, org_slug: str) -> asyncio.Lock:
+        lock = self._org_locks.get(org_slug)
+        if lock is None:
+            lock = asyncio.Lock()
+            self._org_locks[org_slug] = lock
+        return lock
 
-        # Embedding service
-        llm_config = settings.get_llm_config()
-        embedding_model = llm_config["embedding_model"]
-        dimensions = settings.get_embedding_dimensions()
+    async def _ensure_org_clients(self, org_slug: str) -> _OrgClients:
+        """Lazy-init or refresh an org's clients.
 
-        self._embedding_service = EmbeddingService(
-            api_key=llm_config["embedding_api_key"],
-            base_url=llm_config["embedding_base_url"],
-            model=embedding_model,
-            dimensions=dimensions,
-        )
-        self._llm_config = llm_config
+        Refresh is gated on `_CONFIG_CHECK_INTERVAL` so a busy org doesn't
+        re-read its provider files on every call.
+        """
+        if not self.initialized:
+            await self.initialize()
+        if self._pool is None:
+            raise RuntimeError("RagService not initialized: database pool is None")
 
-        # Pin embedding dimensions and create HNSW index (runtime config, not a migration)
-        await pin_embedding_dimensions(self._pool, dimensions)
+        cached = self._org_clients.get(org_slug)
+        if cached is not None:
+            now = time.monotonic()
+            if (now - cached.last_check) < _CONFIG_CHECK_INTERVAL:
+                return cached
 
-        # Vision client (optional — only if model is configured)
-        try:
-            vision_config = settings.get_vision_config()
-            v_base_url, v_api_key, v_model = vision_config
-            self._vision_client = VisionClient(
-                api_key=v_api_key,
-                model=v_model,
-                base_url=v_base_url,
-                timeout=120.0,
-                request_timeout=float(settings.vision_request_timeout),
-                max_concurrent_pages=settings.vision_max_concurrent_pages,
-                pdf_dpi=settings.vision_pdf_dpi,
-                ocr_prompt=settings.vision_extraction_prompt,
+        lock = self._get_org_lock(org_slug)
+        async with lock:
+            cached = self._org_clients.get(org_slug)
+            if cached is not None:
+                now = time.monotonic()
+                if (now - cached.last_check) < _CONFIG_CHECK_INTERVAL:
+                    return cached
+
+            return await self._build_or_refresh_org_clients(org_slug, cached)
+
+    async def _build_or_refresh_org_clients(
+        self,
+        org_slug: str,
+        previous: _OrgClients | None,
+    ) -> _OrgClients:
+        """Construct fresh clients for org_slug, atomic-swapping if existing."""
+        assert self._pool is not None
+
+        llm_config = settings.get_llm_config(org_slug)
+        if previous is not None and llm_config == previous.llm_config:
+            # No change — refresh the timestamp and reuse.
+            previous.last_check = time.monotonic()
+            return previous
+
+        if not llm_config.get("api_key") or not llm_config.get("embedding_api_key"):
+            if previous is not None:
+                logger.warning(
+                    "Skipping LLM config reload for org '{}': empty API key",
+                    org_slug,
+                )
+                previous.last_check = time.monotonic()
+                return previous
+            raise ValueError(f"Org '{org_slug}' has empty chat or embedding API key in provider config.")
+
+        _b, _a, _m, dims = settings.get_embedding_config(org_slug)
+
+        if self._pinned_dims is None:
+            self._pinned_dims = dims
+            await pin_embedding_dimensions(self._pool, dims)
+            logger.info(
+                "Pinned RAG embedding dimensions to {} (set by org '{}')",
+                dims,
+                org_slug,
             )
-            self._vision_config = vision_config
-            logger.info("Vision client initialized with model: {}", v_model)
-        except ValueError:
-            logger.info("No vision model configured, Vision features disabled")
-            self._vision_client = None
-
-        # OpenAI client for generation. Explicit timeout: the SDK
-        # default is 600 s, which can hold the asyncio event loop for
-        # 10 minutes on a stuck provider endpoint and starve the DB
-        # pool. Round-2 review MEDIUM (E.4.7).
-        self._openai_client = AsyncOpenAI(
+        elif dims != self._pinned_dims:
+            raise ValueError(
+                f"Org '{org_slug}' embedding dimensions ({dims}) do not match the "
+                f"pinned RAG schema dimensions ({self._pinned_dims}). All orgs "
+                f"sharing this RAG instance must use the same embedding model "
+                f"dimensions. Reconcile provider configs or run RAG per-org."
+            )
+
+        embedding_service = EmbeddingService(
+            api_key=llm_config["embedding_api_key"],
+            base_url=llm_config["embedding_base_url"],
+            model=llm_config["embedding_model"],
+            dimensions=dims,
+        )
+        openai_client = AsyncOpenAI(
             api_key=llm_config["api_key"],
             base_url=llm_config["base_url"],
             timeout=httpx.Timeout(connect=10.0, read=120.0, write=30.0, pool=5.0),
         )
 
-        # Search service
-        self._search_service = RagSearchService(self._pool, self._embedding_service)
-
-        self._last_config_check = time.monotonic()
-        self.initialized = True
-        logger.info("RagService initialized")
-
-    @property
-    def embedding_service(self) -> EmbeddingService | None:
-        return self._embedding_service
-
-    def _maybe_refresh_clients(self) -> None:
-        """Check provider config freshness; rebuild clients if changed.
-
-        This method is synchronous (no await) so that all attribute swaps
-        happen atomically from asyncio's cooperative-scheduling perspective.
-        """
-        if not self.initialized:
-            return
-        now = time.monotonic()
-        if (now - self._last_config_check) < _CONFIG_CHECK_INTERVAL:
-            return
-        self._last_config_check = now
-
-        # Check chat/embedding config
-        new_llm_config = settings.get_llm_config()
-        if new_llm_config != self._llm_config:
-            if not new_llm_config.get("api_key") or not new_llm_config.get("embedding_api_key"):
-                logger.warning("Skipping LLM config reload: empty API key")
-            else:
-                new_dims = settings.get_embedding_dimensions()
-                if self._embedding_service and new_dims != self._embedding_service.dimensions:
-                    logger.error(
-                        "Embedding dimensions changed ({} -> {}). Restart required.",
-                        self._embedding_service.dimensions,
-                        new_dims,
-                    )
-                else:
-                    # Prepare new clients before swapping any state
-                    new_emb = EmbeddingService(
-                        api_key=new_llm_config["embedding_api_key"],
-                        base_url=new_llm_config["embedding_base_url"],
-                        model=new_llm_config["embedding_model"],
-                        dimensions=new_dims,
-                    )
-                    new_oai = AsyncOpenAI(
-                        api_key=new_llm_config["api_key"],
-                        base_url=new_llm_config["base_url"],
-                        timeout=httpx.Timeout(connect=10.0, read=120.0, write=30.0, pool=5.0),
-                    )
-
-                    # Swap all at once (atomic from asyncio's cooperative perspective)
-                    old_emb = self._embedding_service
-                    old_oai = self._openai_client
-                    self._embedding_service = new_emb
-                    self._openai_client = new_oai
-                    if self._pool:
-                        self._search_service = RagSearchService(self._pool, new_emb)
-                    self._llm_config = new_llm_config
-                    logger.info("RAG LLM clients refreshed: model={}", new_llm_config.get("embedding_model"))
-
-                    # Close old clients (fire-and-forget with grace period)
-                    loop = asyncio.get_running_loop()
-                    if old_emb:
-                        task = loop.create_task(_safe_close(old_emb.close()))
-                        _background_tasks.add(task)
-                        task.add_done_callback(_background_tasks.discard)
-                    if old_oai:
-                        task = loop.create_task(_safe_close(old_oai.close()))
-                        _background_tasks.add(task)
-                        task.add_done_callback(_background_tasks.discard)
-
-        # Check vision config
+        # Vision client (optional — only if the org has a vision-tagged model)
+        vision_client: VisionClient | None = None
+        vision_config: tuple | None = None
         try:
-            new_vision_config = settings.get_vision_config()
-            v_base_url, v_api_key, v_model = new_vision_config
-            if new_vision_config != self._vision_config and v_api_key:
-                old_vision = self._vision_client
-                self._vision_client = VisionClient(
+            vision_config = settings.get_vision_config(org_slug)
+            v_base_url, v_api_key, v_model = vision_config
+            if v_api_key:
+                vision_client = VisionClient(
                     api_key=v_api_key,
                     model=v_model,
                     base_url=v_base_url,
@@ -223,18 +236,58 @@ def _maybe_refresh_clients(self) -> None:
                     pdf_dpi=settings.vision_pdf_dpi,
                     ocr_prompt=settings.vision_extraction_prompt,
                 )
-                self._vision_config = new_vision_config
-                logger.info("RAG vision client refreshed: model={}", v_model)
-                if old_vision:
-                    loop = asyncio.get_running_loop()
-                    task = loop.create_task(_safe_close(old_vision.close()))
-                    _background_tasks.add(task)
-                    task.add_done_callback(_background_tasks.discard)
+                logger.info(
+                    "Vision client initialized for org '{}' with model {}",
+                    org_slug,
+                    v_model,
+                )
         except ValueError:
-            logger.debug("No vision model in provider config, skipping vision refresh")
+            logger.debug(
+                "No vision model configured for org '{}', Vision disabled",
+                org_slug,
+            )
+
+        search_service = RagSearchService(self._pool, embedding_service)
+
+        new_clients = _OrgClients(
+            llm_config=llm_config,
+            vision_config=vision_config,
+            embedding_service=embedding_service,
+            openai_client=openai_client,
+            vision_client=vision_client,
+            search_service=search_service,
+            last_check=time.monotonic(),
+        )
+        self._org_clients[org_slug] = new_clients
+
+        # Best-effort close of old clients after a grace period so in-flight
+        # requests on the old clients finish cleanly.
+        if previous is not None:
+            loop = asyncio.get_running_loop()
+            if previous.embedding_service is not embedding_service:
+                task = loop.create_task(_safe_close(previous.embedding_service.close()))
+                _background_tasks.add(task)
+                task.add_done_callback(_background_tasks.discard)
+            if previous.openai_client is not openai_client:
+                task = loop.create_task(_safe_close(previous.openai_client.close()))
+                _background_tasks.add(task)
+                task.add_done_callback(_background_tasks.discard)
+            if previous.vision_client is not None and previous.vision_client is not vision_client:
+                task = loop.create_task(_safe_close(previous.vision_client.close()))
+                _background_tasks.add(task)
+                task.add_done_callback(_background_tasks.discard)
+
+        logger.info(
+            "RAG clients {} for org '{}': model={}",
+            "refreshed" if previous else "initialized",
+            org_slug,
+            llm_config.get("model"),
+        )
+        return new_clients
 
     async def add_document(
         self,
+        org_slug: str,
         content: bytes,
         file_id: str,
         filename: str,
@@ -242,23 +295,19 @@ async def add_document(
         source_created_at: dt.datetime | None = None,
         source_modified_at: dt.datetime | None = None,
     ) -> dict[str, Any]:
-        """Add a document to the knowledge base."""
-        if not self.initialized:
-            await self.initialize()
-        self._maybe_refresh_clients()
+        """Add a document to the knowledge base for the given org."""
+        clients = await self._ensure_org_clients(org_slug)
 
         if self._pool is None:
             raise RuntimeError("RagService not initialized: database pool is None")
-        if self._embedding_service is None:
-            raise RuntimeError("RagService not initialized: embedding service is None")
 
         return await index_document(
             self._pool,
             file_id,
             content,
             filename,
-            embedding_service=self._embedding_service,
-            vision_client=self._vision_client,
+            embedding_service=clients.embedding_service,
+            vision_client=clients.vision_client,
             chunk_size=settings.chunk_size,
             chunk_overlap=settings.chunk_overlap,
             source_created_at=source_created_at,
@@ -267,6 +316,7 @@ async def add_document(
 
     async def search(
         self,
+        org_slug: str,
         query: str,
         *,
         top_k: int | None = None,
@@ -277,24 +327,19 @@ async def search(
 
         Embedding token usage available via `self.last_search_usage` after call.
         """
-        if not self.initialized:
-            await self.initialize()
-        self._maybe_refresh_clients()
-
-        if self._search_service is None:
-            raise RuntimeError("RagService not initialized: search service is None")
+        clients = await self._ensure_org_clients(org_slug)
 
         effective_top_k = top_k if top_k is not None else settings.top_k
         threshold = similarity_threshold if similarity_threshold is not None else settings.similarity_threshold
 
-        results = await self._search_service.search(
+        results = await clients.search_service.search(
             query,
             file_ids=file_ids,
             top_k=effective_top_k,
             similarity_threshold=threshold,
         )
 
-        self.last_search_usage = getattr(self._search_service, "last_search_usage", None)
+        self.last_search_usage = getattr(clients.search_service, "last_search_usage", None)
 
         # If no results and some files are still indexing, wait and retry once
         if not results and file_ids:
@@ -303,33 +348,29 @@ async def search(
             if has_processing:
                 logger.info("No results and some files still indexing, retrying in 3s")
                 await asyncio.sleep(3)
-                results = await self._search_service.search(
+                results = await clients.search_service.search(
                     query,
                     file_ids=file_ids,
                     top_k=effective_top_k,
                     similarity_threshold=threshold,
                 )
-                self.last_search_usage = getattr(self._search_service, "last_search_usage", None)
+                self.last_search_usage = getattr(clients.search_service, "last_search_usage", None)
 
         return results
 
     async def generate(
         self,
+        org_slug: str,
         query: str,
         file_ids: list[str] | None = None,
     ) -> dict[str, Any]:
         """Generate a response using RAG: search -> context assembly -> LLM."""
-        if not self.initialized:
-            await self.initialize()
-        self._maybe_refresh_clients()
-
-        if self._openai_client is None:
-            raise RuntimeError("RagService not initialized: OpenAI client is None")
+        clients = await self._ensure_org_clients(org_slug)
 
         try:
             start_time = time.time()
 
-            search_results = await self.search(query, top_k=RAG_TOP_K, file_ids=file_ids)
+            search_results = await self.search(org_slug, query, top_k=RAG_TOP_K, file_ids=file_ids)
 
             if not search_results:
                 return {
@@ -363,9 +404,9 @@ async def generate(
             context = "\n\n".join(context_parts)
             user_message = f"Context:\n{context}\n\nQuestion: {query}"
 
-            llm_config = settings.get_llm_config()
+            llm_config = clients.llm_config
 
-            completion = await self._openai_client.chat.completions.create(
+            completion = await clients.openai_client.chat.completions.create(
                 model=llm_config["model"],
                 messages=[
                     {"role": "system", "content": SYSTEM_PROMPT},
@@ -417,14 +458,9 @@ async def get_document_content(
     ) -> dict[str, Any] | None:
         """Retrieve document content by reassembling stored chunks.
 
-        Args:
-            file_id: Logical file identifier.
-            chunk_start: First chunk to return (1-indexed).
-            chunk_end: Last chunk to return (1-indexed, inclusive). None = capped by MAX_CHUNK_WINDOW.
-            return_chunks: If True, include individual chunks as a list.
-
-        Returns:
-            Response dict with content and metadata, or None if not found.
+        Does not require an org slug: documents are looked up by file_id
+        in the shared knowledge schema. Access control / tenancy is
+        enforced at the platform → RAG boundary.
         """
         if not self.initialized:
             await self.initialize()
@@ -474,20 +510,7 @@ async def get_document_content(
                 "source_modified_at": doc["source_modified_at"],
             }
 
-        # Reassembly: concatenate each chunk's forward-owning `core_content`
-        # span. By construction, "".join(core_content) equals the original
-        # ingested text (see tale_knowledge.chunking.splitter tests), so
-        # overlap regions between adjacent chunks appear exactly once —
-        # fixing the duplicate-content bug the old "\n\n".join(chunk_content)
-        # exhibited.
-        #
-        # Per-document reindex is atomic (see _do_store), so a document's
-        # chunks are either all migrated (core_content populated) or all
-        # legacy (core_content == ''). Mixed state within one document is
-        # not possible. Falling back to the old stitching for legacy docs
-        # preserves correctness (no lost text) with today's known
-        # duplicate-content behavior until reindex completes.
-        # The fallback + chunk_content column disappear in Phase 5.
+        # Reassembly: see chunking docs.
         all_migrated = all(row["core_content"] for row in rows)
         if all_migrated:
             combined = "".join(row["core_content"] for row in rows)
@@ -521,11 +544,7 @@ async def get_document_statuses(
         """Get statuses for multiple documents by file_id.
 
         Returns a dict mapping file_id to status info or None if not found.
-        When a document has multiple scope rows, priority is: processing > failed > completed.
-
-        If ANY scope row is still processing, the document is considered processing.
-        This ensures reindex operations are visible even when other scope rows
-        remain completed.
+        Org-agnostic (status lookup uses the shared knowledge schema).
         """
         if not self.initialized:
             await self.initialize()
@@ -571,7 +590,11 @@ async def delete_document(
         self,
         file_id: str,
     ) -> dict[str, Any]:
-        """Delete a document and its chunks from the knowledge base."""
+        """Delete a document and its chunks from the knowledge base.
+
+        Org-agnostic: file_id is globally unique in this schema. Access
+        control is enforced at the platform → RAG boundary.
+        """
         if not self.initialized:
             await self.initialize()
 
@@ -627,8 +650,7 @@ async def compare_documents(
     ) -> dict[str, Any] | None:
         """Compare two documents using deterministic paragraph-level diffing.
 
-        Fetches both documents in parallel. Returns structured diff with
-        change blocks, or an error dict when a document is not found.
+        Org-agnostic — operates on stored documents by file_id.
         """
         from .diff_service import compute_diff
 
@@ -640,7 +662,11 @@ async def compare_documents(
         if base is None:
             return {"error": "not_found", "file_id": base_file_id, "role": "base"}
         if comp is None:
-            return {"error": "not_found", "file_id": comparison_file_id, "role": "comparison"}
+            return {
+                "error": "not_found",
+                "file_id": comparison_file_id,
+                "role": "comparison",
+            }
 
         diff_result = compute_diff(
             base["content"],
@@ -663,6 +689,7 @@ async def compare_documents(
 
     async def compare_files(
         self,
+        org_slug: str,
         base_bytes: bytes,
         base_filename: str,
         comparison_bytes: bytes,
@@ -672,10 +699,10 @@ async def compare_files(
     ) -> dict[str, Any]:
         """Compare two uploaded files using deterministic paragraph-level diffing.
 
-        Extracts text directly from file bytes — no database storage or embedding.
-        Text extraction runs in parallel for both files via asyncio.gather.
+        Extracts text directly from file bytes — uses the org's vision
+        client for OCR-able formats. No database storage or embedding.
         """
-        self._maybe_refresh_clients()
+        clients = await self._ensure_org_clients(org_slug)
 
         from tale_knowledge.extraction import extract_text
 
@@ -684,8 +711,12 @@ async def compare_files(
         t0 = time.time()
 
         (base_text, _), (comp_text, _) = await asyncio.gather(
-            extract_text(base_bytes, base_filename, vision_client=self._vision_client),
-            extract_text(comparison_bytes, comparison_filename, vision_client=self._vision_client),
+            extract_text(base_bytes, base_filename, vision_client=clients.vision_client),
+            extract_text(
+                comparison_bytes,
+                comparison_filename,
+                vision_client=clients.vision_client,
+            ),
         )
 
         extraction_ms = (time.time() - t0) * 1000
@@ -713,7 +744,31 @@ async def compare_files(
         return result
 
     async def shutdown(self) -> None:
-        """Clean shutdown — close pool."""
+        """Clean shutdown — close pool and all per-org clients."""
+        # Best-effort close of each org's clients before tearing down the pool.
+        for org_slug, clients in list(self._org_clients.items()):
+            try:
+                await clients.embedding_service.close()
+            except Exception:
+                logger.warning(
+                    "Failed to close embedding_service for org '{}'",
+                    org_slug,
+                    exc_info=True,
+                )
+            try:
+                await clients.openai_client.close()
+            except Exception:
+                logger.warning("Failed to close openai_client for org '{}'", org_slug, exc_info=True)
+            if clients.vision_client is not None:
+                try:
+                    await clients.vision_client.close()
+                except Exception:
+                    logger.warning(
+                        "Failed to close vision_client for org '{}'",
+                        org_slug,
+                        exc_info=True,
+                    )
+        self._org_clients.clear()
         await close_pool()
         self.initialized = False
 
diff --git a/services/rag/tests/test_background_ingest.py b/services/rag/tests/test_background_ingest.py
index 819fb81d7c..f7357dd15b 100644
--- a/services/rag/tests/test_background_ingest.py
+++ b/services/rag/tests/test_background_ingest.py
@@ -17,6 +17,8 @@
 
 import pytest
 
+TEST_ORG = "test-org"
+
 pytestmark = pytest.mark.asyncio
 
 
@@ -198,6 +200,7 @@ async def test_successful_ingestion(self):
         ):
             mock_rag.add_document = AsyncMock(return_value=add_result)
             await _background_ingest(
+                TEST_ORG,
                 b"content",
                 "doc-1",
                 "test.txt",
@@ -223,6 +226,7 @@ async def test_skipped_content_marks_completed(self):
         ):
             mock_rag.add_document = AsyncMock(return_value=add_result)
             await _background_ingest(
+                TEST_ORG,
                 b"content",
                 "doc-1",
                 "test.txt",
@@ -247,6 +251,7 @@ async def test_non_skipped_does_not_call_mark_completed(self):
         ):
             mock_rag.add_document = AsyncMock(return_value=add_result)
             await _background_ingest(
+                TEST_ORG,
                 b"content",
                 "doc-1",
                 "test.txt",
@@ -264,6 +269,7 @@ async def test_ingestion_failure_records_sanitized_error(self):
         ):
             mock_rag.add_document = AsyncMock(side_effect=RuntimeError("x" * 1000))
             await _background_ingest(
+                TEST_ORG,
                 b"content",
                 "doc-1",
                 "test.txt",
@@ -287,6 +293,7 @@ async def test_record_failure_error_does_not_propagate(self):
         ):
             mock_rag.add_document = AsyncMock(side_effect=ValueError("ingestion failed"))
             await _background_ingest(
+                TEST_ORG,
                 b"content",
                 "doc-1",
                 "test.txt",
@@ -313,6 +320,7 @@ async def test_forwards_source_timestamps_to_add_document(self):
         ):
             mock_rag.add_document = AsyncMock(return_value=add_result)
             await _background_ingest(
+                TEST_ORG,
                 b"content",
                 "doc-1",
                 "test.txt",
@@ -321,6 +329,7 @@ async def test_forwards_source_timestamps_to_add_document(self):
             )
 
         mock_rag.add_document.assert_awaited_once_with(
+            TEST_ORG,
             content=b"content",
             file_id="doc-1",
             filename="test.txt",
@@ -338,6 +347,7 @@ async def test_cleanup_memory_always_called(self):
         ):
             mock_rag.add_document = AsyncMock(side_effect=RuntimeError("boom"))
             await _background_ingest(
+                TEST_ORG,
                 b"content",
                 "doc-1",
                 "test.txt",
diff --git a/services/rag/tests/test_compare_files.py b/services/rag/tests/test_compare_files.py
index fc1e383c3e..29c8efe157 100644
--- a/services/rag/tests/test_compare_files.py
+++ b/services/rag/tests/test_compare_files.py
@@ -15,21 +15,49 @@
 
 pytestmark = pytest.mark.asyncio
 
+TEST_ORG = "test-org"
+
 
 def _make_service():
-    """Create a RagService with all internal dependencies pre-mocked."""
-    from app.services.rag_service import RagService
+    """Create a RagService with all internal dependencies pre-mocked.
+
+    Pre-seeds the per-org client cache for `TEST_ORG` so compare_files
+    doesn't trigger the lazy-init / provider-catalog path.
+    """
+    from app.services.rag_service import RagService, _OrgClients
 
     service = RagService()
     service.initialized = True
     service._pool = MagicMock()
-    service._embedding_service = AsyncMock()
-    service._vision_client = MagicMock()
-    service._search_service = AsyncMock()
-    service._openai_client = AsyncMock()
-    service._llm_config = {}
-    service._vision_config = None
-    service._last_config_check = time.monotonic()
+    service._pinned_dims = 1536
+
+    embedding = AsyncMock()
+    embedding.dimensions = 1536
+    openai_client = AsyncMock()
+    vision_client = MagicMock()
+    search_service = AsyncMock()
+
+    service._org_clients[TEST_ORG] = _OrgClients(
+        llm_config={
+            "model": "gpt-test",
+            "embedding_model": "embed-test",
+            "api_key": "k",
+            "base_url": "http://test",
+            "embedding_api_key": "k",
+            "embedding_base_url": "http://test",
+        },
+        vision_config=None,
+        embedding_service=embedding,
+        openai_client=openai_client,
+        vision_client=vision_client,
+        search_service=search_service,
+        last_check=time.monotonic(),
+    )
+    # Back-compat aliases for tests that grab mocks off the service.
+    service._search_service = search_service
+    service._openai_client = openai_client
+    service._embedding_service = embedding
+    service._vision_client = vision_client
     return service
 
 
@@ -45,6 +73,7 @@ async def mock_extract(content_bytes, filename, *, vision_client=None):
 
         with patch("tale_knowledge.extraction.extract_text", side_effect=mock_extract):
             result = await service.compare_files(
+                TEST_ORG,
                 b"Section 1\n\nOriginal clause.",
                 "base.txt",
                 b"Section 1\n\nModified clause.",
@@ -66,6 +95,7 @@ async def mock_extract(content_bytes, filename, *, vision_client=None):
 
         with patch("tale_knowledge.extraction.extract_text", side_effect=mock_extract):
             result = await service.compare_files(
+                TEST_ORG,
                 content,
                 "a.txt",
                 content,
@@ -90,6 +120,7 @@ async def mock_extract(content_bytes, filename, *, vision_client=None):
             pytest.raises(ValueError, match="No text could be extracted from base file"),
         ):
             await service.compare_files(
+                TEST_ORG,
                 b"",
                 "empty.txt",
                 b"content",
@@ -109,6 +140,7 @@ async def mock_extract(content_bytes, filename, *, vision_client=None):
             pytest.raises(ValueError, match="No text could be extracted from comparison file"),
         ):
             await service.compare_files(
+                TEST_ORG,
                 b"content",
                 "base.txt",
                 b"",
@@ -130,6 +162,7 @@ async def mock_extract(content_bytes, filename, *, vision_client=None):
 
         with patch("tale_knowledge.extraction.extract_text", side_effect=mock_extract):
             result = await service.compare_files(
+                TEST_ORG,
                 b"Section 1\n\nParagraph A.",
                 "base.txt",
                 b"Section 1\n\nParagraph B.",
@@ -152,6 +185,7 @@ async def mock_extract(content_bytes, filename, *, vision_client=None):
 
         with patch("tale_knowledge.extraction.extract_text", side_effect=mock_extract):
             result = await service.compare_files(
+                TEST_ORG,
                 base.encode(),
                 "base.txt",
                 comp.encode(),
@@ -205,6 +239,7 @@ async def test_happy_path(self):
             async with AsyncClient(transport=transport, base_url="http://test") as client:
                 response = await client.post(
                     "/api/v1/documents/compare-files",
+                    headers={"X-Tale-Org": TEST_ORG},
                     files={
                         "base_file": ("base.txt", b"Hello\n\nWorld", "text/plain"),
                         "comparison_file": ("comp.txt", b"Hello\n\nEarth", "text/plain"),
@@ -222,6 +257,7 @@ async def test_unsupported_extension(self):
             async with AsyncClient(transport=transport, base_url="http://test") as client:
                 response = await client.post(
                     "/api/v1/documents/compare-files",
+                    headers={"X-Tale-Org": TEST_ORG},
                     files={
                         "base_file": ("base.exe", b"binary", "application/octet-stream"),
                         "comparison_file": ("comp.txt", b"text", "text/plain"),
@@ -241,6 +277,7 @@ async def test_extraction_failure_returns_422(self):
             async with AsyncClient(transport=transport, base_url="http://test") as client:
                 response = await client.post(
                     "/api/v1/documents/compare-files",
+                    headers={"X-Tale-Org": TEST_ORG},
                     files={
                         "base_file": ("empty.pdf", b"fake-pdf", "application/pdf"),
                         "comparison_file": ("comp.pdf", b"fake-pdf", "application/pdf"),
diff --git a/services/rag/tests/test_config.py b/services/rag/tests/test_config.py
index 0b46d913bb..4b9f964047 100644
--- a/services/rag/tests/test_config.py
+++ b/services/rag/tests/test_config.py
@@ -26,7 +26,7 @@ class TestGetLlmConfig:
     def test_returns_valid_config(self, mock_chat, mock_embed):
         with patch.dict(os.environ, {}, clear=True):
             s = Settings()
-            config = s.get_llm_config()
+            config = s.get_llm_config("default")
         assert config["provider"] == "openai"
         assert config["api_key"] == "sk-test"
         assert config["base_url"] == "https://openrouter.ai/api/v1"
@@ -42,7 +42,7 @@ def test_missing_chat_model_raises(self, mock_chat, mock_embed):
         with patch.dict(os.environ, {}, clear=True):
             s = Settings()
             with pytest.raises(ValueError, match="No chat model"):
-                s.get_llm_config()
+                s.get_llm_config("default")
 
     @patch(
         "tale_shared.config.base._provider_embedding_model",
@@ -53,14 +53,14 @@ def test_missing_embedding_model_raises(self, mock_chat, mock_embed):
         with patch.dict(os.environ, {}, clear=True):
             s = Settings()
             with pytest.raises(ValueError, match="No embedding model"):
-                s.get_llm_config()
+                s.get_llm_config("default")
 
     @patch("tale_shared.config.base._provider_embedding_model", return_value=_mock_embedding_model())
     @patch("tale_shared.config.base._provider_chat_model", return_value=_mock_chat_model())
     def test_optional_max_tokens_included_when_set(self, mock_chat, mock_embed):
         with patch.dict(os.environ, {"RAG_OPENAI_MAX_TOKENS": "4096"}, clear=True):
             s = Settings()
-            config = s.get_llm_config()
+            config = s.get_llm_config("default")
         assert config["max_tokens"] == 4096
 
     @patch("tale_shared.config.base._provider_embedding_model", return_value=_mock_embedding_model())
@@ -68,7 +68,7 @@ def test_optional_max_tokens_included_when_set(self, mock_chat, mock_embed):
     def test_optional_temperature_included_when_set(self, mock_chat, mock_embed):
         with patch.dict(os.environ, {"RAG_OPENAI_TEMPERATURE": "0.7"}, clear=True):
             s = Settings()
-            config = s.get_llm_config()
+            config = s.get_llm_config("default")
         assert config["temperature"] == pytest.approx(0.7)
 
     @patch("tale_shared.config.base._provider_embedding_model", return_value=_mock_embedding_model())
@@ -76,7 +76,7 @@ def test_optional_temperature_included_when_set(self, mock_chat, mock_embed):
     def test_max_tokens_omitted_when_not_set(self, mock_chat, mock_embed):
         with patch.dict(os.environ, {}, clear=True):
             s = Settings()
-            config = s.get_llm_config()
+            config = s.get_llm_config("default")
         assert "max_tokens" not in config
 
     @patch("tale_shared.config.base._provider_embedding_model", return_value=_mock_embedding_model())
@@ -84,7 +84,7 @@ def test_max_tokens_omitted_when_not_set(self, mock_chat, mock_embed):
     def test_temperature_omitted_when_not_set(self, mock_chat, mock_embed):
         with patch.dict(os.environ, {}, clear=True):
             s = Settings()
-            config = s.get_llm_config()
+            config = s.get_llm_config("default")
         assert "temperature" not in config
 
 
@@ -93,7 +93,7 @@ class TestGetVisionModel:
     def test_returns_model_from_provider(self, mock_provider):
         with patch.dict(os.environ, {}, clear=True):
             s = Settings()
-            assert s.get_vision_model() == "gpt-4o"
+            assert s.get_vision_model("default") == "gpt-4o"
 
     @patch(
         "tale_shared.config.base._provider_vision_model",
@@ -103,4 +103,4 @@ def test_missing_provider_raises(self, mock_provider):
         with patch.dict(os.environ, {}, clear=True):
             s = Settings()
             with pytest.raises(ValueError, match="No vision model"):
-                s.get_vision_model()
+                s.get_vision_model("default")
diff --git a/services/rag/tests/test_document_helpers.py b/services/rag/tests/test_document_helpers.py
index 512dc8cf27..5f2a710413 100644
--- a/services/rag/tests/test_document_helpers.py
+++ b/services/rag/tests/test_document_helpers.py
@@ -4,8 +4,8 @@
 - _validate_file_extension: supported, unsupported, no extension
 - _parse_metadata: valid JSON, invalid JSON, non-dict JSON, None
 - SUPPORTED_EXTENSIONS: excludes legacy Office formats (.doc, .ppt, .xls)
-- Settings.get_embedding_dimensions(): via provider files
-- Settings.get_llm_config(): via provider files
+- Settings.get_embedding_dimensions("default"): via provider files
+- Settings.get_llm_config("default"): via provider files
 """
 
 import os
@@ -204,13 +204,13 @@ def _mock_embedding_model():
 
 
 class TestGetEmbeddingDimensions:
-    """Settings.get_embedding_dimensions() from provider files."""
+    """Settings.get_embedding_dimensions("default") from provider files."""
 
     @patch("tale_shared.config.base._provider_embedding_model", return_value=_mock_embedding_model())
     def test_valid_dimensions(self, mock_provider):
         with patch.dict(os.environ, {}, clear=True):
             s = Settings()
-            assert s.get_embedding_dimensions() == 1536
+            assert s.get_embedding_dimensions("default") == 1536
 
     @patch(
         "tale_shared.config.base._provider_embedding_model",
@@ -219,7 +219,7 @@ def test_valid_dimensions(self, mock_provider):
     def test_large_dimensions(self, mock_provider):
         with patch.dict(os.environ, {}, clear=True):
             s = Settings()
-            assert s.get_embedding_dimensions() == 3072
+            assert s.get_embedding_dimensions("default") == 3072
 
     @patch(
         "tale_shared.config.base._provider_embedding_model",
@@ -229,18 +229,18 @@ def test_missing_provider_raises(self, mock_provider):
         with patch.dict(os.environ, {}, clear=True):
             s = Settings()
             with pytest.raises(ValueError, match="No embedding model"):
-                s.get_embedding_dimensions()
+                s.get_embedding_dimensions("default")
 
 
 class TestGetLlmConfig:
-    """Settings.get_llm_config() from provider files."""
+    """Settings.get_llm_config("default") from provider files."""
 
     @patch("tale_shared.config.base._provider_embedding_model", return_value=_mock_embedding_model())
     @patch("tale_shared.config.base._provider_chat_model", return_value=_mock_chat_model())
     def test_all_present_returns_valid_config(self, mock_chat, mock_embed):
         with patch.dict(os.environ, {}, clear=True):
             s = Settings()
-            config = s.get_llm_config()
+            config = s.get_llm_config("default")
         assert config["provider"] == "openai"
         assert config["api_key"] == "sk-test"
         assert config["base_url"] == "https://openrouter.ai/api/v1"
@@ -256,7 +256,7 @@ def test_missing_chat_model_raises(self, mock_chat, mock_embed):
         with patch.dict(os.environ, {}, clear=True):
             s = Settings()
             with pytest.raises(ValueError, match="No chat model"):
-                s.get_llm_config()
+                s.get_llm_config("default")
 
     @patch(
         "tale_shared.config.base._provider_embedding_model",
@@ -267,14 +267,14 @@ def test_missing_embedding_model_raises(self, mock_chat, mock_embed):
         with patch.dict(os.environ, {}, clear=True):
             s = Settings()
             with pytest.raises(ValueError, match="No embedding model"):
-                s.get_llm_config()
+                s.get_llm_config("default")
 
     @patch("tale_shared.config.base._provider_embedding_model", return_value=_mock_embedding_model())
     @patch("tale_shared.config.base._provider_chat_model", return_value=_mock_chat_model())
     def test_optional_max_tokens_included_when_set(self, mock_chat, mock_embed):
         with patch.dict(os.environ, {"RAG_OPENAI_MAX_TOKENS": "4096"}, clear=True):
             s = Settings()
-            config = s.get_llm_config()
+            config = s.get_llm_config("default")
         assert config["max_tokens"] == 4096
 
     @patch("tale_shared.config.base._provider_embedding_model", return_value=_mock_embedding_model())
@@ -282,7 +282,7 @@ def test_optional_max_tokens_included_when_set(self, mock_chat, mock_embed):
     def test_optional_temperature_included_when_set(self, mock_chat, mock_embed):
         with patch.dict(os.environ, {"RAG_OPENAI_TEMPERATURE": "0.7"}, clear=True):
             s = Settings()
-            config = s.get_llm_config()
+            config = s.get_llm_config("default")
         assert config["temperature"] == pytest.approx(0.7)
 
     @patch("tale_shared.config.base._provider_embedding_model", return_value=_mock_embedding_model())
@@ -290,7 +290,7 @@ def test_optional_temperature_included_when_set(self, mock_chat, mock_embed):
     def test_max_tokens_omitted_when_not_set(self, mock_chat, mock_embed):
         with patch.dict(os.environ, {}, clear=True):
             s = Settings()
-            config = s.get_llm_config()
+            config = s.get_llm_config("default")
         assert "max_tokens" not in config
 
     @patch("tale_shared.config.base._provider_embedding_model", return_value=_mock_embedding_model())
@@ -298,5 +298,5 @@ def test_max_tokens_omitted_when_not_set(self, mock_chat, mock_embed):
     def test_temperature_omitted_when_not_set(self, mock_chat, mock_embed):
         with patch.dict(os.environ, {}, clear=True):
             s = Settings()
-            config = s.get_llm_config()
+            config = s.get_llm_config("default")
         assert "temperature" not in config
diff --git a/services/rag/tests/test_rag_service.py b/services/rag/tests/test_rag_service.py
index e80c594d84..e5b26bccb7 100644
--- a/services/rag/tests/test_rag_service.py
+++ b/services/rag/tests/test_rag_service.py
@@ -18,24 +18,52 @@
 
 pytestmark = pytest.mark.asyncio
 
+TEST_ORG = "test-org"
+
 
 def _make_service():
     """Create a RagService with all internal dependencies pre-mocked.
 
-    Bypasses initialize() by directly setting the internal state.
+    Bypasses initialize() by directly setting the internal state, and
+    pre-seeds the per-org client cache for `TEST_ORG` so tests don't
+    have to drive the lazy-init path.
     """
-    from app.services.rag_service import RagService
+    from app.services.rag_service import RagService, _OrgClients
 
     service = RagService()
     service.initialized = True
     service._pool = MagicMock()
-    service._embedding_service = AsyncMock()
-    service._vision_client = MagicMock()
-    service._search_service = AsyncMock()
-    service._openai_client = AsyncMock()
-    service._llm_config = {}
-    service._vision_config = None
-    service._last_config_check = time.monotonic()
+    service._pinned_dims = 1536
+
+    embedding = AsyncMock()
+    embedding.dimensions = 1536
+    openai_client = AsyncMock()
+    vision_client = MagicMock()
+    search_service = AsyncMock()
+
+    service._org_clients[TEST_ORG] = _OrgClients(
+        llm_config={
+            "model": "gpt-test",
+            "embedding_model": "embed-test",
+            "api_key": "k",
+            "base_url": "http://test",
+            "embedding_api_key": "k",
+            "embedding_base_url": "http://test",
+        },
+        vision_config=None,
+        embedding_service=embedding,
+        openai_client=openai_client,
+        vision_client=vision_client,
+        search_service=search_service,
+        last_check=time.monotonic(),
+    )
+    # Back-compat aliases for tests that grab the mocks directly off the
+    # service. Both names point at the SAME mock instance the per-org
+    # cache uses, so setup-then-assert via either attribute works.
+    service._search_service = search_service
+    service._openai_client = openai_client
+    service._embedding_service = embedding
+    service._vision_client = vision_client
     return service
 
 
@@ -77,6 +105,7 @@ async def test_user_calls_index_document(self):
             "app.services.rag_service.index_document", new_callable=AsyncMock, return_value=index_result
         ) as mock_idx:
             result = await service.add_document(
+                TEST_ORG,
                 b"content bytes",
                 "doc-1",
                 "report.pdf",
@@ -99,6 +128,7 @@ async def test_skipped_returns_skipped(self):
 
         with patch("app.services.rag_service.index_document", new_callable=AsyncMock, return_value=index_result):
             result = await service.add_document(
+                TEST_ORG,
                 b"content",
                 "doc-skip",
                 "file.txt",
@@ -108,7 +138,12 @@ async def test_skipped_returns_skipped(self):
         assert result["skip_reason"] == "content_unchanged"
 
     async def test_initializes_if_not_initialized(self):
-        from app.services.rag_service import RagService
+        """`add_document` triggers `initialize()` (sets up the DB pool)
+        on the first call. Under the multi-org refactor, per-org client
+        construction is deferred even further (lazy on first call for
+        that org), so we pre-seed the cache to bypass _ensure_org_clients
+        and only verify the DB-pool initialize gate fires."""
+        from app.services.rag_service import RagService, _OrgClients
 
         service = RagService()
         assert service.initialized is False
@@ -117,13 +152,31 @@ async def test_initializes_if_not_initialized(self):
 
             def _fake_init():
                 service.initialized = True
-                service._last_config_check = time.monotonic()
-                service._llm_config = {}
-                service._vision_config = None
+                # Pre-seed per-org cache so the inner _ensure_org_clients
+                # call inside add_document doesn't try to read a real
+                # provider catalog from disk.
+                embedding = AsyncMock()
+                embedding.dimensions = 1536
+                service._pinned_dims = 1536
+                service._org_clients[TEST_ORG] = _OrgClients(
+                    llm_config={
+                        "model": "gpt",
+                        "embedding_model": "embed",
+                        "api_key": "k",
+                        "base_url": "u",
+                        "embedding_api_key": "k",
+                        "embedding_base_url": "u",
+                    },
+                    vision_config=None,
+                    embedding_service=embedding,
+                    openai_client=AsyncMock(),
+                    vision_client=MagicMock(),
+                    search_service=AsyncMock(),
+                    last_check=time.monotonic(),
+                )
 
             mock_init.side_effect = _fake_init
             service._pool = MagicMock()
-            service._embedding_service = AsyncMock()
 
             with patch(
                 "app.services.rag_service.index_document",
@@ -136,7 +189,7 @@ def _fake_init():
                     "skip_reason": "x",
                 },
             ):
-                await service.add_document(b"x", "d", "f.txt")
+                await service.add_document(TEST_ORG, b"x", "d", "f.txt")
 
         mock_init.assert_awaited_once()
 
@@ -156,7 +209,7 @@ async def test_delegates_to_search_service(self):
         with patch("app.services.rag_service.settings") as mock_settings:
             mock_settings.top_k = 10
             mock_settings.similarity_threshold = 0.0
-            results = await service.search("test query", file_ids=["doc-1"])
+            results = await service.search(TEST_ORG, "test query", file_ids=["doc-1"])
 
         assert len(results) == 2
         service._search_service.search.assert_awaited_once_with(
@@ -173,7 +226,7 @@ async def test_applies_similarity_threshold(self):
         with patch("app.services.rag_service.settings") as mock_settings:
             mock_settings.top_k = 10
             mock_settings.similarity_threshold = 0.7
-            await service.search("query")
+            await service.search(TEST_ORG, "query")
 
         # Threshold is now passed to search_service for vector pre-filtering
         service._search_service.search.assert_awaited_once_with(
@@ -190,7 +243,7 @@ async def test_custom_top_k_overrides_settings(self):
         with patch("app.services.rag_service.settings") as mock_settings:
             mock_settings.top_k = 5
             mock_settings.similarity_threshold = 0.0
-            await service.search("query", top_k=20)
+            await service.search(TEST_ORG, "query", top_k=20)
 
         service._search_service.search.assert_awaited_once_with(
             "query",
@@ -210,7 +263,7 @@ async def test_custom_threshold_overrides_settings(self):
         with patch("app.services.rag_service.settings") as mock_settings:
             mock_settings.top_k = 10
             mock_settings.similarity_threshold = 0.9
-            results = await service.search("query", similarity_threshold=0.3)
+            results = await service.search(TEST_ORG, "query", similarity_threshold=0.3)
 
         assert len(results) == 1
 
@@ -225,7 +278,7 @@ async def test_zero_threshold_returns_all(self):
         with patch("app.services.rag_service.settings") as mock_settings:
             mock_settings.top_k = 10
             mock_settings.similarity_threshold = 0.0
-            results = await service.search("query")
+            results = await service.search(TEST_ORG, "query")
 
         assert len(results) == 1
 
@@ -237,7 +290,7 @@ async def test_passes_file_ids(self):
         with patch("app.services.rag_service.settings") as mock_settings:
             mock_settings.top_k = 10
             mock_settings.similarity_threshold = 0.0
-            await service.search("q", file_ids=["doc-1", "doc-2"])
+            await service.search(TEST_ORG, "q", file_ids=["doc-1", "doc-2"])
 
         service._search_service.search.assert_awaited_once_with(
             "q",
@@ -272,7 +325,7 @@ async def test_generates_response_with_search_results(self):
             patch("app.services.rag_service.settings") as mock_settings,
         ):
             mock_settings.get_llm_config.return_value = {"model": "gpt-4o-mini"}
-            result = await service.generate("What is X?", file_ids=["doc-1"])
+            result = await service.generate(TEST_ORG, "What is X?", file_ids=["doc-1"])
 
         assert result["success"] is True
         assert result["response"] == "Generated answer based on context."
@@ -288,7 +341,7 @@ async def test_empty_search_results_returns_no_info_message(self):
             new_callable=AsyncMock,
             return_value=[],
         ):
-            result = await service.generate("Unknown topic?")
+            result = await service.generate(TEST_ORG, "Unknown topic?")
 
         assert result["success"] is False
         assert "No relevant information" in result["response"]
@@ -316,7 +369,7 @@ async def test_llm_receives_system_prompt_and_context(self):
             patch("app.services.rag_service.settings") as mock_settings,
         ):
             mock_settings.get_llm_config.return_value = {"model": "test-model"}
-            await service.generate("What?")
+            await service.generate(TEST_ORG, "What?")
 
         create_call = service._openai_client.chat.completions.create
         messages = create_call.call_args[1]["messages"]
@@ -344,7 +397,7 @@ async def test_empty_llm_choices_raises(self):
         ):
             mock_settings.get_llm_config.return_value = {"model": "m"}
             with pytest.raises(ValueError, match="empty choices"):
-                await service.generate("question")
+                await service.generate(TEST_ORG, "question")
 
     async def test_context_truncated_at_max_chars(self):
         from app.services.rag_service import RAG_MAX_CONTEXT_CHARS
@@ -364,7 +417,7 @@ async def test_context_truncated_at_max_chars(self):
             patch("app.services.rag_service.settings") as mock_settings,
         ):
             mock_settings.get_llm_config.return_value = {"model": "m"}
-            result = await service.generate("query")
+            result = await service.generate(TEST_ORG, "query")
 
         create_call = service._openai_client.chat.completions.create
         user_msg = create_call.call_args[1]["messages"][1]["content"]
@@ -374,7 +427,7 @@ async def test_passes_file_ids_to_search(self):
         service = _make_service()
 
         with patch.object(service, "search", new_callable=AsyncMock, return_value=[]) as mock_search:
-            await service.generate("q", file_ids=["doc-1"])
+            await service.generate(TEST_ORG, "q", file_ids=["doc-1"])
 
         mock_search.assert_awaited_once()
         call_kwargs = mock_search.call_args[1]
@@ -399,7 +452,7 @@ async def test_none_content_from_llm_returns_empty_string(self):
             patch("app.services.rag_service.settings") as mock_settings,
         ):
             mock_settings.get_llm_config.return_value = {"model": "m"}
-            result = await service.generate("q")
+            result = await service.generate(TEST_ORG, "q")
 
         assert result["response"] == ""
         assert result["success"] is True
diff --git a/tools/cli/src/lib/actions/deploy.ts b/tools/cli/src/lib/actions/deploy.ts
index 6f235fe576..3166dde853 100644
--- a/tools/cli/src/lib/actions/deploy.ts
+++ b/tools/cli/src/lib/actions/deploy.ts
@@ -757,9 +757,15 @@ async function syncProjectFiles(
       `${containerName}:/app/data/`,
     ]);
 
+    // docker cp is non-atomic across the multi-org staging dir: a failure
+    // here means a partial push may have landed in the container. Throw
+    // so the outer `deployToContainer` flow exits non-zero instead of
+    // printing `success('Deployment complete!')` over a half-pushed state.
     if (!result.success) {
-      logger.error(`Failed to override config: ${result.stderr}`);
-      return;
+      throw new Error(
+        `--override docker cp into ${containerName} failed: ${result.stderr?.trim() ?? '(no stderr)'}. ` +
+          `Partial push possible; re-run --override after addressing the cause.`,
+      );
     }
 
     // docker cp copies files as root — fix ownership so the app user can write
@@ -772,6 +778,9 @@ async function syncProjectFiles(
       `/app/data/`,
     ]);
     if (!chownResult.success) {
+      // Ownership fix failure isn't necessarily a push failure (files
+      // landed, just wrong owner), but warn loudly — the app user won't
+      // be able to write to its own data tree.
       logger.warn(
         `Failed to fix ownership on /app/data: ${chownResult.stderr}`,
       );
@@ -799,23 +808,59 @@ async function syncProjectFiles(
 //
 // All directory exclusions prune the entire subtree; `fs.cp` recurses past
 // the filter for any directory the filter returned `true` for. Root-level
-// non-org junk (`.tale/`, `.git/`, `.env`, IDE configs, dotfiles, etc.) is
-// excluded one level up — only org-shaped dirs from `findOrgDirs` reach
-// this function — so the filter here only handles depth-1+ skips.
+// non-org junk is excluded one level up, BUT the same kinds of junk can
+// also appear INSIDE an org dir (e.g. operator commits their workspace as
+// a git repo → `default/.git/`; macOS sprinkles `default/.DS_Store`;
+// someone runs `npm i` in their providers folder → `default/node_modules/`).
+// Filter them here so they never reach `/app/data/<org>/`.
+const STAGED_DOTFILE_DENYLIST = new Set<string>([
+  // Belt-and-suspenders for things we already filter via startsWith('.'),
+  // but listing them makes intent explicit.
+  '.git',
+  '.tale',
+  '.vscode',
+  '.idea',
+  '.DS_Store',
+]);
+const STAGED_NAME_DENYLIST = new Set<string>(['node_modules', '__pycache__']);
 async function stageOrgIntoDir(srcDir: string, destDir: string): Promise<void> {
   await cp(srcDir, destDir, {
     recursive: true,
     filter: (src) => {
       const base = src.split(/[\\/]/).pop() ?? '';
+      // `.history` and `*.secrets.json` are content-preserving filters by
+      // design — survive overwrites on the server side, so we never push
+      // them. Dotfiles (including `.git/`, `.DS_Store`, editor swap files,
+      // etc.) are operator-host junk that should never reach the data
+      // tree. node_modules / __pycache__ catch any non-dotfile package-mgr
+      // litter inside an org dir.
       if (base === '.history') return false;
       if (base.endsWith('.secrets.json')) return false;
+      if (base.startsWith('.')) return false;
+      if (STAGED_DOTFILE_DENYLIST.has(base)) return false;
+      if (STAGED_NAME_DENYLIST.has(base)) return false;
       // lstat is sync here because fs.cp's filter is sync. Symlinks at
       // any depth are skipped; missing entries (ENOENT) also skip rather
       // than throw — fs.cp re-races stat() so any race is benign.
       try {
         const info = lstatSync(src);
         if (info.isSymbolicLink()) return false;
-      } catch {
+      } catch (err: unknown) {
+        // ENOENT on a sibling stat is benign; anything else is worth a
+        // warning so a real permission/IO problem doesn't silently drop
+        // a file.
+        const code =
+          err !== null &&
+          typeof err === 'object' &&
+          'code' in err &&
+          typeof err.code === 'string'
+            ? err.code
+            : undefined;
+        if (code !== 'ENOENT') {
+          console.warn(
+            `[deploy.stageOrgIntoDir] lstat ${src} failed (${code ?? 'unknown'}); skipping`,
+          );
+        }
         return false;
       }
       return true;
diff --git a/tools/cli/src/lib/actions/init.ts b/tools/cli/src/lib/actions/init.ts
index 4cffcbdc8c..c69c23fb67 100644
--- a/tools/cli/src/lib/actions/init.ts
+++ b/tools/cli/src/lib/actions/init.ts
@@ -292,7 +292,9 @@ export async function init(options: InitOptions): Promise<void> {
         { apiKey: envResult.openrouterKey },
         envResult.agePublicKey,
       );
-      await writeFile(secretsPath, encrypted);
+      // 0600: SOPS-encrypted, but least-privilege convention for any
+      // `*.secrets.*` file. Limits readability to the owner.
+      await writeFile(secretsPath, encrypted, { mode: 0o600 });
       logger.success(
         'Encrypted provider API key into default/providers/openrouter.secrets.json',
       );
@@ -353,7 +355,21 @@ async function detectTaleProjectFiles(dir: string): Promise<string[]> {
   try {
     const entries = await readdir(dir);
     return entries.filter((entry) => TALE_PROJECT_MARKERS.has(entry));
-  } catch {
+  } catch (err: unknown) {
+    // Most common case: target dir does not exist yet (`tale init` in a
+    // fresh empty dir, or a path the operator just typed). Treat as
+    // empty — non-ENOENT errors are worth a warning so a perms issue
+    // doesn't masquerade as a clean slate.
+    const code =
+      err !== null &&
+      typeof err === 'object' &&
+      'code' in err &&
+      typeof err.code === 'string'
+        ? err.code
+        : undefined;
+    if (code !== 'ENOENT') {
+      console.warn(`[init.detectTaleProjectFiles] readdir ${dir} failed:`, err);
+    }
     return [];
   }
 }
diff --git a/tools/cli/src/lib/actions/reseed-all-orgs.ts b/tools/cli/src/lib/actions/reseed-all-orgs.ts
index 170c819a19..a3e9699001 100644
--- a/tools/cli/src/lib/actions/reseed-all-orgs.ts
+++ b/tools/cli/src/lib/actions/reseed-all-orgs.ts
@@ -5,12 +5,20 @@
  * scripts/2026-03-28-migrate-convex-data.sh:120-131 (source env.sh,
  * ensure_instance_secret, compute admin key inline, run convex CLI).
  *
- * Destructive: factory-reseeds every org's non-secret config from the
- * builtin catalog. `*.secrets.json` files and `.history/` trails are
- * preserved server-side by `scaffoldNewOrganization({override:true})`.
- * Uploaded branding `images/` survive (branding is treated as a tree
- * with per-file overwrite). Everything else under each `<org>/<domain>/`
- * is overwritten with builtin content.
+ * Destructive: factory-reseeds every registered org's non-secret config
+ * from the builtin catalog. `*.secrets.json` files and `.history/` trails
+ * are preserved server-side by `scaffoldNewOrganization({override:true,
+ * strict:true})`. Uploaded branding `images/` survive (branding is
+ * treated as a tree with per-file overwrite). Everything else under each
+ * `<org>/<domain>/` is overwritten with builtin content.
+ *
+ * Filesystem-only org subtrees (no Better Auth row) are NOT touched —
+ * `--override-all` means "all registered orgs", not "every dir on disk".
+ *
+ * Failure semantics: the convex-side action throws on any per-org failure
+ * (so `bunx convex run` exits non-zero), which surfaces as
+ * `result.success === false` here and is converted to a CLI throw with
+ * the per-org detail attached.
  */
 
 import { confirm } from '../../utils/confirm';
@@ -29,6 +37,12 @@ export interface ReseedAllOrgsOptions {
  * `INSTANCE_SECRET` is guaranteed populated and the admin key derivation
  * matches the entrypoint's own runtime computation.
  *
+ * `--no-push` skips a redundant push step (we're calling an existing
+ * deployed action). The trailing `grep -v` strips `bunx convex run`'s
+ * decorative banner output ("Admin key", "📋", "✅ Admin", separators,
+ * blank lines, etc.) so the final stdout is the action's JSON return
+ * value alone — parseable in TypeScript.
+ *
  * Runtime workdir is `/app` (services/platform/Dockerfile sets
  * `WORKDIR /app`; flattens services/platform/{convex,lib,env.sh,…} into
  * `/app/`). No `cd /app/services/platform` — that path does not exist
@@ -44,15 +58,60 @@ cd /app
 HOME=/home/app timeout 1800 bunx convex run \\
   organizations/reseed_all_orgs:reseedAllOrgsFromBuiltin \\
   --url "\${CONVEX_URL:-http://convex:3210}" \\
-  --admin-key "$ADMIN_KEY"
+  --admin-key "$ADMIN_KEY" \\
+  --no-push 2>&1 \\
+  | grep -v "^Admin key\\|^📋\\|^✅ Admin\\|^━\\|^🌐\\|^$\\|Steps:\\|Open\\|Enter\\|Paste"
 `;
 
 const CONFIRM_MESSAGE =
-  '--override-all will factory-reset every org from the builtin catalog. ' +
+  '--override-all will factory-reset every registered org from the builtin catalog. ' +
   '*.secrets.json files, .history/ trails, and uploaded branding/images/ are preserved; ' +
   'all other config (model lists, agents, workflows, skills, integrations, branding.json, retention.json) ' +
   'is overwritten. Proceed?';
 
+type ReseedResult = {
+  total: number;
+  succeeded: number;
+  failed: number;
+  results: Array<
+    | { slug: string; status: 'ok' }
+    | { slug: string; status: 'error'; error: string }
+  >;
+};
+
+/**
+ * Extract the last JSON object from a stream of mixed-output stdout.
+ * `bunx convex run` prints `null` for void-returning actions or the
+ * action's return value for value-returning ones. Either way, the JSON
+ * payload is on its own line(s) at the very end.
+ */
+function parseTrailingJson(stdout: string): ReseedResult | null {
+  const trimmed = stdout.trim();
+  if (!trimmed) return null;
+
+  // Walk backwards from the end looking for the start of a JSON value.
+  // The action returns an object, so look for the matching `{`.
+  const lastBrace = trimmed.lastIndexOf('{');
+  if (lastBrace < 0) return null;
+
+  try {
+    const parsed = JSON.parse(trimmed.slice(lastBrace));
+    if (
+      parsed &&
+      typeof parsed === 'object' &&
+      typeof parsed.total === 'number' &&
+      typeof parsed.succeeded === 'number' &&
+      typeof parsed.failed === 'number' &&
+      Array.isArray(parsed.results)
+    ) {
+      return parsed as ReseedResult;
+    }
+    return null;
+  } catch {
+    return null;
+  }
+}
+
 export async function reseedAllOrgsFromBuiltin(
   options: ReseedAllOrgsOptions,
 ): Promise<void> {
@@ -78,16 +137,16 @@ export async function reseedAllOrgsFromBuiltin(
   if (dryRun) {
     logger.blank();
     logger.info('[DRY-RUN] Would run:');
-    logger.info(`  docker exec ${container} bash -lc '<reseed script>'`);
-    logger.info('Reseed script body (would be piped into bash):');
+    logger.info(`  docker exec -i ${container} bash -s <<'EOF'`);
     for (const line of RESEED_SCRIPT.split('\n')) {
       logger.info(`  ${line}`);
     }
+    logger.info(`  EOF`);
     return;
   }
 
   logger.blank();
-  logger.step('Reseeding builtin catalog into all orgs...');
+  logger.step('Reseeding builtin catalog into all registered orgs...');
 
   // Pipe the script via stdin instead of embedding in argv — avoids shell
   // escaping pitfalls and keeps the script source readable.
@@ -95,21 +154,33 @@ export async function reseedAllOrgsFromBuiltin(
     stdin: RESEED_SCRIPT,
   });
 
+  // The convex action throws on any per-org failure, which propagates to
+  // `bunx convex run`'s exit code, which propagates to `docker exec`'s
+  // exit code, which becomes `result.success === false` here.
   if (!result.success) {
+    if (result.stdout) {
+      logger.info(result.stdout.trim());
+    }
     if (result.stderr) {
       logger.error(result.stderr.trim());
     }
     throw new Error(
-      `--override-all failed (docker exec into ${container} returned non-zero).`,
+      `--override-all failed: reseed action raised in ${container}. ` +
+        `Per-org detail above; partial state on disk — re-run --override-all ` +
+        `after addressing failures (the action is idempotent).`,
     );
   }
 
-  // The action's return value is printed to stdout by `bunx convex run`.
-  if (result.stdout) {
-    const trimmed = result.stdout.trim();
-    if (trimmed) {
-      logger.info(trimmed);
-    }
+  // All orgs succeeded. Parse and summarize.
+  const parsed = parseTrailingJson(result.stdout);
+  if (parsed) {
+    logger.info(
+      `Reseeded ${parsed.succeeded}/${parsed.total} orgs from builtin catalog.`,
+    );
+  } else if (result.stdout) {
+    // Couldn't parse — surface raw stdout so the operator isn't flying
+    // blind. Should be rare given the grep strip above.
+    logger.info(result.stdout.trim());
   }
 
   logger.success('Reseed complete.');
diff --git a/tools/cli/src/lib/actions/start.ts b/tools/cli/src/lib/actions/start.ts
index 47cdecdf9a..2f86fa701e 100644
--- a/tools/cli/src/lib/actions/start.ts
+++ b/tools/cli/src/lib/actions/start.ts
@@ -147,6 +147,38 @@ export async function start(options: StartOptions): Promise<void> {
     }
   }
 
+  // Detect legacy flat-layout dirs at the project root (`agents/`,
+  // `workflows/`, …). Under the org-first layout these belong under
+  // `default/<domain>/` instead — the platform's resolvers won't read
+  // anything at the old paths. Surface the runbook so the operator
+  // doesn't boot into a "nothing's working" state.
+  const LEGACY_FLAT_DOMAINS = [
+    'agents',
+    'workflows',
+    'integrations',
+    'branding',
+    'providers',
+    'skills',
+  ];
+  const legacyDirsFound = LEGACY_FLAT_DOMAINS.filter((d) =>
+    existsSync(join(projectDir, d)),
+  );
+  if (legacyDirsFound.length > 0) {
+    logger.warn(
+      `Legacy flat layout detected at project root: ${legacyDirsFound.map((d) => `${d}/`).join(', ')}`,
+    );
+    logger.info(
+      '  The org-first layout expects these under `default/<domain>/` (or another org subtree).',
+    );
+    logger.info(
+      '  Migrate with: `tale migrate config-layout` then `tale deploy --override-all -y`.',
+    );
+    logger.info(
+      '  See docs/<locale>/self-hosted/operate/upgrades.md for the full runbook.',
+    );
+    logger.blank();
+  }
+
   await assertDockerAvailable();
 
   // Resolve project ID from tale.json before any Docker-resource naming.
@@ -242,10 +274,10 @@ export async function start(options: StartOptions): Promise<void> {
     }
     logger.blank();
     logger.info(
-      'Agents, workflows, integrations, and branding are bind-mounted from your project.',
+      'Per-org config (`<org>/agents/`, `<org>/workflows/`, `<org>/integrations/`, `<org>/branding/`, `<org>/providers/`, `<org>/skills/`)',
     );
     logger.info(
-      'Edits to agents/, workflows/, integrations/, and branding/ will auto-refresh the browser.',
+      'is bind-mounted from your project. Edits to those paths auto-refresh the browser.',
     );
     logger.blank();
     logger.info(`Stop with: docker compose -p ${getProjectId()}-dev down`);
diff --git a/tools/cli/src/lib/actions/update.ts b/tools/cli/src/lib/actions/update.ts
index 11f4ad7fbb..8d4161d363 100644
--- a/tools/cli/src/lib/actions/update.ts
+++ b/tools/cli/src/lib/actions/update.ts
@@ -73,43 +73,84 @@ export async function update(options: UpdateOptions): Promise<void> {
     await fetchReference(projectDir);
   }
 
-  // Regenerate AI rules files
+  // Read existing checksums BEFORE rewriting rules so we can apply the
+  // same modified/unmodified policy as example files.
+  const oldChecksums = await readChecksums(projectDir);
+  const oldFiles = oldChecksums?.files ?? {};
+
+  // Regenerate AI rules files. Same protection policy as examples:
+  // - new file → write
+  // - deleted by user → skip
+  // - unmodified-since-last-update → overwrite
+  // - locally modified + no --force → keep, warn
+  // - locally modified + --force → overwrite
   logger.step(`${prefix}Updating AI rules files...`);
-  if (!options.dryRun) {
-    const rulesFiles = generateAllRules();
-    for (const { relativePath, content } of rulesFiles) {
-      const destPath = join(projectDir, relativePath);
-      await mkdir(dirname(destPath), { recursive: true });
-      await writeFile(destPath, content);
+  const rulesFiles = generateAllRules();
+  const rulesUpdates: Record<string, string> = {};
+  for (const { relativePath, content } of rulesFiles) {
+    const destPath = join(projectDir, relativePath);
+    const newHash = computeContentHash(content);
+    const oldHash = oldFiles[relativePath];
+
+    if (!oldHash) {
+      logger.info(`${prefix}+ ${relativePath} (new)`);
+      if (!options.dryRun) {
+        await mkdir(dirname(destPath), { recursive: true });
+        await writeFile(destPath, content);
+      }
+      rulesUpdates[relativePath] = newHash;
+    } else if (!existsSync(destPath)) {
+      logger.info(`${prefix}- ${relativePath} (deleted by user, skipping)`);
+    } else {
+      const currentHash = await computeFileHash(destPath);
+      if (currentHash === oldHash) {
+        logger.info(`${prefix}~ ${relativePath} (updated)`);
+        if (!options.dryRun) {
+          await writeFile(destPath, content);
+        }
+        rulesUpdates[relativePath] = newHash;
+      } else if (options.force) {
+        logger.warn(
+          `${prefix}~ ${relativePath} (overwritten, was locally modified)`,
+        );
+        if (!options.dryRun) {
+          await writeFile(destPath, content);
+        }
+        rulesUpdates[relativePath] = newHash;
+      } else {
+        logger.warn(
+          `${prefix}⚠ Skipped ${relativePath} (locally modified). Re-run with --force to overwrite.`,
+        );
+        rulesUpdates[relativePath] = oldHash;
+      }
     }
   }
 
-  // Read existing checksums
-  const oldChecksums = await readChecksums(projectDir);
-  const oldFiles = oldChecksums?.files ?? {};
-
-  // Get new example files from embedded data
+  // Get new example files from embedded data. Paths land under
+  // `default/<domain>/...` to match the org-first layout that
+  // `tale init` scaffolds.
   const newExampleFiles = new Map<string, string>();
+  const DEFAULT_ORG = 'default';
 
   for (const [relPath, content] of getEmbeddedExamples('agents')) {
-    newExampleFiles.set(join('agents', relPath), content);
+    newExampleFiles.set(join(DEFAULT_ORG, 'agents', relPath), content);
   }
   for (const [relPath, content] of getEmbeddedExamples('workflows')) {
-    newExampleFiles.set(join('workflows', relPath), content);
+    newExampleFiles.set(join(DEFAULT_ORG, 'workflows', relPath), content);
   }
   for (const [relPath, content] of getEmbeddedExamples('integrations')) {
-    newExampleFiles.set(join('integrations', relPath), content);
+    newExampleFiles.set(join(DEFAULT_ORG, 'integrations', relPath), content);
   }
   for (const [relPath, content] of getEmbeddedExamples('branding')) {
-    newExampleFiles.set(join('branding', relPath), content);
+    newExampleFiles.set(join(DEFAULT_ORG, 'branding', relPath), content);
   }
   for (const [relPath, content] of getEmbeddedExamples('providers')) {
     if (!relPath.endsWith('.secrets.json')) {
-      newExampleFiles.set(join('providers', relPath), content);
+      newExampleFiles.set(join(DEFAULT_ORG, 'providers', relPath), content);
     }
   }
   for (const [relPath, content] of getEmbeddedExamples('skills')) {
-    newExampleFiles.set(join('skills', relPath), content);
+    newExampleFiles.set(join(DEFAULT_ORG, 'skills', relPath), content);
   }
 
   // Classify and apply changes
@@ -120,7 +161,9 @@ export async function update(options: UpdateOptions): Promise<void> {
     removed: [],
   };
 
-  const newChecksumFiles: Record<string, string> = {};
+  // Seed checksum map with the rules-file decisions so the final write
+  // includes their hashes (so future updates can detect local edits).
+  const newChecksumFiles: Record<string, string> = { ...rulesUpdates };
 
   for (const [relPath, content] of newExampleFiles) {
     const destPath = join(projectDir, relPath);
diff --git a/tools/cli/src/lib/compose/generators/constants.ts b/tools/cli/src/lib/compose/generators/constants.ts
index 616e81ae61..a192ff98df 100644
--- a/tools/cli/src/lib/compose/generators/constants.ts
+++ b/tools/cli/src/lib/compose/generators/constants.ts
@@ -5,8 +5,12 @@ export const DEV_VOLUME_NAMES = [
   'db-data',
   'db-backup',
   'rag-data',
-  // Retained for legacy migration (used by `tale migrate split-convex` to
-  // locate pre-split data). Not mounted by any container after Phase 2.
+  // Legacy: pre-0.3.0 deployments split platform and convex data; today
+  // everything lives in `convex-data`. The volume is retained as an
+  // unused stub so the detect() probe in start.ts can identify pre-0.3.0
+  // deployments and produce a coherent diff. Operators can delete it
+  // by hand once they're past the upgrade window. Do not remove this
+  // entry without coordinating with that detect() heuristic.
   'platform-data',
   'convex-data',
   'caddy-data',
@@ -18,9 +22,7 @@ export const DEV_VOLUME_NAMES = [
 // Every volume declared as `external: true` in the stateful or color compose
 // must appear here so `ensureVolumes` pre-creates it.
 export const REQUIRED_VOLUMES = [
-  // platform-data is kept for upgrade scenarios where split-convex migrates
-  // its contents into convex-data; on fresh installs it is an unused empty
-  // volume. Removing it would break detect() for pre-0.3.0 deployments.
+  // See DEV_VOLUME_NAMES for the `platform-data` rationale.
   'platform-data',
   'convex-data',
   'caddy-data',
diff --git a/tools/cli/src/lib/compose/generators/generate-dev-compose.ts b/tools/cli/src/lib/compose/generators/generate-dev-compose.ts
index 83e5344940..73f0c11913 100644
--- a/tools/cli/src/lib/compose/generators/generate-dev-compose.ts
+++ b/tools/cli/src/lib/compose/generators/generate-dev-compose.ts
@@ -1,4 +1,4 @@
-import { existsSync } from 'node:fs';
+import { existsSync, readdirSync, statSync } from 'node:fs';
 import { join } from 'node:path';
 
 import { stringify } from 'yaml';
@@ -17,14 +17,18 @@ import type { ComposeConfig, ServiceConfig } from '../types';
 import { DEV_VOLUME_NAMES } from './constants';
 
 const DEV_COLOR = 'blue' as const;
-/** Project-root subdirs that `tale init` populates from embedded examples. */
-const HOST_CONFIG_DIRS = [
+/** Domain dirs that the org-first layout uses under `<projectDir>/<org>/`. */
+const HOST_DOMAIN_DIRS = [
   'agents',
   'workflows',
   'integrations',
   'branding',
   'providers',
+  'skills',
 ] as const;
+/** Org-slug regex aligned with the platform-side validator. Refuses dotfiles
+ *  and any non-org-shaped dir at the project root (`.tale`, `.git`, etc.). */
+const ORG_SLUG_RE = /^[a-z0-9][a-z0-9_-]{0,63}$/;
 
 interface DevComposeOptions {
   /** Project root, used to verify host bind-mount sources exist before
@@ -33,23 +37,59 @@ interface DevComposeOptions {
   projectDir?: string;
 }
 
-/** Return host bind-mount fragments (e.g. './agents:/app/data/agents{ro}')
- *  only for directories that actually exist on the host, with one warning
- *  per missing directory so the operator can fix it without docker emitting
- *  a confusing 'no such file or directory' error. */
+/** Discover org subdirectories (`<projectDir>/<org>/`) by enumerating the
+ *  project root. Every direct subdir whose name matches the org-slug regex
+ *  is an org. `tale init` always creates at least `default/`. */
+function findOrgDirs(projectDir: string): string[] {
+  let entries: string[];
+  try {
+    entries = readdirSync(projectDir);
+  } catch {
+    return [];
+  }
+  const orgs: string[] = [];
+  for (const name of entries) {
+    if (!ORG_SLUG_RE.test(name)) continue;
+    let stats: ReturnType<typeof statSync>;
+    try {
+      stats = statSync(join(projectDir, name));
+    } catch {
+      continue;
+    }
+    if (!stats.isDirectory()) continue;
+    orgs.push(name);
+  }
+  return orgs;
+}
+
+/** Return host bind-mount fragments for the org-first layout.
+ *
+ *  For each org `<root>/<org>/`, emits one mount per domain dir that
+ *  actually exists: `./<org>/<domain>:<containerBase>/<org>/<domain>{ro}`.
+ *  Missing per-domain dirs are skipped silently (operators don't have to
+ *  populate every domain), but a `tale init` workspace with no org dirs
+ *  at all logs a single warning. */
 function existingHostMounts(
   projectDir: string,
   containerBase: string,
   suffix = '',
 ): string[] {
+  const orgs = findOrgDirs(projectDir);
+  if (orgs.length === 0) {
+    logger.warn(
+      `No org directories found under ${projectDir}. Container will fall back to convex-data volume contents — host edits will not hot-reload.`,
+    );
+    return [];
+  }
   const mounts: string[] = [];
-  for (const dir of HOST_CONFIG_DIRS) {
-    if (existsSync(join(projectDir, dir))) {
-      mounts.push(`./${dir}:${containerBase}/${dir}${suffix}`);
-    } else {
-      logger.warn(
-        `Skipping host bind mount for ./${dir} (directory not found in project root). Container will fall back to convex-data volume contents.`,
-      );
+  for (const org of orgs) {
+    for (const domain of HOST_DOMAIN_DIRS) {
+      const src = join(projectDir, org, domain);
+      if (existsSync(src)) {
+        mounts.push(
+          `./${org}/${domain}:${containerBase}/${org}/${domain}${suffix}`,
+        );
+      }
     }
   }
   return mounts;
@@ -106,19 +146,18 @@ export function generateDevCompose(
     convex: { condition: 'service_healthy' },
   };
 
-  const providersBindMount = existsSync(join(projectDir, 'providers'))
-    ? './providers:/app/platform-config/providers:ro'
-    : null;
-
-  // RAG/crawler need convex-data:/app/platform-config:ro for non-provider
-  // config (integrations, branding, …). The providers bind mount is a more
-  // specific path and shadows just providers/ for host-edit hot reload.
+  // RAG/crawler need convex-data:/app/platform-config:ro for per-org
+  // provider config (and integrations, branding, …). The org-first
+  // layout has paths like `default/providers/foo.json`, all under one
+  // root, so the previous standalone `./providers:/app/platform-config/providers:ro`
+  // shadow is no longer needed — the per-org bind mounts below cover
+  // host-edit hot reload for every org's provider catalog.
   const rag = createRagService(config, DEV_COLOR);
   rag.container_name = `${getProjectId()}-rag`;
   rag.volumes = [
     'rag-data:/app/data',
     'convex-data:/app/platform-config:ro',
-    ...(providersBindMount ? [providersBindMount] : []),
+    ...existingHostMounts(projectDir, '/app/platform-config', ':ro'),
   ];
 
   const crawler = createCrawlerService(config, DEV_COLOR);
@@ -126,7 +165,7 @@ export function generateDevCompose(
   crawler.volumes = [
     'crawler-data:/app/data',
     'convex-data:/app/platform-config:ro',
-    ...(providersBindMount ? [providersBindMount] : []),
+    ...existingHostMounts(projectDir, '/app/platform-config', ':ro'),
   ];
 
   const proxy = createProxyService(config, hostAlias);
diff --git a/tools/cli/src/lib/migrate-config-layout/script.sh b/tools/cli/src/lib/migrate-config-layout/script.sh
index 40f2f9a850..e69cf4c1e2 100644
--- a/tools/cli/src/lib/migrate-config-layout/script.sh
+++ b/tools/cli/src/lib/migrate-config-layout/script.sh
@@ -18,7 +18,7 @@
 # image, old code paths still active). cp leaves old paths in place so
 # old code keeps reading providers correctly until the operator runs
 # `tale deploy --override-all -y` to recreate convex with the new code.
-set -eo pipefail
+set -euo pipefail
 
 DRY_RUN=0
 CLEANUP_OLD=0
@@ -30,7 +30,13 @@ for arg in "$@"; do
   esac
 done
 
+# Defense in depth: `set -u` already aborts on unset $DATA, but ${VAR:?…}
+# gives a clearer error message and won't be defeated by a future `set
+# +u` somewhere downstream. Critical because some branches below build
+# absolute paths from $DATA and rm them — a silent empty would operate
+# from the container's filesystem root.
 DATA="${TALE_CONFIG_DIR:-/app/data}"
+: "${DATA:?DATA must be a non-empty absolute path}"
 APP_UID=1001
 APP_GID=1001
 
diff --git a/tools/cli/src/lib/rules/content.ts b/tools/cli/src/lib/rules/content.ts
index fd08762ad3..a9d592438e 100644
--- a/tools/cli/src/lib/rules/content.ts
+++ b/tools/cli/src/lib/rules/content.ts
@@ -1,32 +1,57 @@
 const RULES_CONTENT = `# Tale Project
 
-This is a Tale project. Edit configs in \`agents/\`, \`workflows/\`, \`integrations/\`, and \`branding/\`.
+This is a Tale project. Config is namespaced **per organization** under
+\`<org>/<domain>/\`, with \`default\` as the canonical (and only required) org
+on a fresh \`tale init\`. Multi-org deployments add sibling subtrees
+(\`acme/\`, \`globex/\`, …) with the same internal shape.
 
 ## Project structure
 
 \`\`\`
-agents/              — Agent JSON configs (one file per agent)
-workflows/           — Workflow JSON configs (organized by category subdirectories)
-integrations/        — Integration directories (config.json + connector.ts + icon.svg each)
-branding/            — Branding config (branding.json + images/)
-.tale/reference/     — Read-only implementation source code, read before creating or editing configs
+default/                    — Canonical/template org (created by 'tale init')
+  agents/                   — Agent JSON configs (one file per agent)
+  workflows/                — Workflow JSON configs (organized by category)
+  integrations/             — Integration bundles (config.json + connector.ts + icon.svg)
+  branding/                 — Branding config (branding.json + images/)
+  providers/                — LLM provider configs (and *.secrets.json sidecars)
+  skills/                   — Skill bundles (per-skill subdirs)
+  retention.json            — Per-org data-retention overrides
+<other-org>/                — Same shape; one tree per registered org
+.tale/reference/            — Read-only implementation source code (read before
+                              creating or editing configs)
 \`\`\`
 
 ## Working with configs
 
-Before creating or editing any config, read the relevant schemas and implementation code in \`.tale/reference/\` to understand the valid structure, fields, and constraints. Use existing config files in the project as examples.
+Before creating or editing any config, read the relevant schemas and
+implementation code in \`.tale/reference/\` to understand the valid
+structure, fields, and constraints. Use existing config files in the
+project as examples.
 
 ## How modules connect
 
-- Agents can simultaneously bind integrations (\`integrationBindings\`), delegate to other agents (\`delegates\`), and attach workflows (\`workflows\`)
-- Workflows use integration operations within their steps and can be triggered by agents
-- Check existing configs to understand available bindings before creating new ones
+- Agents can simultaneously bind integrations (\`integrationBindings\`),
+  delegate to other agents (\`delegates\`), and attach workflows
+  (\`workflows\`)
+- Workflows use integration operations within their steps and can be
+  triggered by agents
+- Check existing configs to understand available bindings before creating
+  new ones
 
 ## Naming conventions
 
+- Org slug (top-level directory name): \`[a-z0-9][a-z0-9_-]{0,63}\` (or
+  the literal \`default\`)
 - Agent filenames: \`[a-z0-9][a-z0-9_-]*\\.json\`
 - Workflow step slugs: \`[a-z0-9][a-z0-9_-]*\`
 - Integration directory names: lowercase alphanumeric with hyphens/underscores
+
+## Secrets
+
+\`*.secrets.json\` sidecars (e.g. \`providers/openrouter.secrets.json\`)
+are SOPS-encrypted and gitignored. Never commit them; never include them
+in PR diffs. The repo's root \`.gitignore\` covers \`**/*.secrets.json\`
+and \`**/.history/\` at all depths.
 `;
 
 export function buildRulesContent(): string {
diff --git a/tools/cli/src/lib/rules/generators.ts b/tools/cli/src/lib/rules/generators.ts
index 377ff7734c..77af35c73a 100644
--- a/tools/cli/src/lib/rules/generators.ts
+++ b/tools/cli/src/lib/rules/generators.ts
@@ -6,10 +6,13 @@ interface RulesFile {
 }
 
 function buildCursorMdc(content: string): string {
+  // Globs match the org-first layout: any direct subdir of the project
+  // root that contains the canonical domain dirs. Covers `default/` and
+  // any additional org subtree (`acme/`, etc.) without listing each.
   const frontmatter = [
     '---',
     'description: Tale project configuration rules',
-    'globs: agents/**,workflows/**,integrations/**,branding/**',
+    'globs: */agents/**,*/workflows/**,*/integrations/**,*/branding/**,*/providers/**,*/skills/**,*/retention.json',
     '---',
     '',
   ].join('\n');

From 2d421ec71618bac70bf7eb444919d23657761d99 Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Thu, 28 May 2026 10:33:59 +0800
Subject: [PATCH 03/41] fix(crawler,platform,rag,cli): close P0 gaps from
 org-aware review
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two-round review of the org-first refactor surfaced 10 P0 issues where
the org-aware line only finished header propagation and left the data
layer, error layer, reserved-name layer, and caller-update layer
incomplete. This commit closes all 10 without expanding scope to P1.

Crawler
- Add `website_org_memberships` junction table + migration with
  backfill to `default`. websites/website_urls/chunks/page_paragraph_hashes
  remain deployment-shared content; the per-org boundary lives in
  the new table. Delete is ref-counted — last membership purges the
  website, others only drop the row.
- `_fts_search` / `_vector_search` filter by EXISTS on the membership
  table so org A can't see chunks from a domain only org B added.
- Scheduler binds `set_active_org()` to each website's oldest member
  per-task instead of hard-coding `default`.
- Re-add boot-time `ALTER TABLE chunks ALTER COLUMN embedding
  TYPE vector(N)` pin (resolved from default-org provider catalog);
  fail-loudly on missing provider, atomic pool rollback on failure.

Platform
- New `UpstreamHttpError` typed wrapper with status/retryable/
  safeMessage/sanitized body snippet; replace 8 raw `errorText`
  embeds across rag_action, rag_search_tool, fetch_document_*,
  upload_file_direct, delete_document.
- Reserve `'default'` slug in `beforeCreateOrganization` (with
  first-run seed exception via `betterAuth.organization` count),
  in zod refine on the create-org form, and narrow `isCallerAdmin`
  to admins of the `default` org (branding owner).
- config-watcher drops the `.endsWith('.json')` early gate so
  SKILL.md / scripts/*.py invalidate skill queries as the doc
  comment promises; per-domain extension filters inside
  `parseConfigChange`; 100ms tail-debounce per (type, org, slug)
  key prevents SSE storms during bulk migrations.
- Thread `organizationId` + `x-tale-org` header through 10+ crawler
  callers that previously hit the now-globally-required dependency
  without the header: fetch_and_extract, websites/internal_actions
  (8 sites), file_metadata/internal_actions, apply/extract_docx,
  generate_docx, generate_document, crawler_action (3 sites).

CLI
- deploy.ts: move success log after sync + reseed; legacy-flat-layout
  detection now throws with `tale migrate config-layout` guidance
  and runs at deploy entry (not just under --override); --override-all
  implies forceRecreate so the reseed targets the new binary.
- start.ts: pass `projectDir` to `generateDevCompose` so running from
  a subdirectory finds the right org dirs.

RAG
- /config no longer 500s — drop the per-org `get_llm_config()` call
  that the multi-org refactor made impossible from an org-less health
  endpoint; remove the corresponding fields from ConfigResponse.

Misc
- Remove Phase 2 (renameOrgSlug) from the dated convex-data migration
  script — the underlying Convex function was deleted on the parent
  refactor and any re-run would fail.

Tests: new `test_website_membership.py` (9 cases), updated
`test_websites_router.py` for ref-counted delete + first-membership
trigger semantics, `test_database.py` rewritten around the new pin
contract, new `upstream_http_error.test.ts` for the typed wrapper.
Adjusted 4 existing platform test files whose assertions hard-coded
the old raw-body error message format. `bun run check` green
(70932 platform + 481 crawler tests + RAG suite).
---
 scripts/2026-03-28-migrate-convex-data.sh     |  64 +----
 services/crawler/app/routers/pages.py         |  21 ++
 services/crawler/app/routers/websites.py      |  57 +++--
 services/crawler/app/services/database.py     |  60 +++--
 .../crawler/app/services/pg_website_store.py  | 150 ++++++++++--
 services/crawler/app/services/scheduler.py    |  23 +-
 .../crawler/app/services/search_service.py    |  82 +++++--
 ...0528000000_add_website_org_memberships.sql |  36 +++
 services/crawler/tests/conftest.py            |  30 ++-
 services/crawler/tests/test_database.py       | 134 ++++------
 .../crawler/tests/test_website_membership.py  | 156 ++++++++++++
 .../crawler/tests/test_websites_router.py     | 229 ++++++------------
 .../components/organization-form.tsx          |  12 +
 services/platform/convex/_generated/api.d.ts  |   2 +
 .../documents/document_retrieve_tool.test.ts  |   2 +-
 .../fetch_document_comparison.test.ts         |   2 +-
 .../documents/fetch_document_content.test.ts  |   4 +-
 .../helpers/fetch_document_comparison.ts      |  15 +-
 .../helpers/fetch_document_content.ts         |   5 +-
 .../convex/agent_tools/rag/rag_search_tool.ts |  22 +-
 .../web/helpers/fetch_and_extract.ts          |  11 +-
 services/platform/convex/auth.ts              |  22 ++
 .../convex/branding/internal_queries.ts       |  26 +-
 .../convex/documents/generate_document.ts     |   7 +-
 .../convex/documents/generate_docx.ts         |   7 +-
 .../convex/file_metadata/internal_actions.ts  |   5 +
 .../file_metadata/internal_mutations.ts       |   1 +
 .../convex/file_metadata/mutations.ts         |   1 +
 .../__tests__/upstream_http_error.test.ts     |  95 ++++++++
 .../convex/lib/errors/upstream_http_error.ts  | 108 +++++++++
 services/platform/convex/websites/actions.ts  |  38 ++-
 .../convex/websites/internal_actions.ts       |  70 +++++-
 services/platform/convex/websites/rest_api.ts |   4 +
 .../action_defs/crawler/crawler_action.ts     |  30 ++-
 .../action_defs/document/document_action.ts   |  16 +-
 .../document/helpers/apply_docx_structured.ts |  32 +--
 .../helpers/extract_docx_structured.ts        |   4 +
 .../rag/helpers/delete_document.ts            |   8 +-
 .../rag/helpers/upload_file_direct.test.ts    |  25 +-
 .../rag/helpers/upload_file_direct.ts         |   5 +-
 .../action_defs/rag/rag_action.ts             |   8 +-
 services/platform/lib/config-watcher.ts       |  64 ++++-
 .../shared/constants/reserved-org-slugs.ts    |  19 ++
 services/rag/app/models.py                    |   9 +-
 services/rag/app/routers/health.py            |  15 +-
 tools/cli/src/commands/deploy/index.ts        |   8 +-
 tools/cli/src/lib/actions/deploy.ts           |  38 ++-
 tools/cli/src/lib/actions/start.ts            |   1 +
 48 files changed, 1280 insertions(+), 503 deletions(-)
 create mode 100644 services/crawler/migrations/20260528000000_add_website_org_memberships.sql
 create mode 100644 services/crawler/tests/test_website_membership.py
 create mode 100644 services/platform/convex/lib/errors/__tests__/upstream_http_error.test.ts
 create mode 100644 services/platform/convex/lib/errors/upstream_http_error.ts
 create mode 100644 services/platform/lib/shared/constants/reserved-org-slugs.ts

diff --git a/scripts/2026-03-28-migrate-convex-data.sh b/scripts/2026-03-28-migrate-convex-data.sh
index 09984260ec..7ad741cf29 100755
--- a/scripts/2026-03-28-migrate-convex-data.sh
+++ b/scripts/2026-03-28-migrate-convex-data.sh
@@ -2,18 +2,22 @@
 # ============================================================================
 # Migration: Convex data migration (2026-03-28)
 # ============================================================================
-# Handles two tasks:
-#   1. Copy Convex storage data from old volume to new volume
-#   2. Rename organization slug to "default"
+# Copies Convex storage data from old volume to new volume.
 #
 # Background:
 #   The platform volume was renamed from platform-convex-data to platform-data.
 #   Old Convex storage files (modules, user uploads) need to be copied to the
 #   new volume so the Convex backend can find them.
 #
+# Note:
+#   A prior version of this script also called `convex run
+#   migrations/rename_org_slug:renameOrgSlug` (Phase 2) — that migration was
+#   removed in v1.0 along with the upgrade framework; the Phase 2 step is
+#   no longer needed and would now fail with "function not found".
+#
 # Prerequisites:
 #   - Docker must be running
-#   - Platform container should be stopped for phase 1
+#   - Platform container should be stopped before running
 #
 # Usage:
 #   ./scripts/2026-03-28-migrate-convex-data.sh
@@ -83,56 +87,8 @@ else
   echo ""
 fi
 
-# ============================================================================
-# Phase 2: Rename organization slug to "default"
-# ============================================================================
-
-find_platform_container() {
-  docker ps --filter "name=tale-platform" --filter "status=running" --format '{{.Names}}' | head -1
-}
-
-echo ""
-echo "── Phase 2: Organization slug rename ──"
-echo ""
-
-container=$(find_platform_container)
-
-if [ -z "$container" ]; then
-  echo "❌ Platform container is not running."
-  echo "   Please start it first:"
-  echo ""
-  echo "     docker compose up --build -d platform"
-  echo ""
-  echo "   Then re-run this script."
-  exit 1
-fi
-
-status=$(docker inspect --format='{{.State.Health.Status}}' "$container" 2>/dev/null || echo "unknown")
-if [ "$status" != "healthy" ]; then
-  echo "❌ Platform container '$container' is not healthy (status: $status)."
-  echo "   Wait for it to become healthy, then re-run this script."
-  exit 1
-fi
-
-echo "   ✅ $container is healthy."
-echo "   Running organization slug migration..."
-
-docker exec "$container" bash -c '
-  source /app/env.sh
-  env_normalize_common
-  source /app/generate-admin-key.sh
-  ensure_instance_secret
-  ADMIN_KEY=$(generate_key "$INSTANCE_NAME" "$INSTANCE_SECRET")
-  cd /app
-  HOME=/home/tanstack bunx convex run \
-    migrations/rename_org_slug:renameOrgSlug \
-    --url "http://localhost:3210" \
-    --admin-key "$ADMIN_KEY" \
-    --no-push 2>&1
-' | grep -v "^Admin key\|^📋\|^✅ Admin\|^━\|^🌐\|^$\|Steps:\|Open\|Enter\|Paste"
+# Phase 2 (renameOrgSlug) removed in v1.0 — the underlying Convex migration
+# function no longer exists in the platform codebase.
 
 echo ""
 echo "✅ Migration complete!"
-echo ""
-echo "You can verify the organization slug with:"
-echo "  docker exec $container bash -c 'source /app/env.sh && env_normalize_common && source /app/generate-admin-key.sh && ensure_instance_secret && ADMIN_KEY=\$(generate_key \"\$INSTANCE_NAME\" \"\$INSTANCE_SECRET\") && cd /app && HOME=/home/tanstack bunx convex data --component betterAuth organization --url \"http://localhost:3210\" --admin-key \"\$ADMIN_KEY\"'"
diff --git a/services/crawler/app/routers/pages.py b/services/crawler/app/routers/pages.py
index 378e8cb692..6268cedaae 100644
--- a/services/crawler/app/routers/pages.py
+++ b/services/crawler/app/routers/pages.py
@@ -6,8 +6,27 @@
 from loguru import logger
 
 from app.models import PageChunkItem, PageChunksResponse, PageListItem, PageListResponse
+from app.org_context import get_active_org
 from app.services.database import get_pool
 
+
+async def _require_org_membership(pool, domain: str, org_slug: str) -> None:
+    """Caller's org must have a membership on `domain`, else 404.
+
+    Routers below operate on shared chunks/website_urls tables — without
+    this gate, any authenticated request would be able to read any
+    domain's pages just by knowing the name.
+    """
+    async with pool.acquire() as conn:
+        row = await conn.fetchrow(
+            "SELECT 1 FROM website_org_memberships WHERE domain = $1 AND org_slug = $2",
+            domain,
+            org_slug,
+        )
+    if row is None:
+        raise HTTPException(status_code=404, detail=f"Website not found: {domain}")
+
+
 router = APIRouter(prefix="/api/v1/pages", tags=["Pages"])
 
 
@@ -22,6 +41,7 @@ async def list_pages(
     """List all crawled pages for a website with indexing status."""
     try:
         pool = get_pool()
+        await _require_org_membership(pool, domain, get_active_org())
 
         sort_columns = {
             "last_crawled_at": "wu.last_crawled_at",
@@ -104,6 +124,7 @@ async def get_page_chunks(
     """Get all indexed chunks for a specific page URL."""
     try:
         pool = get_pool()
+        await _require_org_membership(pool, domain, get_active_org())
 
         async with pool.acquire() as conn:
             rows = await conn.fetch(
diff --git a/services/crawler/app/routers/websites.py b/services/crawler/app/routers/websites.py
index 6a8bf4d0a7..a51ea850a9 100644
--- a/services/crawler/app/routers/websites.py
+++ b/services/crawler/app/routers/websites.py
@@ -16,6 +16,7 @@
     WebsiteUrl,
     WebsiteUrlsResponse,
 )
+from app.org_context import get_active_org
 from app.services.pg_website_store import PgWebsiteStoreManager
 from app.services.scheduler import cancel_scan, trigger_scan
 
@@ -74,6 +75,7 @@ def _format_timestamp(val) -> str | None:
 async def register_website(request: RegisterWebsiteRequest, http_request: Request):
     try:
         manager = _get_manager(http_request)
+        org_slug = get_active_org()
 
         # Reject registration if domain is currently being deleted
         website = await manager.get_website(request.domain)
@@ -83,20 +85,21 @@ async def register_website(request: RegisterWebsiteRequest, http_request: Reques
                 detail=f"Domain {request.domain} is currently being deleted. Please retry later.",
             )
 
-        await manager.register_website(
+        result = await manager.register_website(
             domain=request.domain,
             scan_interval=request.scan_interval,
+            org_slug=org_slug,
         )
 
-        # Wake the scheduler — newly registered sites have last_scanned_at=NULL
-        # and will be picked up immediately by get_due_websites().
-        # The scheduler handles URL discovery, crawling, and metadata extraction
-        # with proper concurrency control (max_concurrent_scans semaphore).
-        trigger_scan()
+        # Wake the scheduler only when this membership creates new work
+        # (first org to register this domain). Subsequent orgs joining an
+        # already-tracked domain reuse the existing crawl cadence.
+        if result.get("first_membership"):
+            trigger_scan()
 
         return WebsiteInfoResponse(
             domain=request.domain,
-            status="scanning",
+            status="scanning" if result.get("first_membership") else (website.get("status") if website else "idle"),
             scan_interval=request.scan_interval,
         )
     except HTTPException:
@@ -110,6 +113,11 @@ async def register_website(request: RegisterWebsiteRequest, http_request: Reques
 async def update_website(domain: str, request: UpdateWebsiteRequest, http_request: Request):
     try:
         manager = _get_manager(http_request)
+        org_slug = get_active_org()
+        # Caller's org must have a membership on this domain or it doesn't
+        # exist (from their viewpoint).
+        if not await manager.org_has_membership(domain, org_slug):
+            raise HTTPException(status_code=404, detail=f"Website not found: {domain}")
         website = await manager.get_website(domain)
         if not website:
             raise HTTPException(status_code=404, detail=f"Website not found: {domain}")
@@ -145,6 +153,9 @@ async def update_website(domain: str, request: UpdateWebsiteRequest, http_reques
 async def get_website_info(domain: str, http_request: Request):
     try:
         manager = _get_manager(http_request)
+        org_slug = get_active_org()
+        if not await manager.org_has_membership(domain, org_slug):
+            raise HTTPException(status_code=404, detail=f"Website not found: {domain}")
         website = await manager.get_website(domain)
 
         if not website:
@@ -173,19 +184,28 @@ async def get_website_info(domain: str, http_request: Request):
 @router.delete("/{domain}")
 async def deregister_website(domain: str, http_request: Request):
     try:
-        cancel_scan(domain)
         manager = _get_manager(http_request)
-        marked = await manager.begin_delete(domain)
-        if not marked:
-            # Already deleting — return 202 idempotently
-            website = await manager.get_website(domain)
-            if website and website.get("status") == "deleting":
-                return JSONResponse(
-                    status_code=202,
-                    content={"domain": domain, "status": "deleting"},
-                )
+        org_slug = get_active_org()
+
+        result = await manager.begin_delete(domain, org_slug)
+        if not result["removed_membership"]:
+            # The caller's org wasn't tracking this domain. From their
+            # viewpoint, the website doesn't exist — return 404 instead
+            # of leaking whether another org has it.
             raise HTTPException(status_code=404, detail=f"Website not found: {domain}")
 
+        if not result["removed_website"]:
+            # Other orgs are still using this domain; only the caller's
+            # membership was removed. Domain data stays in place.
+            return JSONResponse(
+                status_code=200,
+                content={"domain": domain, "status": "membership_removed"},
+            )
+
+        # We dropped the last membership and the website was marked for
+        # deletion. Cancel any in-flight scan and start the CASCADE in
+        # the background.
+        cancel_scan(domain)
         _spawn_delete_task(manager, domain)
         return JSONResponse(
             status_code=202,
@@ -208,6 +228,9 @@ async def get_website_urls(
 ):
     try:
         manager = _get_manager(http_request)
+        org_slug = get_active_org()
+        if not await manager.org_has_membership(domain, org_slug):
+            raise HTTPException(status_code=404, detail=f"Website not found: {domain}")
         website = await manager.get_website(domain)
 
         if not website:
diff --git a/services/crawler/app/services/database.py b/services/crawler/app/services/database.py
index 160a536a5d..6a00853bfd 100644
--- a/services/crawler/app/services/database.py
+++ b/services/crawler/app/services/database.py
@@ -46,8 +46,28 @@ async def init_pool(*, max_size: int = 10) -> asyncpg.Pool:
         if _pool is not None:
             return _pool
 
+        # Resolve the deployment-wide embedding dim BEFORE creating the
+        # pool. This way, a missing `default` org provider fails fast
+        # with no pool resource to clean up — and the module-level
+        # `_pool` stays None so a follow-up retry can re-enter cleanly.
+        #
+        # Background: the baseline migration declares `embedding vector`
+        # (no dim) so pgvector accepts mixed-dim inserts silently and
+        # `create_chunks_hnsw_index()` raises ("has no dimensions").
+        # All orgs on this deployment must share embedding dims (single
+        # chunks table); we pin from the `default` org's catalog.
+        try:
+            _, _, _, dims = settings.get_embedding_config("default")
+        except Exception as e:
+            raise RuntimeError(
+                "Cannot resolve embedding dims for the 'default' org "
+                "(needed to pin public_web.chunks.embedding at boot). "
+                "Ensure providers are configured for the default org "
+                "before starting crawler."
+            ) from e
+
         dsn = _get_database_url()
-        _pool = await asyncpg.create_pool(
+        pool = await asyncpg.create_pool(
             dsn,
             min_size=min(2, max_size),
             max_size=max_size,
@@ -62,26 +82,26 @@ async def init_pool(*, max_size: int = 10) -> asyncpg.Pool:
         )
         logger.info(f"PostgreSQL connection pool initialized (min={min(2, max_size)}, max={max_size})")
 
-        # Note: the previous boot-time embedding-dimension guard was
-        # removed when crawler became multi-org. Dim is now an attribute
-        # of the org's provider catalog, not a global setting, and there
-        # is no org context at lifespan start. `get_embedding_service()`
-        # refuses dim changes per-org at request time; pgvector enforces
-        # column dim on insert.
-        #
-        # The column type and HNSW index are pinned lazily on the first
-        # insert (pgvector errors loudly on dim mismatch). All orgs
-        # sharing this crawler instance must agree on embedding dims.
-
-        # Create HNSW index if it doesn't exist yet. The index targets
-        # whatever the column type is set to; if no rows have been
-        # inserted, the call is cheap.
         try:
-            async with acquire_with_retry(_pool) as conn:
-                await conn.execute(f"SELECT {SCHEMA}.create_chunks_hnsw_index()")
-        except Exception as e:
-            logger.warning(f"HNSW index creation deferred: {e}")
-
+            async with acquire_with_retry(pool) as conn:
+                await conn.execute(f"ALTER TABLE {SCHEMA}.chunks ALTER COLUMN embedding TYPE vector({dims})")
+            logger.info(f"Pinned {SCHEMA}.chunks.embedding to vector({dims})")
+
+            # Create HNSW index if it doesn't exist yet. After the pin
+            # above this is the normal path; the function raises if the
+            # dim is still unset, which would now indicate a deeper
+            # invariant break.
+            try:
+                async with acquire_with_retry(pool) as conn:
+                    await conn.execute(f"SELECT {SCHEMA}.create_chunks_hnsw_index()")
+            except Exception as e:
+                logger.warning(f"HNSW index creation deferred: {e}")
+        except Exception:
+            # Roll back the pool we just opened so a retry hits a clean state.
+            await pool.close()
+            raise
+
+        _pool = pool
         return _pool
 
 
diff --git a/services/crawler/app/services/pg_website_store.py b/services/crawler/app/services/pg_website_store.py
index b47d1b83ad..75ee0e8ded 100644
--- a/services/crawler/app/services/pg_website_store.py
+++ b/services/crawler/app/services/pg_website_store.py
@@ -253,8 +253,25 @@ def __init__(self, pool: asyncpg.Pool):
         self._pool = pool
         self._stores: dict[str, PgWebsiteStore] = {}
 
-    async def register_website(self, domain: str, scan_interval: int = 21600) -> dict:
-        async with acquire_with_retry(self._pool) as conn:
+    async def register_website(
+        self,
+        domain: str,
+        scan_interval: int = 21600,
+        *,
+        org_slug: str,
+    ) -> dict:
+        """Register a domain on behalf of `org_slug`.
+
+        websites is deployment-shared content storage; the per-org
+        boundary lives in `website_org_memberships`. The first org to
+        register a domain creates the website row; subsequent orgs
+        simply join the membership table without re-fetching.
+
+        Returns a dict that includes `first_membership=True` only when
+        this call is the first to register the domain — callers use it
+        to decide whether to trigger an immediate scan.
+        """
+        async with acquire_with_retry(self._pool) as conn, conn.transaction():
             await conn.execute(
                 """INSERT INTO websites (domain, scan_interval, created_at, updated_at)
                    VALUES ($1, $2, NOW(), NOW())
@@ -264,8 +281,37 @@ async def register_website(self, domain: str, scan_interval: int = 21600) -> dic
                 domain,
                 scan_interval,
             )
-        logger.info(f"Registered website: {domain} (interval={scan_interval}s)")
-        return {"domain": domain, "scan_interval": scan_interval, "status": "idle"}
+            # ON CONFLICT DO NOTHING — re-registering from the same org is a no-op.
+            # `xmax = 0` is true on a row INSERTed in this command; non-zero on
+            # an existing row that hit ON CONFLICT. We use it to tell the caller
+            # whether this was the very first membership for the domain.
+            row = await conn.fetchrow(
+                """INSERT INTO website_org_memberships (domain, org_slug)
+                   VALUES ($1, $2)
+                   ON CONFLICT DO NOTHING
+                   RETURNING (xmax = 0) AS inserted""",
+                domain,
+                org_slug,
+            )
+            membership_inserted = bool(row and row["inserted"])
+            total_members = await conn.fetchval(
+                "SELECT COUNT(*) FROM website_org_memberships WHERE domain = $1",
+                domain,
+            )
+        first_membership = membership_inserted and total_members == 1
+        logger.info(
+            "Registered website: %s for org=%s (interval=%ss, first_membership=%s)",
+            domain,
+            org_slug,
+            scan_interval,
+            first_membership,
+        )
+        return {
+            "domain": domain,
+            "scan_interval": scan_interval,
+            "status": "idle",
+            "first_membership": first_membership,
+        }
 
     async def update_website_metadata(
         self,
@@ -288,16 +334,50 @@ async def update_website_metadata(
                 page_count,
             )
 
-    async def begin_delete(self, domain: str) -> bool:
-        """Mark a website for deletion. Returns True if the domain was found and marked."""
-        self._stores.pop(domain, None)
-        async with acquire_with_retry(self._pool) as conn:
-            row = await conn.fetchrow(
-                "UPDATE websites SET status = 'deleting', updated_at = NOW() "
-                "WHERE domain = $1 AND status != 'deleting' RETURNING domain",
+    async def begin_delete(self, domain: str, org_slug: str) -> dict:
+        """Remove org's membership of `domain`. If no orgs remain after
+        the removal, mark the website itself for deletion (the actual
+        CASCADE happens in `execute_delete`, called from a background
+        task).
+
+        Returns a dict with:
+          - `removed_membership`: True if the (domain, org) row existed
+            and was removed.
+          - `removed_website`: True if this caller dropped the last
+            membership and the website was marked for deletion.
+        """
+        async with acquire_with_retry(self._pool) as conn, conn.transaction():
+            deleted = await conn.execute(
+                "DELETE FROM website_org_memberships WHERE domain = $1 AND org_slug = $2",
                 domain,
+                org_slug,
             )
-            return row is not None
+            # asyncpg returns "DELETE N" as the tag; "DELETE 0" means no row matched.
+            removed_membership = deleted != "DELETE 0"
+            remaining = await conn.fetchval(
+                "SELECT COUNT(*) FROM website_org_memberships WHERE domain = $1",
+                domain,
+            )
+            removed_website = False
+            if remaining == 0:
+                self._stores.pop(domain, None)
+                row = await conn.fetchrow(
+                    "UPDATE websites SET status = 'deleting', updated_at = NOW() "
+                    "WHERE domain = $1 AND status != 'deleting' RETURNING domain",
+                    domain,
+                )
+                removed_website = row is not None
+        logger.info(
+            "begin_delete: domain=%s org=%s removed_membership=%s removed_website=%s",
+            domain,
+            org_slug,
+            removed_membership,
+            removed_website,
+        )
+        return {
+            "removed_membership": removed_membership,
+            "removed_website": removed_website,
+        }
 
     async def execute_delete(self, domain: str) -> None:
         """Run the actual CASCADE DELETE. Intended for background execution."""
@@ -320,19 +400,51 @@ async def get_due_websites(self) -> list[dict]:
         - Its scan interval has elapsed and it is not currently scanning/deleting, OR
         - It has been stuck in 'scanning' for >2 hours (no heartbeat progress),
           indicating the previous scanner crashed or was replaced.
+
+        Each returned row also includes `owner_org_slug` — the slug of the
+        org that registered the domain earliest. The scheduler uses this to
+        bind `set_active_org()` so the per-org provider catalog can resolve
+        API keys for the embed/fetch path. Domains with no remaining
+        memberships (a transient race during delete) are skipped.
         """
         async with acquire_with_retry(self._pool) as conn:
             rows = await conn.fetch(
-                """SELECT domain, status, scan_interval, last_scanned_at, error
-                   FROM websites
-                   WHERE (status NOT IN ('scanning', 'deleting')
-                          AND (last_scanned_at IS NULL
-                               OR last_scanned_at + make_interval(secs => scan_interval) < NOW()))
-                      OR (status = 'scanning'
-                          AND updated_at < NOW() - INTERVAL '2 hours')"""
+                """SELECT w.domain, w.status, w.scan_interval, w.last_scanned_at, w.error,
+                          m.org_slug AS owner_org_slug
+                   FROM websites w
+                   JOIN LATERAL (
+                       SELECT org_slug FROM website_org_memberships
+                       WHERE domain = w.domain
+                       ORDER BY added_at ASC, org_slug ASC
+                       LIMIT 1
+                   ) m ON true
+                   WHERE (w.status NOT IN ('scanning', 'deleting')
+                          AND (w.last_scanned_at IS NULL
+                               OR w.last_scanned_at + make_interval(secs => w.scan_interval) < NOW()))
+                      OR (w.status = 'scanning'
+                          AND w.updated_at < NOW() - INTERVAL '2 hours')"""
             )
             return [dict(r) for r in rows]
 
+    async def org_has_membership(self, domain: str, org_slug: str) -> bool:
+        """True if `org_slug` has registered `domain` (used by per-org views)."""
+        async with acquire_with_retry(self._pool) as conn:
+            row = await conn.fetchrow(
+                "SELECT 1 FROM website_org_memberships WHERE domain = $1 AND org_slug = $2",
+                domain,
+                org_slug,
+            )
+            return row is not None
+
+    async def list_domains_for_org(self, org_slug: str) -> list[str]:
+        """Return all domains the given org has registered."""
+        async with acquire_with_retry(self._pool) as conn:
+            rows = await conn.fetch(
+                "SELECT domain FROM website_org_memberships WHERE org_slug = $1 ORDER BY domain",
+                org_slug,
+            )
+            return [r["domain"] for r in rows]
+
     async def update_scan_interval(self, domain: str, scan_interval: int) -> None:
         async with acquire_with_retry(self._pool) as conn:
             await conn.execute(
diff --git a/services/crawler/app/services/scheduler.py b/services/crawler/app/services/scheduler.py
index d72051d73b..edc83742fb 100644
--- a/services/crawler/app/services/scheduler.py
+++ b/services/crawler/app/services/scheduler.py
@@ -12,6 +12,7 @@
 
 import httpx
 
+from app.org_context import set_active_org
 from app.services.crawler_service import CrawlerService
 from app.services.indexing_service import IndexingService
 from app.services.pg_website_store import PgWebsiteStore, PgWebsiteStoreManager
@@ -66,21 +67,15 @@ async def run_scheduler(
     global _scan_trigger
     _scan_trigger = asyncio.Event()
 
-    # Background scheduler has no per-request X-Tale-Org context. Until
-    # the websites table carries the owning org slug, fall back to
-    # `default` for any provider lookups triggered by scheduled scans.
-    # Log once so operators see the assumption.
-    from app.org_context import set_active_org
-
-    set_active_org("default")
-    logger.warning(
-        "Scheduler background task using org slug 'default' for provider "
-        "lookups. Per-website org binding is a follow-up."
-    )
-
     sem = asyncio.Semaphore(max_concurrent_scans)
 
-    async def bounded_scan(domain: str):
+    async def bounded_scan(domain: str, owner_org_slug: str):
+        # ContextVar is per-asyncio-task: asyncio.create_task copies the
+        # parent context at spawn, then any `set` inside the task only
+        # affects this task. Setting here binds provider lookups for the
+        # embed/fetch path to the website's owning org for the duration
+        # of the scan.
+        set_active_org(owner_org_slug)
         async with sem:
             await _scan_website(
                 domain,
@@ -95,7 +90,7 @@ async def bounded_scan(domain: str):
             due = await store_manager.get_due_websites()
             if due:
                 logger.info(f"Scheduler: {len(due)} website(s) due for scanning")
-                tasks = [asyncio.create_task(bounded_scan(w["domain"])) for w in due]
+                tasks = [asyncio.create_task(bounded_scan(w["domain"], w["owner_org_slug"])) for w in due]
                 results = await asyncio.gather(*tasks, return_exceptions=True)
                 for website, result in zip(due, results, strict=True):
                     if isinstance(result, BaseException):
diff --git a/services/crawler/app/services/search_service.py b/services/crawler/app/services/search_service.py
index 99bd9a2e55..9526afcdb7 100644
--- a/services/crawler/app/services/search_service.py
+++ b/services/crawler/app/services/search_service.py
@@ -9,6 +9,7 @@
 
 import asyncpg
 
+from app.org_context import get_active_org
 from app.services.database import acquire_with_retry
 from app.services.embedding_service import get_embedding_service
 
@@ -41,13 +42,18 @@ async def search(
         limit: int = 10,
         similarity_threshold: float = 0.4,
     ) -> list[SearchResult]:
+        # Resolve the active org once and pass to both helpers — chunks
+        # data is shared across orgs, but each search is restricted to
+        # domains the caller's org has registered (membership filter).
+        org_slug = get_active_org()
+
         # Generate query embedding and run both searches in parallel
         embedding_task = asyncio.create_task(get_embedding_service().embed_query(query))
-        fts_task = asyncio.create_task(self._fts_search(query, domain, limit * 3))
+        fts_task = asyncio.create_task(self._fts_search(query, org_slug, domain, limit * 3))
 
         query_embedding = await embedding_task
         fts_results = await fts_task
-        vector_results = await self._vector_search(query_embedding, domain, limit * 3)
+        vector_results = await self._vector_search(query_embedding, org_slug, domain, limit * 3)
 
         # Pre-filter vector results by cosine similarity (matches RAG pipeline).
         # If ALL vector results fall below the threshold the query is considered
@@ -69,57 +75,81 @@ async def search(
 
         return self._merge_rrf([fts_results, vector_results], limit)
 
-    async def _fts_search(self, query: str, domain: str | None, limit: int) -> list[dict]:
+    async def _fts_search(self, query: str, org_slug: str, domain: str | None, limit: int) -> list[dict]:
+        # Membership filter restricts the org's view to domains it has
+        # registered. chunks/websites are deployment-shared content, but
+        # org A must not see search hits from a domain only org B added.
         async with acquire_with_retry(self._pool) as conn:
             if domain:
                 rows = await conn.fetch(
-                    """SELECT id, url, title, chunk_content, core_content, chunk_index,
-                              paradedb.score(id) AS score
-                       FROM chunks
-                       WHERE id @@@ paradedb.match('chunk_content', $1) AND domain = $2
+                    """SELECT c.id, c.url, c.title, c.chunk_content, c.core_content, c.chunk_index,
+                              paradedb.score(c.id) AS score
+                       FROM chunks c
+                       WHERE c.id @@@ paradedb.match('chunk_content', $1)
+                         AND c.domain = $2
+                         AND EXISTS (
+                             SELECT 1 FROM website_org_memberships m
+                             WHERE m.domain = c.domain AND m.org_slug = $3
+                         )
                        ORDER BY score DESC
-                       LIMIT $3""",
+                       LIMIT $4""",
                     query,
                     domain,
+                    org_slug,
                     limit,
                 )
             else:
                 rows = await conn.fetch(
-                    """SELECT id, url, title, chunk_content, core_content, chunk_index,
-                              paradedb.score(id) AS score
-                       FROM chunks
-                       WHERE id @@@ paradedb.match('chunk_content', $1)
+                    """SELECT c.id, c.url, c.title, c.chunk_content, c.core_content, c.chunk_index,
+                              paradedb.score(c.id) AS score
+                       FROM chunks c
+                       WHERE c.id @@@ paradedb.match('chunk_content', $1)
+                         AND EXISTS (
+                             SELECT 1 FROM website_org_memberships m
+                             WHERE m.domain = c.domain AND m.org_slug = $2
+                         )
                        ORDER BY score DESC
-                       LIMIT $2""",
+                       LIMIT $3""",
                     query,
+                    org_slug,
                     limit,
                 )
             return [dict(r) for r in rows]
 
-    async def _vector_search(self, embedding: list[float], domain: str | None, limit: int) -> list[dict]:
+    async def _vector_search(self, embedding: list[float], org_slug: str, domain: str | None, limit: int) -> list[dict]:
         vec_str = json.dumps(embedding)
         async with acquire_with_retry(self._pool) as conn:
             if domain:
                 rows = await conn.fetch(
-                    """SELECT id, url, title, chunk_content, core_content, chunk_index,
-                              1 - (embedding <=> $1::vector) AS score
-                       FROM chunks
-                       WHERE domain = $2 AND embedding IS NOT NULL
-                       ORDER BY embedding <=> $1::vector
-                       LIMIT $3""",
+                    """SELECT c.id, c.url, c.title, c.chunk_content, c.core_content, c.chunk_index,
+                              1 - (c.embedding <=> $1::vector) AS score
+                       FROM chunks c
+                       WHERE c.domain = $2 AND c.embedding IS NOT NULL
+                         AND EXISTS (
+                             SELECT 1 FROM website_org_memberships m
+                             WHERE m.domain = c.domain AND m.org_slug = $3
+                         )
+                       ORDER BY c.embedding <=> $1::vector
+                       LIMIT $4""",
                     vec_str,
                     domain,
+                    org_slug,
                     limit,
                 )
             else:
                 rows = await conn.fetch(
-                    """SELECT id, url, title, chunk_content, core_content, chunk_index,
-                              1 - (embedding <=> $1::vector) AS score
-                       FROM chunks
-                       WHERE embedding IS NOT NULL
-                       ORDER BY embedding <=> $1::vector
-                       LIMIT $2""",
+                    """SELECT c.id, c.url, c.title, c.chunk_content, c.core_content, c.chunk_index,
+                              1 - (c.embedding <=> $1::vector) AS score
+                       FROM chunks c
+                       WHERE c.embedding IS NOT NULL
+                         AND EXISTS (
+                             SELECT 1 FROM website_org_memberships m
+                             WHERE m.domain = c.domain AND m.org_slug = $2
+                         )
+                       ORDER BY c.embedding <=> $1::vector
+                       LIMIT $3""",
                     vec_str,
+                    org_slug,
                     limit,
                 )
             return [dict(r) for r in rows]
diff --git a/services/crawler/migrations/20260528000000_add_website_org_memberships.sql b/services/crawler/migrations/20260528000000_add_website_org_memberships.sql
new file mode 100644
index 0000000000..b2047a6cb4
--- /dev/null
+++ b/services/crawler/migrations/20260528000000_add_website_org_memberships.sql
@@ -0,0 +1,36 @@
+-- migrate:up
+-- Per-org website membership layer.
+--
+-- websites / website_urls / chunks / page_paragraph_hashes remain
+-- deployment-shared content storage (one canonical fetch + embed per
+-- domain, independent of which org requested it). This junction table
+-- tracks WHICH orgs have asked the crawler to track a given domain.
+--
+-- Register: insert (domain, org_slug) ON CONFLICT DO NOTHING. First
+--   membership for a never-seen domain implies UPSERT into websites.
+-- Delete: delete the (domain, org_slug) row; the website itself is
+--   only purged when no memberships remain (ref-counted).
+-- Search/list: JOIN this table filtered by current X-Tale-Org so org A
+--   only sees domains it registered (or another member of org A did).
+--
+-- Backfill: every existing website row is treated as belonging to the
+--   'default' org, which is the only org in use at the demo stage.
+--   ON CONFLICT DO NOTHING keeps the migration idempotent on re-run.
+
+CREATE TABLE IF NOT EXISTS public_web.website_org_memberships (
+    domain   TEXT        NOT NULL REFERENCES public_web.websites(domain) ON DELETE CASCADE,
+    org_slug TEXT        NOT NULL,
+    added_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+    PRIMARY KEY (domain, org_slug)
+);
+
+CREATE INDEX IF NOT EXISTS idx_website_org_memberships_by_org
+    ON public_web.website_org_memberships (org_slug);
+
+INSERT INTO public_web.website_org_memberships (domain, org_slug)
+SELECT domain, 'default'
+FROM public_web.websites
+ON CONFLICT DO NOTHING;
+
+-- migrate:down
+DROP TABLE IF EXISTS public_web.website_org_memberships;
diff --git a/services/crawler/tests/conftest.py b/services/crawler/tests/conftest.py
index 2c149549bf..55be3f34c3 100644
--- a/services/crawler/tests/conftest.py
+++ b/services/crawler/tests/conftest.py
@@ -1 +1,29 @@
-"""Test configuration for the crawler service."""
+"""Test configuration for the crawler service.
+
+Provides an autouse fixture that binds the active-org ContextVar to
+`"test-org"` for the duration of each test. Crawler routers and
+services now read `get_active_org()` to scope work per-org; without
+a binding they raise RuntimeError on first use.
+
+The same fixture also resets the ContextVar after each test to keep
+tests isolated under parallel runners.
+"""
+
+from collections.abc import Iterator
+
+import pytest
+
+from app.org_context import _active_org, set_active_org
+
+
+@pytest.fixture(autouse=True)
+def _bind_test_active_org() -> Iterator[None]:
+    """Bind `set_active_org("test-org")` for the test, then reset."""
+    token = _active_org.set("test-org")
+    try:
+        yield
+    finally:
+        _active_org.reset(token)
+
+
+__all__ = ["_bind_test_active_org", "set_active_org"]
diff --git a/services/crawler/tests/test_database.py b/services/crawler/tests/test_database.py
index 18aa21712f..763aabf6a6 100644
--- a/services/crawler/tests/test_database.py
+++ b/services/crawler/tests/test_database.py
@@ -1,4 +1,12 @@
-"""Tests for database pool initialization, including dimension mismatch guard."""
+"""Tests for database pool initialization, including the boot-time
+embedding-column dimension pin.
+
+The baseline migration declares `chunks.embedding` as bare `vector`
+(no dim). Without an explicit pin pgvector accepts mixed-dim inserts
+silently and the HNSW index can't be built. `init_pool` resolves
+the deployment-wide dim from the `default` org's provider catalog
+and `ALTER TABLE`-pins the column at boot.
+"""
 
 from contextlib import asynccontextmanager
 from unittest.mock import AsyncMock, patch
@@ -16,14 +24,9 @@ def _reset_pool():
     db_mod._pool = None
 
 
-def _fake_pool(stored_dims: int | None, col_type: str = "vector(1536)"):
-    """Build a mock asyncpg pool.
-
-    *stored_dims* is returned for the first fetchval (dimension check).
-    *col_type* is returned for the second fetchval (column type check).
-    """
+def _fake_pool():
+    """Build a mock asyncpg pool with a tracked single connection."""
     conn = AsyncMock()
-    conn.fetchval = AsyncMock(side_effect=[stored_dims, col_type])
     conn.execute = AsyncMock()
 
     pool = AsyncMock()
@@ -37,122 +40,75 @@ async def _acq(_pool, **_kw):
     return pool, _acq
 
 
-@pytest.mark.skip(
-    reason="Boot-time embedding-dimension guard was removed when crawler "
-    "became multi-org. Dim is now per-org provider catalog; pgvector enforces "
-    "column dim on insert + get_embedding_service refuses dim changes per-org."
-)
-class TestDimensionMismatchGuard:
-    @pytest.mark.asyncio
-    async def test_raises_on_dimension_mismatch(self):
-        fake_pool, acq = _fake_pool(stored_dims=3072)
-
-        with (
-            patch("app.services.database.asyncpg.create_pool", AsyncMock(return_value=fake_pool)),
-            patch("app.services.database.acquire_with_retry", acq),
-            patch("app.services.database.settings") as mock_settings,
-        ):
-            mock_settings.get_embedding_dimensions.return_value = 1536
-            mock_settings.database_url = "postgresql://test:test@localhost/test"
-
-            with pytest.raises(RuntimeError, match="dimension mismatch"):
-                await db_mod.init_pool()
-
-        assert db_mod._pool is None
-
+class TestEmbeddingColumnPin:
     @pytest.mark.asyncio
-    async def test_passes_when_dimensions_match(self):
-        fake_pool, acq = _fake_pool(stored_dims=1536, col_type="vector(1536)")
+    async def test_pins_column_at_boot(self):
+        """init_pool issues ALTER TABLE … TYPE vector(N) using default-org dim."""
+        fake_pool, acq = _fake_pool()
 
         with (
             patch("app.services.database.asyncpg.create_pool", AsyncMock(return_value=fake_pool)),
             patch("app.services.database.acquire_with_retry", acq),
             patch("app.services.database.settings") as mock_settings,
         ):
-            mock_settings.get_embedding_dimensions.return_value = 1536
-            mock_settings.database_url = "postgresql://test:test@localhost/test"
-
-            pool = await db_mod.init_pool()
-
-        assert pool is fake_pool
-
-    @pytest.mark.asyncio
-    async def test_passes_when_no_existing_data(self):
-        fake_pool, acq = _fake_pool(stored_dims=None, col_type="vector")
-
-        with (
-            patch("app.services.database.asyncpg.create_pool", AsyncMock(return_value=fake_pool)),
-            patch("app.services.database.acquire_with_retry", acq),
-            patch("app.services.database.settings") as mock_settings,
-        ):
-            mock_settings.get_embedding_dimensions.return_value = 1536
-            mock_settings.database_url = "postgresql://test:test@localhost/test"
-
-            pool = await db_mod.init_pool()
-
-        assert pool is fake_pool
-
-
-@pytest.mark.skip(
-    reason="Boot-time embedding-column ALTER was removed when crawler became "
-    "multi-org. Column type is now driven by the first INSERT under pgvector; "
-    "operators reconcile per-org provider catalogs manually if dims diverge."
-)
-class TestEmbeddingColumnPinning:
-    @pytest.mark.asyncio
-    async def test_alters_untyped_vector_column(self):
-        """When column is bare `vector`, init_pool pins it to vector(N)."""
-        fake_pool, acq = _fake_pool(stored_dims=None, col_type="vector")
-
-        with (
-            patch("app.services.database.asyncpg.create_pool", AsyncMock(return_value=fake_pool)),
-            patch("app.services.database.acquire_with_retry", acq),
-            patch("app.services.database.settings") as mock_settings,
-        ):
-            mock_settings.get_embedding_dimensions.return_value = 768
+            mock_settings.get_embedding_config.return_value = (
+                "https://api.example.com",
+                "sk-test",
+                "text-embedding-3-small",
+                1536,
+            )
             mock_settings.database_url = "postgresql://test:test@localhost/test"
 
             await db_mod.init_pool()
 
+        mock_settings.get_embedding_config.assert_called_once_with("default")
         conn = fake_pool._test_conn
         execute_calls = [str(c) for c in conn.execute.call_args_list]
-        assert any("ALTER TABLE" in c and "vector(768)" in c for c in execute_calls)
+        assert any("ALTER TABLE" in c and "vector(1536)" in c for c in execute_calls)
+        # HNSW index creation is attempted after the pin.
+        assert any("create_chunks_hnsw_index" in c for c in execute_calls)
 
     @pytest.mark.asyncio
-    async def test_skips_alter_when_already_typed(self):
-        """When column already has dimensions, no ALTER is issued."""
-        fake_pool, acq = _fake_pool(stored_dims=1536, col_type="vector(1536)")
+    async def test_uses_default_org_dim(self):
+        """ALTER TABLE uses whatever dim the default org's provider returns."""
+        fake_pool, acq = _fake_pool()
 
         with (
             patch("app.services.database.asyncpg.create_pool", AsyncMock(return_value=fake_pool)),
             patch("app.services.database.acquire_with_retry", acq),
             patch("app.services.database.settings") as mock_settings,
         ):
-            mock_settings.get_embedding_dimensions.return_value = 1536
+            mock_settings.get_embedding_config.return_value = (
+                "https://api.example.com",
+                "sk-test",
+                "nomic-embed-text",
+                768,
+            )
             mock_settings.database_url = "postgresql://test:test@localhost/test"
 
             await db_mod.init_pool()
 
         conn = fake_pool._test_conn
         execute_calls = [str(c) for c in conn.execute.call_args_list]
-        assert not any("ALTER TABLE" in c for c in execute_calls)
+        assert any("ALTER TABLE" in c and "vector(768)" in c for c in execute_calls)
 
     @pytest.mark.asyncio
-    async def test_repins_column_when_dimension_changed(self):
-        """When column is pinned to a different dimension and table is empty, re-pin."""
-        fake_pool, acq = _fake_pool(stored_dims=None, col_type="vector(2560)")
+    async def test_raises_when_default_org_provider_unconfigured(self):
+        """Without a default-org provider, boot fails loudly rather than
+        proceeding with an unpinned column (silent regression risk)."""
+        fake_pool, acq = _fake_pool()
 
         with (
             patch("app.services.database.asyncpg.create_pool", AsyncMock(return_value=fake_pool)),
             patch("app.services.database.acquire_with_retry", acq),
             patch("app.services.database.settings") as mock_settings,
         ):
-            mock_settings.get_embedding_dimensions.return_value = 1536
+            mock_settings.get_embedding_config.side_effect = ValueError(
+                "no embedding provider configured for org 'default'"
+            )
             mock_settings.database_url = "postgresql://test:test@localhost/test"
 
-            await db_mod.init_pool()
+            with pytest.raises(RuntimeError, match="default"):
+                await db_mod.init_pool()
 
-        conn = fake_pool._test_conn
-        execute_calls = [str(c) for c in conn.execute.call_args_list]
-        assert any("DROP INDEX" in c and "idx_pw_chunks_embedding_hnsw" in c for c in execute_calls)
-        assert any("ALTER TABLE" in c and "vector(1536)" in c for c in execute_calls)
+        assert db_mod._pool is None
diff --git a/services/crawler/tests/test_website_membership.py b/services/crawler/tests/test_website_membership.py
new file mode 100644
index 0000000000..c14214b53c
--- /dev/null
+++ b/services/crawler/tests/test_website_membership.py
@@ -0,0 +1,156 @@
+"""Tests for the per-org website_org_memberships layer.
+
+Covers `PgWebsiteStoreManager.register_website` / `begin_delete` /
+`get_due_websites` / `org_has_membership` against an in-memory
+asyncpg pool stand-in. The aim is to lock in the ref-counted delete
+semantics — websites/chunks rows are deployment-shared, but the
+"who can see this domain" decision is org-local.
+"""
+
+from contextlib import asynccontextmanager
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+from app.services.pg_website_store import PgWebsiteStoreManager
+
+pytestmark = pytest.mark.asyncio
+
+
+def _make_conn(*, fetchval_return=0, execute_return="DELETE 1", fetchrow_return=None):
+    """Build a per-test asyncpg connection stub with configurable returns."""
+    conn = AsyncMock()
+    conn.execute = AsyncMock(return_value=execute_return)
+    conn.fetchval = AsyncMock(return_value=fetchval_return)
+    conn.fetchrow = AsyncMock(return_value=fetchrow_return)
+    # Transactions are no-ops at this layer; just yield the same conn.
+    conn.transaction = MagicMock()
+    conn.transaction.return_value.__aenter__ = AsyncMock(return_value=None)
+    conn.transaction.return_value.__aexit__ = AsyncMock(return_value=None)
+    return conn
+
+
+def _patch_acquire(conn):
+    """Patch `acquire_with_retry` to yield our stub connection."""
+
+    @asynccontextmanager
+    async def _acq(_pool, **_kw):
+        yield conn
+
+    return patch("app.services.pg_website_store.acquire_with_retry", _acq)
+
+
+class TestRegisterWebsite:
+    async def test_first_membership_reports_first_membership_true(self):
+        conn = _make_conn(
+            fetchval_return=1,  # total members after insert = 1
+            fetchrow_return={"inserted": True},
+        )
+        with _patch_acquire(conn):
+            manager = PgWebsiteStoreManager(pool=MagicMock())
+            result = await manager.register_website(domain="example.com", scan_interval=3600, org_slug="acme")
+
+        assert result["first_membership"] is True
+        assert result["domain"] == "example.com"
+        assert result["scan_interval"] == 3600
+
+    async def test_second_org_joining_does_not_report_first_membership(self):
+        conn = _make_conn(
+            fetchval_return=2,  # total members after insert = 2
+            fetchrow_return={"inserted": True},
+        )
+        with _patch_acquire(conn):
+            manager = PgWebsiteStoreManager(pool=MagicMock())
+            result = await manager.register_website(domain="example.com", scan_interval=3600, org_slug="beta")
+
+        assert result["first_membership"] is False
+
+    async def test_idempotent_when_same_org_re_registers(self):
+        # ON CONFLICT DO NOTHING → no RETURNING row, total stays as-is.
+        conn = _make_conn(
+            fetchval_return=1,
+            fetchrow_return=None,
+        )
+        with _patch_acquire(conn):
+            manager = PgWebsiteStoreManager(pool=MagicMock())
+            result = await manager.register_website(domain="example.com", scan_interval=3600, org_slug="acme")
+
+        assert result["first_membership"] is False
+
+
+class TestBeginDelete:
+    async def test_removes_website_when_last_membership(self):
+        conn = _make_conn(
+            fetchval_return=0,  # no memberships left after delete
+            execute_return="DELETE 1",  # the membership row was deleted
+            fetchrow_return={"domain": "example.com"},  # website marked deleting
+        )
+        with _patch_acquire(conn):
+            manager = PgWebsiteStoreManager(pool=MagicMock())
+            result = await manager.begin_delete("example.com", "acme")
+
+        assert result == {"removed_membership": True, "removed_website": True}
+
+    async def test_keeps_website_when_other_orgs_remain(self):
+        conn = _make_conn(
+            fetchval_return=2,  # 2 other orgs still tracking
+            execute_return="DELETE 1",
+            fetchrow_return=None,
+        )
+        with _patch_acquire(conn):
+            manager = PgWebsiteStoreManager(pool=MagicMock())
+            result = await manager.begin_delete("example.com", "acme")
+
+        assert result == {"removed_membership": True, "removed_website": False}
+
+    async def test_no_membership_returns_false_false(self):
+        """Caller's org never tracked this domain — neither rm-membership nor rm-website fires."""
+        conn = _make_conn(
+            fetchval_return=3,
+            execute_return="DELETE 0",  # no row matched the (domain, org) tuple
+            fetchrow_return=None,
+        )
+        with _patch_acquire(conn):
+            manager = PgWebsiteStoreManager(pool=MagicMock())
+            result = await manager.begin_delete("example.com", "ghost")
+
+        assert result == {"removed_membership": False, "removed_website": False}
+
+
+class TestOrgHasMembership:
+    async def test_returns_true_when_row_exists(self):
+        conn = _make_conn(fetchrow_return={"?column?": 1})
+        with _patch_acquire(conn):
+            manager = PgWebsiteStoreManager(pool=MagicMock())
+            assert await manager.org_has_membership("example.com", "acme") is True
+
+    async def test_returns_false_when_row_missing(self):
+        conn = _make_conn(fetchrow_return=None)
+        with _patch_acquire(conn):
+            manager = PgWebsiteStoreManager(pool=MagicMock())
+            assert await manager.org_has_membership("example.com", "ghost") is False
+
+
+class TestGetDueWebsites:
+    async def test_includes_owner_org_slug(self):
+        conn = _make_conn()
+        # fetch() returns rows; the test cares about shape, not SQL.
+        conn.fetch = AsyncMock(
+            return_value=[
+                {
+                    "domain": "example.com",
+                    "status": "idle",
+                    "scan_interval": 3600,
+                    "last_scanned_at": None,
+                    "error": None,
+                    "owner_org_slug": "acme",
+                }
+            ]
+        )
+        with _patch_acquire(conn):
+            manager = PgWebsiteStoreManager(pool=MagicMock())
+            due = await manager.get_due_websites()
+
+        assert len(due) == 1
+        assert due[0]["domain"] == "example.com"
+        assert due[0]["owner_org_slug"] == "acme"
diff --git a/services/crawler/tests/test_websites_router.py b/services/crawler/tests/test_websites_router.py
index 53df8804f7..91f107e03d 100644
--- a/services/crawler/tests/test_websites_router.py
+++ b/services/crawler/tests/test_websites_router.py
@@ -16,6 +16,9 @@
 def mock_manager():
     manager = AsyncMock()
     manager.get_site_store = MagicMock()
+    # Default: caller's org has membership (tests that exercise the
+    # 404-on-missing-membership path can override this).
+    manager.org_has_membership.return_value = True
     app.state.pg_store_manager = manager
     yield manager
     del app.state.pg_store_manager
@@ -40,12 +43,13 @@ def _website_row(domain="example.com", scan_interval=21600, **overrides):
 
 
 class TestRegisterWebsite:
-    async def test_success(self, mock_manager):
+    async def test_success_first_membership_triggers_scan(self, mock_manager):
         mock_manager.get_website.return_value = None
         mock_manager.register_website.return_value = {
             "domain": "example.com",
             "status": "idle",
             "scan_interval": 21600,
+            "first_membership": True,
         }
 
         with patch("app.routers.websites.trigger_scan") as mock_trigger:
@@ -59,77 +63,60 @@ async def test_success(self, mock_manager):
         data = response.json()
         assert data["domain"] == "example.com"
         assert data["status"] == "scanning"
-        assert data["scan_interval"] == 21600
         mock_manager.register_website.assert_awaited_once_with(
             domain="example.com",
             scan_interval=21600,
+            org_slug="test-org",
         )
         mock_trigger.assert_called_once()
 
-    async def test_normalizes_full_url_to_domain(self, mock_manager):
+    async def test_second_org_joining_does_not_retrigger_scan(self, mock_manager):
+        """If the domain is already tracked by another org, the new
+        membership reuses the existing crawl; trigger_scan should NOT fire."""
+        mock_manager.get_website.return_value = _website_row(status="active")
         mock_manager.register_website.return_value = {
-            "domain": "www.wisekey.com",
+            "domain": "example.com",
             "status": "idle",
             "scan_interval": 21600,
+            "first_membership": False,
         }
-        mock_manager.get_website.return_value = _website_row(domain="www.wisekey.com")
 
-        with patch("app.routers.websites.trigger_scan"):
+        with patch("app.routers.websites.trigger_scan") as mock_trigger:
             async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
                 response = await client.post(
                     "/api/v1/websites",
-                    json={"domain": "https://www.wisekey.com", "scan_interval": 21600},
+                    json={"domain": "example.com", "scan_interval": 21600},
                 )
 
         assert response.status_code == 200
-        mock_manager.register_website.assert_awaited_once_with(
-            domain="www.wisekey.com",
-            scan_interval=21600,
-        )
+        data = response.json()
+        # Status reflects the already-tracked website, not "scanning"
+        assert data["status"] == "active"
+        mock_trigger.assert_not_called()
 
-    async def test_uses_default_scan_interval(self, mock_manager):
+    async def test_normalizes_full_url_to_domain(self, mock_manager):
         mock_manager.register_website.return_value = {
-            "domain": "example.com",
+            "domain": "www.wisekey.com",
             "status": "idle",
             "scan_interval": 21600,
+            "first_membership": True,
         }
-        mock_manager.get_website.return_value = _website_row()
+        mock_manager.get_website.return_value = _website_row(domain="www.wisekey.com")
 
         with patch("app.routers.websites.trigger_scan"):
             async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
                 response = await client.post(
                     "/api/v1/websites",
-                    json={"domain": "example.com"},
+                    json={"domain": "https://www.wisekey.com", "scan_interval": 21600},
                 )
 
         assert response.status_code == 200
         mock_manager.register_website.assert_awaited_once_with(
-            domain="example.com",
+            domain="www.wisekey.com",
             scan_interval=21600,
+            org_slug="test-org",
         )
 
-    async def test_returns_scanning_status_immediately(self, mock_manager):
-        mock_manager.get_website.return_value = None
-        mock_manager.register_website.return_value = {
-            "domain": "example.com",
-            "status": "idle",
-            "scan_interval": 21600,
-        }
-
-        with patch("app.routers.websites.trigger_scan"):
-            async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
-                response = await client.post(
-                    "/api/v1/websites",
-                    json={"domain": "example.com"},
-                )
-
-        assert response.status_code == 200
-        data = response.json()
-        assert data["title"] is None
-        assert data["page_count"] == 0
-        assert data["crawled_count"] == 0
-        assert data["status"] == "scanning"
-
     async def test_409_when_domain_is_deleting(self, mock_manager):
         mock_manager.get_website.return_value = _website_row(status="deleting")
 
@@ -173,12 +160,23 @@ async def test_success(self, mock_manager):
         assert data["domain"] == "example.com"
         assert data["scan_interval"] == 3600
         assert data["status"] == "active"
-        mock_manager.get_website.assert_awaited_once_with("example.com")
         mock_manager.update_scan_interval.assert_awaited_once_with(
             domain="example.com",
             scan_interval=3600,
         )
 
+    async def test_404_when_caller_org_has_no_membership(self, mock_manager):
+        mock_manager.org_has_membership.return_value = False
+
+        async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
+            response = await client.patch(
+                "/api/v1/websites/example.com",
+                json={"scan_interval": 3600},
+            )
+
+        assert response.status_code == 404
+        mock_manager.update_scan_interval.assert_not_awaited()
+
     async def test_404_when_not_found(self, mock_manager):
         mock_manager.get_website.return_value = None
 
@@ -205,19 +203,6 @@ async def test_409_when_domain_is_deleting(self, mock_manager):
         assert "currently being deleted" in response.json()["detail"]
         mock_manager.update_scan_interval.assert_not_awaited()
 
-    async def test_500_on_error(self, mock_manager):
-        mock_manager.get_website.return_value = _website_row()
-        mock_manager.update_scan_interval.side_effect = RuntimeError("db error")
-
-        async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
-            response = await client.patch(
-                "/api/v1/websites/example.com",
-                json={"scan_interval": 3600},
-            )
-
-        assert response.status_code == 500
-        assert response.json()["detail"] == "Failed to update website"
-
 
 class TestGetWebsiteInfo:
     async def test_success(self, mock_manager):
@@ -243,39 +228,33 @@ async def test_success(self, mock_manager):
         data = response.json()
         assert data["domain"] == "example.com"
         assert data["title"] == "Example"
-        assert data["description"] == "An example site"
-        assert data["page_count"] == 50
-        assert data["crawled_count"] == 42
         assert data["status"] == "active"
-        assert data["scan_interval"] == 3600
-        assert data["last_scanned_at"] is not None
-        assert data["error"] is None
-        assert data["created_at"] is not None
-        assert data["updated_at"] is not None
-        mock_manager.get_website.assert_awaited_once_with("example.com")
 
-    async def test_404_when_not_found(self, mock_manager):
-        mock_manager.get_website.return_value = None
+    async def test_404_when_caller_org_has_no_membership(self, mock_manager):
+        mock_manager.org_has_membership.return_value = False
 
         async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
-            response = await client.get("/api/v1/websites/unknown.com")
+            response = await client.get("/api/v1/websites/example.com")
 
         assert response.status_code == 404
-        assert response.json()["detail"] == "Website not found: unknown.com"
+        mock_manager.get_website.assert_not_awaited()
 
-    async def test_500_on_error(self, mock_manager):
-        mock_manager.get_website.side_effect = RuntimeError("db error")
+    async def test_404_when_not_found(self, mock_manager):
+        mock_manager.get_website.return_value = None
 
         async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
-            response = await client.get("/api/v1/websites/example.com")
+            response = await client.get("/api/v1/websites/unknown.com")
 
-        assert response.status_code == 500
-        assert response.json()["detail"] == "Failed to get website info"
+        assert response.status_code == 404
+        assert response.json()["detail"] == "Website not found: unknown.com"
 
 
 class TestDeregisterWebsite:
-    async def test_returns_202_accepted(self, mock_manager):
-        mock_manager.begin_delete.return_value = True
+    async def test_removes_website_when_last_membership(self, mock_manager):
+        mock_manager.begin_delete.return_value = {
+            "removed_membership": True,
+            "removed_website": True,
+        }
 
         with patch("app.routers.websites._spawn_delete_task") as mock_spawn:
             async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
@@ -285,12 +264,32 @@ async def test_returns_202_accepted(self, mock_manager):
         data = response.json()
         assert data["domain"] == "example.com"
         assert data["status"] == "deleting"
-        mock_manager.begin_delete.assert_awaited_once_with("example.com")
+        mock_manager.begin_delete.assert_awaited_once_with("example.com", "test-org")
         mock_spawn.assert_called_once_with(mock_manager, "example.com")
 
-    async def test_404_when_not_found(self, mock_manager):
-        mock_manager.begin_delete.return_value = False
-        mock_manager.get_website.return_value = None
+    async def test_membership_only_when_other_orgs_remain(self, mock_manager):
+        """Other orgs still track this domain: only the caller's membership
+        is removed; website data and crawl schedule stay intact."""
+        mock_manager.begin_delete.return_value = {
+            "removed_membership": True,
+            "removed_website": False,
+        }
+
+        with patch("app.routers.websites._spawn_delete_task") as mock_spawn:
+            async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
+                response = await client.delete("/api/v1/websites/example.com")
+
+        assert response.status_code == 200
+        data = response.json()
+        assert data["status"] == "membership_removed"
+        # Importantly: no background delete task — data must survive.
+        mock_spawn.assert_not_called()
+
+    async def test_404_when_caller_never_had_membership(self, mock_manager):
+        mock_manager.begin_delete.return_value = {
+            "removed_membership": False,
+            "removed_website": False,
+        }
 
         async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
             response = await client.delete("/api/v1/websites/unknown.com")
@@ -298,18 +297,6 @@ async def test_404_when_not_found(self, mock_manager):
         assert response.status_code == 404
         assert response.json()["detail"] == "Website not found: unknown.com"
 
-    async def test_already_deleting_returns_202(self, mock_manager):
-        mock_manager.begin_delete.return_value = False
-        mock_manager.get_website.return_value = _website_row(status="deleting")
-
-        async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
-            response = await client.delete("/api/v1/websites/example.com")
-
-        assert response.status_code == 202
-        data = response.json()
-        assert data["domain"] == "example.com"
-        assert data["status"] == "deleting"
-
     async def test_500_on_error(self, mock_manager):
         mock_manager.begin_delete.side_effect = RuntimeError("db error")
 
@@ -332,12 +319,6 @@ async def test_success_with_pagination(self, mock_manager):
                 "status": "active",
                 "last_crawled_at": 1700000000.0,
             },
-            {
-                "url": "https://example.com/page2",
-                "content_hash": "def456",
-                "status": "active",
-                "last_crawled_at": 1700001000.0,
-            },
         ]
         mock_site_store.get_total_count.return_value = 50
 
@@ -347,66 +328,12 @@ async def test_success_with_pagination(self, mock_manager):
         assert response.status_code == 200
         data = response.json()
         assert data["domain"] == "example.com"
-        assert len(data["urls"]) == 2
-        assert data["urls"][0]["url"] == "https://example.com/page1"
-        assert data["urls"][0]["content_hash"] == "abc123"
-        assert data["urls"][1]["url"] == "https://example.com/page2"
         assert data["total"] == 50
-        assert data["offset"] == 0
-        assert data["has_more"] is True
-        mock_site_store.get_urls_page.assert_awaited_once_with(offset=0, limit=2, status=None)
-        mock_site_store.get_total_count.assert_awaited_once_with(status=None)
 
-    async def test_has_more_false_when_at_end(self, mock_manager):
-        mock_manager.get_website.return_value = {"domain": "example.com"}
-        mock_site_store = AsyncMock()
-        mock_manager.get_site_store.return_value = mock_site_store
-        mock_site_store.get_urls_page.return_value = [
-            {
-                "url": "https://example.com/last",
-                "content_hash": "xyz",
-                "status": "active",
-                "last_crawled_at": None,
-            },
-        ]
-        mock_site_store.get_total_count.return_value = 1
-
-        async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
-            response = await client.get("/api/v1/websites/example.com/urls?offset=0&limit=100")
-
-        assert response.status_code == 200
-        data = response.json()
-        assert data["has_more"] is False
-        assert data["total"] == 1
-
-    async def test_status_filter(self, mock_manager):
-        mock_manager.get_website.return_value = {"domain": "example.com"}
-        mock_site_store = AsyncMock()
-        mock_manager.get_site_store.return_value = mock_site_store
-        mock_site_store.get_urls_page.return_value = []
-        mock_site_store.get_total_count.return_value = 0
-
-        async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
-            response = await client.get("/api/v1/websites/example.com/urls?status=active")
-
-        assert response.status_code == 200
-        mock_site_store.get_urls_page.assert_awaited_once_with(offset=0, limit=100, status="active")
-        mock_site_store.get_total_count.assert_awaited_once_with(status="active")
-
-    async def test_404_when_website_not_found(self, mock_manager):
-        mock_manager.get_website.return_value = None
-
-        async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
-            response = await client.get("/api/v1/websites/unknown.com/urls")
-
-        assert response.status_code == 404
-        assert response.json()["detail"] == "Website not found: unknown.com"
-
-    async def test_500_on_error(self, mock_manager):
-        mock_manager.get_website.side_effect = RuntimeError("db error")
+    async def test_404_when_caller_org_has_no_membership(self, mock_manager):
+        mock_manager.org_has_membership.return_value = False
 
         async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
             response = await client.get("/api/v1/websites/example.com/urls")
 
-        assert response.status_code == 500
-        assert response.json()["detail"] == "Failed to get website URLs"
+        assert response.status_code == 404
diff --git a/services/platform/app/features/organization/components/organization-form.tsx b/services/platform/app/features/organization/components/organization-form.tsx
index 6d9631230a..559c2a1a7b 100644
--- a/services/platform/app/features/organization/components/organization-form.tsx
+++ b/services/platform/app/features/organization/components/organization-form.tsx
@@ -20,6 +20,7 @@ import { toast } from '@/app/hooks/use-toast';
 import { api } from '@/convex/_generated/api';
 import { authClient } from '@/lib/auth-client';
 import { useT } from '@/lib/i18n/client';
+import { isReservedOrgSlug } from '@/lib/shared/constants/reserved-org-slugs';
 
 import { useInitializeDefaultWorkflows } from '../hooks/actions';
 
@@ -49,6 +50,17 @@ export function OrganizationForm() {
           .regex(
             /^[A-Za-z0-9][A-Za-z0-9 _-]*$/,
             'Use letters, digits, spaces, hyphens, and underscores only, starting with a letter or digit.',
+          )
+          .refine(
+            (name) => {
+              const derived = name
+                .trim()
+                .toLowerCase()
+                .replace(/[^a-z0-9]+/g, '-')
+                .replace(/^-+|-+$/g, '');
+              return !isReservedOrgSlug(derived);
+            },
+            { message: 'This name is reserved by the platform.' },
           ),
       }),
     [t],
diff --git a/services/platform/convex/_generated/api.d.ts b/services/platform/convex/_generated/api.d.ts
index fa603e7e06..de08adf12d 100644
--- a/services/platform/convex/_generated/api.d.ts
+++ b/services/platform/convex/_generated/api.d.ts
@@ -452,6 +452,7 @@ import type * as lib_crypto_hex_to_bytes from "../lib/crypto/hex_to_bytes.js";
 import type * as lib_crypto_internal_actions from "../lib/crypto/internal_actions.js";
 import type * as lib_debug_log from "../lib/debug_log.js";
 import type * as lib_error_classification from "../lib/error_classification.js";
+import type * as lib_errors_upstream_http_error from "../lib/errors/upstream_http_error.js";
 import type * as lib_file_io from "../lib/file_io.js";
 import type * as lib_fnv1a from "../lib/fnv1a.js";
 import type * as lib_fuzzy_match from "../lib/fuzzy_match.js";
@@ -1549,6 +1550,7 @@ declare const fullApi: ApiFromModules<{
   "lib/crypto/internal_actions": typeof lib_crypto_internal_actions;
   "lib/debug_log": typeof lib_debug_log;
   "lib/error_classification": typeof lib_error_classification;
+  "lib/errors/upstream_http_error": typeof lib_errors_upstream_http_error;
   "lib/file_io": typeof lib_file_io;
   "lib/fnv1a": typeof lib_fnv1a;
   "lib/fuzzy_match": typeof lib_fuzzy_match;
diff --git a/services/platform/convex/agent_tools/documents/document_retrieve_tool.test.ts b/services/platform/convex/agent_tools/documents/document_retrieve_tool.test.ts
index d24f8c079f..00c6284b27 100644
--- a/services/platform/convex/agent_tools/documents/document_retrieve_tool.test.ts
+++ b/services/platform/convex/agent_tools/documents/document_retrieve_tool.test.ts
@@ -347,7 +347,7 @@ describe('retrieveDocument helper', () => {
 
     await expect(
       retrieveDocument(ctx as never, { fileId: 'file-storage-123' }),
-    ).rejects.toThrow('RAG service error (500)');
+    ).rejects.toThrow(/HTTP 500/);
   });
 
   it('wraps non-JSON response parse error', async () => {
diff --git a/services/platform/convex/agent_tools/documents/fetch_document_comparison.test.ts b/services/platform/convex/agent_tools/documents/fetch_document_comparison.test.ts
index fa26878886..1e880b455e 100644
--- a/services/platform/convex/agent_tools/documents/fetch_document_comparison.test.ts
+++ b/services/platform/convex/agent_tools/documents/fetch_document_comparison.test.ts
@@ -245,7 +245,7 @@ describe('fetchDocumentComparison', () => {
 
     await expect(
       fetchDocumentComparison(BASE_FILE_ID, COMP_FILE_ID),
-    ).rejects.toThrow('RAG service error (500)');
+    ).rejects.toThrow(/HTTP 500/);
   });
 
   it('throws timeout error when fetch is aborted', async () => {
diff --git a/services/platform/convex/agent_tools/documents/fetch_document_content.test.ts b/services/platform/convex/agent_tools/documents/fetch_document_content.test.ts
index 49af0c4af3..a5282d378a 100644
--- a/services/platform/convex/agent_tools/documents/fetch_document_content.test.ts
+++ b/services/platform/convex/agent_tools/documents/fetch_document_content.test.ts
@@ -199,9 +199,7 @@ describe('fetchDocumentContent', () => {
       { preconnect: vi.fn() },
     );
 
-    await expect(fetchDocumentContent(FILE_ID)).rejects.toThrow(
-      'RAG service error (500)',
-    );
+    await expect(fetchDocumentContent(FILE_ID)).rejects.toThrow(/HTTP 500/);
   });
 
   it('includes error body text in non-ok error message', async () => {
diff --git a/services/platform/convex/agent_tools/documents/helpers/fetch_document_comparison.ts b/services/platform/convex/agent_tools/documents/helpers/fetch_document_comparison.ts
index 626cd84796..abe10b45c2 100644
--- a/services/platform/convex/agent_tools/documents/helpers/fetch_document_comparison.ts
+++ b/services/platform/convex/agent_tools/documents/helpers/fetch_document_comparison.ts
@@ -1,4 +1,5 @@
 import { fetchJson } from '../../../../lib/utils/type-cast-helpers';
+import { UpstreamHttpError } from '../../../lib/errors/upstream_http_error';
 import { ragFetch } from '../../../lib/helpers/rag_config';
 
 const FETCH_TIMEOUT_MS = 120_000;
@@ -143,8 +144,11 @@ export async function fetchDocumentComparison(
 
     if (!response.ok) {
       const errorText = await response.text().catch(() => '');
-      throw new Error(
-        `RAG service error (${response.status}): ${errorText || 'Unknown error'}`,
+      throw UpstreamHttpError.fromResponse(
+        'rag',
+        response,
+        errorText,
+        '/api/v1/documents/compare',
       );
     }
 
@@ -238,8 +242,11 @@ export async function fetchDocumentComparisonByUrls(
 
     if (!response.ok) {
       const errorText = await response.text().catch(() => '');
-      throw new Error(
-        `RAG service error (${response.status}): ${errorText || 'Unknown error'}`,
+      throw UpstreamHttpError.fromResponse(
+        'rag',
+        response,
+        errorText,
+        '/api/v1/documents/compare-files',
       );
     }
 
diff --git a/services/platform/convex/agent_tools/documents/helpers/fetch_document_content.ts b/services/platform/convex/agent_tools/documents/helpers/fetch_document_content.ts
index 208aa81f57..5e2e50f47d 100644
--- a/services/platform/convex/agent_tools/documents/helpers/fetch_document_content.ts
+++ b/services/platform/convex/agent_tools/documents/helpers/fetch_document_content.ts
@@ -1,4 +1,5 @@
 import { fetchJson } from '../../../../lib/utils/type-cast-helpers';
+import { UpstreamHttpError } from '../../../lib/errors/upstream_http_error';
 import { ragFetch } from '../../../lib/helpers/rag_config';
 
 const MAX_CONTENT_CHARS = 50_000;
@@ -64,9 +65,7 @@ export async function fetchDocumentContent(
 
     if (!response.ok) {
       const errorText = await response.text().catch(() => '');
-      throw new Error(
-        `RAG service error (${response.status}): ${errorText || 'Unknown error'}`,
-      );
+      throw UpstreamHttpError.fromResponse('rag', response, errorText, path);
     }
 
     let result: RagContentResponse;
diff --git a/services/platform/convex/agent_tools/rag/rag_search_tool.ts b/services/platform/convex/agent_tools/rag/rag_search_tool.ts
index e6f8da1884..aa0e834aa4 100644
--- a/services/platform/convex/agent_tools/rag/rag_search_tool.ts
+++ b/services/platform/convex/agent_tools/rag/rag_search_tool.ts
@@ -21,6 +21,7 @@ import { fetchJson } from '../../../lib/utils/type-cast-helpers';
 import { internal } from '../../_generated/api';
 import { stripReservedPromptTags } from '../../lib/agent_response/sanitize_prompt';
 import { createDebugLog } from '../../lib/debug_log';
+import { UpstreamHttpError } from '../../lib/errors/upstream_http_error';
 import { orgSlugFromId } from '../../lib/helpers/org_slug';
 import { ragFetch } from '../../lib/helpers/rag_config';
 import { toId } from '../../lib/type_cast_helpers';
@@ -284,10 +285,16 @@ RESPONSE (list_indexed):
 
         if (!response.ok) {
           const errorText = await response.text().catch(() => '');
-          return {
-            success: false,
-            response: `Failed to retrieve document: ${response.status} ${errorText}`,
-          };
+          const err = UpstreamHttpError.fromResponse(
+            'rag',
+            response,
+            errorText,
+            `/api/v1/documents/${args.fileId}/content`,
+          );
+          // Agent-facing tool path: return the safe summary instead of throwing
+          // so the agent can recover (e.g. show the user "not found" rather than
+          // an opaque tool error).
+          return { success: false, response: err.safeMessage };
         }
 
         interface RetrieveResponse {
@@ -452,7 +459,12 @@ RESPONSE (list_indexed):
 
         if (!response.ok) {
           const errorText = await response.text();
-          throw new Error(`RAG service error: ${response.status} ${errorText}`);
+          throw UpstreamHttpError.fromResponse(
+            'rag',
+            response,
+            errorText,
+            '/api/v1/search',
+          );
         }
 
         const result = await fetchJson<SearchResponse>(response);
diff --git a/services/platform/convex/agent_tools/web/helpers/fetch_and_extract.ts b/services/platform/convex/agent_tools/web/helpers/fetch_and_extract.ts
index 1f412d152a..4e63e05221 100644
--- a/services/platform/convex/agent_tools/web/helpers/fetch_and_extract.ts
+++ b/services/platform/convex/agent_tools/web/helpers/fetch_and_extract.ts
@@ -9,6 +9,7 @@ import type { ToolCtx } from '@convex-dev/agent';
 
 import { fetchJson } from '../../../../lib/utils/type-cast-helpers';
 import { createDebugLog } from '../../../lib/debug_log';
+import { orgSlugFromId } from '../../../lib/helpers/org_slug';
 import { getCrawlerServiceUrl } from './get_crawler_service_url';
 import type { WebFetchUrlResult, WebFetchExtractApiResponse } from './types';
 
@@ -26,6 +27,11 @@ export async function fetchAndExtract(
   const crawlerServiceUrl = getCrawlerServiceUrl(ctx.variables);
   const apiUrl = `${crawlerServiceUrl}/api/v1/web/fetch-and-extract`;
 
+  if (!ctx.organizationId) {
+    throw new Error('fetch_and_extract requires organizationId in ToolCtx.');
+  }
+  const orgSlug = await orgSlugFromId(ctx, ctx.organizationId);
+
   debugLog('tool:web:fetch_and_extract start', {
     url: args.url,
     hasInstruction: !!args.instruction,
@@ -37,7 +43,10 @@ export async function fetchAndExtract(
 
     const response = await fetch(apiUrl, {
       method: 'POST',
-      headers: { 'Content-Type': 'application/json' },
+      headers: {
+        'Content-Type': 'application/json',
+        'x-tale-org': orgSlug,
+      },
       body: JSON.stringify({
         url: args.url,
         instruction: args.instruction,
diff --git a/services/platform/convex/auth.ts b/services/platform/convex/auth.ts
index 733f566f16..37b5102648 100644
--- a/services/platform/convex/auth.ts
+++ b/services/platform/convex/auth.ts
@@ -12,6 +12,7 @@ import {
   ownerAc,
 } from 'better-auth/plugins/organization/access';
 
+import { isReservedOrgSlug } from '../lib/shared/constants/reserved-org-slugs';
 import { isRecord, getString } from '../lib/utils/type-guards';
 import { components, internal } from './_generated/api';
 import { DataModel } from './_generated/dataModel';
@@ -575,6 +576,27 @@ export const getAuthOptions = (ctx: GenericCtx<DataModel>) => {
           beforeCreateOrganization: async (data) => {
             const slug = data.organization.slug;
             if (!slug) return;
+            // Refuse reserved slugs ("default") that the platform pins
+            // global resources to (branding, retention defaults).
+            // Without this, an open-signup user could claim "default"
+            // before the platform seed runs and inherit branding-admin.
+            // Exception: the platform's own first-run seed creates
+            // `default` when no orgs exist yet — let that one through.
+            if (isReservedOrgSlug(slug)) {
+              const anyOrg = await ctx.runQuery(
+                components.betterAuth.adapter.findMany,
+                {
+                  model: 'organization',
+                  paginationOpts: { cursor: null, numItems: 1 },
+                  where: [],
+                },
+              );
+              if (anyOrg && anyOrg.page.length > 0) {
+                throw new APIError('BAD_REQUEST', {
+                  message: `Organization slug "${slug}" is reserved by the platform.`,
+                });
+              }
+            }
             // Convex has no unique-index primitive, so enforce slug uniqueness
             // at application level before Better Auth's adapter writes the row.
             const existing = await ctx.runQuery(
diff --git a/services/platform/convex/branding/internal_queries.ts b/services/platform/convex/branding/internal_queries.ts
index 4b604b6b6f..6c1269ce33 100644
--- a/services/platform/convex/branding/internal_queries.ts
+++ b/services/platform/convex/branding/internal_queries.ts
@@ -1,22 +1,44 @@
 import { v } from 'convex/values';
 
+import { getString, isRecord } from '../../lib/utils/type-guards';
 import { components } from '../_generated/api';
 import { internalQuery } from '../_generated/server';
 import { toPublicUrl } from '../lib/helpers/public_storage_url';
 import { isAdmin } from '../lib/rls/helpers/role_helpers';
 
 const GLOBAL_BINDING_KEY = 'global';
+const DEFAULT_ORG_SLUG = 'default';
 
+/**
+ * Branding is pinned to the `default` org (see `branding/file_actions.ts`
+ * doc comment) — so admin authority over branding must require admin role
+ * IN THE DEFAULT ORG SPECIFICALLY, not "admin in any org". Without this
+ * narrowing, an admin in any user-created org could mutate the platform's
+ * global branding.
+ */
 export const isCallerAdmin = internalQuery({
   args: { userId: v.string() },
   returns: v.boolean(),
   handler: async (ctx, args) => {
+    const orgRes = await ctx.runQuery(components.betterAuth.adapter.findMany, {
+      model: 'organization',
+      paginationOpts: { cursor: null, numItems: 1 },
+      where: [{ field: 'slug', value: DEFAULT_ORG_SLUG, operator: 'eq' }],
+    });
+    const orgRow = orgRes?.page?.[0];
+    if (!isRecord(orgRow)) return false;
+    const defaultOrgId = getString(orgRow, '_id');
+    if (!defaultOrgId) return false;
+
     const memberRes = await ctx.runQuery(
       components.betterAuth.adapter.findMany,
       {
         model: 'member',
-        paginationOpts: { cursor: null, numItems: 10 },
-        where: [{ field: 'userId', value: args.userId, operator: 'eq' }],
+        paginationOpts: { cursor: null, numItems: 1 },
+        where: [
+          { field: 'userId', value: args.userId, operator: 'eq' },
+          { field: 'organizationId', value: defaultOrgId, operator: 'eq' },
+        ],
       },
     );
     for (const member of memberRes?.page ?? []) {
diff --git a/services/platform/convex/documents/generate_document.ts b/services/platform/convex/documents/generate_document.ts
index ca2abdbf4d..a26e637d30 100644
--- a/services/platform/convex/documents/generate_document.ts
+++ b/services/platform/convex/documents/generate_document.ts
@@ -10,6 +10,7 @@ import { internal } from '../_generated/api';
 import type { Id } from '../_generated/dataModel';
 import type { ActionCtx } from '../_generated/server';
 import { createDebugLog } from '../lib/debug_log';
+import { orgSlugFromId } from '../lib/helpers/org_slug';
 import {
   buildDownloadUrl,
   buildRequestBody,
@@ -32,6 +33,7 @@ export async function generateDocument(
 
   const endpointPath = getEndpointPath(args.sourceType, args.outputFormat);
   const apiUrl = `${crawlerUrl}${endpointPath}`;
+  const orgSlug = await orgSlugFromId(ctx, args.organizationId);
 
   const requestBody = buildRequestBody(
     args.sourceType,
@@ -55,7 +57,10 @@ export async function generateDocument(
 
   const response = await fetch(apiUrl, {
     method: 'POST',
-    headers: { 'Content-Type': 'application/json' },
+    headers: {
+      'Content-Type': 'application/json',
+      'x-tale-org': orgSlug,
+    },
     body: JSON.stringify(requestBody),
   });
 
diff --git a/services/platform/convex/documents/generate_docx.ts b/services/platform/convex/documents/generate_docx.ts
index 2f7a652360..32d4b5c96e 100644
--- a/services/platform/convex/documents/generate_docx.ts
+++ b/services/platform/convex/documents/generate_docx.ts
@@ -12,6 +12,7 @@ import { internal } from '../_generated/api';
 import type { Id } from '../_generated/dataModel';
 import type { ActionCtx } from '../_generated/server';
 import { createDebugLog } from '../lib/debug_log';
+import { orgSlugFromId } from '../lib/helpers/org_slug';
 import { buildDownloadUrl, getCrawlerUrl } from './generate_document_helpers';
 
 const debugLog = createDebugLog('DEBUG_DOCUMENTS', '[Documents]');
@@ -62,6 +63,7 @@ export async function generateDocx(
 ): Promise<GenerateDocxResult> {
   const crawlerUrl = getCrawlerUrl();
   const apiUrl = `${crawlerUrl}/api/v1/docx`;
+  const orgSlug = await orgSlugFromId(ctx, args.organizationId);
 
   const requestBody = {
     content: args.content,
@@ -74,7 +76,10 @@ export async function generateDocx(
 
   const response = await fetch(apiUrl, {
     method: 'POST',
-    headers: { 'Content-Type': 'application/json' },
+    headers: {
+      'Content-Type': 'application/json',
+      'x-tale-org': orgSlug,
+    },
     body: JSON.stringify(requestBody),
   });
 
diff --git a/services/platform/convex/file_metadata/internal_actions.ts b/services/platform/convex/file_metadata/internal_actions.ts
index 8b615fe30b..2fb5e85fa4 100644
--- a/services/platform/convex/file_metadata/internal_actions.ts
+++ b/services/platform/convex/file_metadata/internal_actions.ts
@@ -7,6 +7,7 @@ import { isRecord, getNumber } from '../../lib/utils/type-guards';
 import { internal } from '../_generated/api';
 import { internalAction } from '../_generated/server';
 import { getCrawlerUrl } from '../documents/generate_document_helpers';
+import { orgSlugFromId } from '../lib/helpers/org_slug';
 import { ragAction } from '../workflow_engine/action_defs/rag/rag_action';
 
 /**
@@ -73,6 +74,7 @@ export const extractFileMetadata = internalAction({
     storageId: v.id('_storage'),
     fileName: v.string(),
     contentType: v.string(),
+    organizationId: v.string(),
     attempt: v.optional(v.number()),
   },
   returns: v.null(),
@@ -117,12 +119,14 @@ export const extractFileMetadata = internalAction({
         const fileBlob = await fileResponse.blob();
         const crawlerUrl = getCrawlerUrl();
         const endpoint = `${crawlerUrl}/api/v1/${ext}/extract-metadata`;
+        const orgSlug = await orgSlugFromId(ctx, args.organizationId);
 
         const formData = new FormData();
         formData.append('file', fileBlob, args.fileName);
 
         const metadataResponse = await fetch(endpoint, {
           method: 'POST',
+          headers: { 'x-tale-org': orgSlug },
           body: formData,
           signal: AbortSignal.timeout(30_000),
         });
@@ -196,6 +200,7 @@ export const extractFileMetadata = internalAction({
               storageId: args.storageId,
               fileName: args.fileName,
               contentType: args.contentType,
+              organizationId: args.organizationId,
               attempt: attempt + 1,
             },
           );
diff --git a/services/platform/convex/file_metadata/internal_mutations.ts b/services/platform/convex/file_metadata/internal_mutations.ts
index 96f162636e..42c8f250fd 100644
--- a/services/platform/convex/file_metadata/internal_mutations.ts
+++ b/services/platform/convex/file_metadata/internal_mutations.ts
@@ -130,6 +130,7 @@ export const saveFileMetadata = internalMutation({
         storageId: args.storageId,
         fileName: args.fileName,
         contentType: args.contentType,
+        organizationId: args.organizationId,
       },
     );
 
diff --git a/services/platform/convex/file_metadata/mutations.ts b/services/platform/convex/file_metadata/mutations.ts
index dae28fb083..1c8d9a8167 100644
--- a/services/platform/convex/file_metadata/mutations.ts
+++ b/services/platform/convex/file_metadata/mutations.ts
@@ -197,6 +197,7 @@ export const saveFileMetadata = mutation({
         storageId: args.storageId,
         fileName: args.fileName,
         contentType: args.contentType,
+        organizationId: args.organizationId,
       },
     );
 
diff --git a/services/platform/convex/lib/errors/__tests__/upstream_http_error.test.ts b/services/platform/convex/lib/errors/__tests__/upstream_http_error.test.ts
new file mode 100644
index 0000000000..1e784fb7f1
--- /dev/null
+++ b/services/platform/convex/lib/errors/__tests__/upstream_http_error.test.ts
@@ -0,0 +1,95 @@
+import { describe, expect, it } from 'vitest';
+
+import {
+  isRetryableStatus,
+  isUpstreamHttpError,
+  UpstreamHttpError,
+} from '../upstream_http_error';
+
+function makeResponse(status: number): Response {
+  // Minimal Response stand-in — UpstreamHttpError.fromResponse only reads `.status`.
+  return new Response(null, { status });
+}
+
+describe('UpstreamHttpError', () => {
+  it('scrubs Bearer tokens and sk- API keys from body snippet', () => {
+    const body =
+      'Upstream complained: Authorization: Bearer sk-abcdefgh1234567890ABCDEF';
+    const err = UpstreamHttpError.fromResponse(
+      'rag',
+      makeResponse(500),
+      body,
+      '/api/v1/search',
+    );
+    expect(err.bodySnippet).not.toMatch(/sk-abcdefgh/);
+    expect(err.bodySnippet).not.toMatch(/Bearer\s+sk-/);
+    expect(err.bodySnippet).toMatch(/REDACTED/);
+    // Engineer-facing message still embeds the (now-scrubbed) snippet for triage.
+    expect(err.message).toMatch(/REDACTED/);
+    // Safe message is clean of any body content.
+    expect(err.safeMessage).not.toMatch(/REDACTED/);
+    expect(err.safeMessage).toMatch(/RAG/);
+  });
+
+  it('truncates very long bodies to ~400 chars', () => {
+    const body = 'X'.repeat(2000);
+    const err = UpstreamHttpError.fromResponse(
+      'rag',
+      makeResponse(500),
+      body,
+      '/api/v1/search',
+    );
+    expect(err.bodySnippet.length).toBeLessThanOrEqual(401); // 400 + ellipsis
+  });
+
+  it('marks 5xx / 408 / 429 as retryable; 4xx (other) as not', () => {
+    expect(isRetryableStatus(500)).toBe(true);
+    expect(isRetryableStatus(503)).toBe(true);
+    expect(isRetryableStatus(429)).toBe(true);
+    expect(isRetryableStatus(408)).toBe(true);
+    expect(isRetryableStatus(400)).toBe(false);
+    expect(isRetryableStatus(401)).toBe(false);
+    expect(isRetryableStatus(404)).toBe(false);
+
+    const fiveHundred = UpstreamHttpError.fromResponse(
+      'crawler',
+      makeResponse(500),
+      'down',
+      '/api/v1/web/fetch-and-extract',
+    );
+    expect(fiveHundred.retryable).toBe(true);
+
+    const fourHundred = UpstreamHttpError.fromResponse(
+      'crawler',
+      makeResponse(400),
+      'bad request',
+      '/api/v1/urls/discover',
+    );
+    expect(fourHundred.retryable).toBe(false);
+  });
+
+  it('safe message includes service, endpoint, and status', () => {
+    const err = UpstreamHttpError.fromResponse(
+      'crawler',
+      makeResponse(503),
+      '',
+      '/api/v1/web/fetch-and-extract',
+    );
+    expect(err.safeMessage).toContain('CRAWLER');
+    expect(err.safeMessage).toContain('/api/v1/web/fetch-and-extract');
+    expect(err.safeMessage).toContain('503');
+  });
+
+  it('isUpstreamHttpError narrows correctly', () => {
+    const err = UpstreamHttpError.fromResponse(
+      'rag',
+      makeResponse(500),
+      '',
+      '/x',
+    );
+    expect(isUpstreamHttpError(err)).toBe(true);
+    expect(isUpstreamHttpError(new Error('other'))).toBe(false);
+    expect(isUpstreamHttpError(null)).toBe(false);
+    expect(isUpstreamHttpError('string')).toBe(false);
+  });
+});
diff --git a/services/platform/convex/lib/errors/upstream_http_error.ts b/services/platform/convex/lib/errors/upstream_http_error.ts
new file mode 100644
index 0000000000..6157044622
--- /dev/null
+++ b/services/platform/convex/lib/errors/upstream_http_error.ts
@@ -0,0 +1,108 @@
+/**
+ * Typed wrapper for non-2xx HTTP responses from upstream services
+ * (RAG, Crawler). Centralizes:
+ *
+ * - Body truncation + secret scrubbing (via `sanitizeError`) so raw
+ *   provider errors with embedded API keys, filenames, or stack
+ *   fragments never reach a thrown Error message.
+ * - `retryable` flag derived from status, so callers can decide
+ *   without re-parsing the message.
+ * - A `safeMessage` field with a user-presentable one-liner that
+ *   omits the body snippet entirely; UI surfaces should prefer this.
+ *
+ * Use the static factory `UpstreamHttpError.fromResponse(...)`; raw
+ * `new UpstreamHttpError({...})` is reserved for tests.
+ */
+
+import { sanitizeError } from '../utils/sanitize_secrets';
+
+export type UpstreamService = 'rag' | 'crawler';
+
+const BODY_SNIPPET_MAX = 400;
+
+export interface UpstreamErrorInit {
+  service: UpstreamService;
+  status: number;
+  endpoint: string;
+  bodySnippet: string;
+  retryable: boolean;
+  safeMessage: string;
+}
+
+/** Status codes the platform should retry on (transient upstream). */
+export function isRetryableStatus(status: number): boolean {
+  return status === 408 || status === 429 || (status >= 500 && status < 600);
+}
+
+function safeMessageFor(
+  service: UpstreamService,
+  status: number,
+  endpoint: string,
+): string {
+  // User-facing summary: never includes body, never includes secrets.
+  // Operators get the full picture from logs + the thrown Error message.
+  const where = `${service.toUpperCase()} ${endpoint}`;
+  if (status === 401 || status === 403) {
+    return `${where} authentication failed (HTTP ${status}).`;
+  }
+  if (status === 404) {
+    return `${where} returned not found (HTTP 404).`;
+  }
+  if (status === 408 || status === 429) {
+    return `${where} is throttling (HTTP ${status}); retry shortly.`;
+  }
+  if (status >= 500) {
+    return `${where} is unavailable (HTTP ${status}); retry shortly.`;
+  }
+  return `${where} returned HTTP ${status}.`;
+}
+
+export class UpstreamHttpError extends Error {
+  readonly service: UpstreamService;
+  readonly status: number;
+  readonly endpoint: string;
+  readonly bodySnippet: string;
+  readonly retryable: boolean;
+  readonly safeMessage: string;
+
+  constructor(init: UpstreamErrorInit) {
+    // Engineer-facing message: includes the scrubbed snippet for log
+    // triage. UI code MUST read `.safeMessage` instead of `.message`
+    // to keep this snippet out of user-visible surfaces.
+    const snippet = init.bodySnippet ? ` — ${init.bodySnippet}` : '';
+    super(`${init.safeMessage}${snippet}`);
+    this.name = 'UpstreamHttpError';
+    this.service = init.service;
+    this.status = init.status;
+    this.endpoint = init.endpoint;
+    this.bodySnippet = init.bodySnippet;
+    this.retryable = init.retryable;
+    this.safeMessage = init.safeMessage;
+  }
+
+  /**
+   * Build an UpstreamHttpError from a non-2xx Response and its already-read
+   * body text. Callers should always `await response.text()` first (don't
+   * pass the unread Response — single-use body).
+   */
+  static fromResponse(
+    service: UpstreamService,
+    response: Response,
+    bodyText: string,
+    endpoint: string,
+  ): UpstreamHttpError {
+    return new UpstreamHttpError({
+      service,
+      status: response.status,
+      endpoint,
+      bodySnippet: sanitizeError(bodyText, BODY_SNIPPET_MAX),
+      retryable: isRetryableStatus(response.status),
+      safeMessage: safeMessageFor(service, response.status, endpoint),
+    });
+  }
+}
+
+/** Narrow `unknown` to UpstreamHttpError for catch-block branching. */
+export function isUpstreamHttpError(err: unknown): err is UpstreamHttpError {
+  return err instanceof UpstreamHttpError;
+}
diff --git a/services/platform/convex/websites/actions.ts b/services/platform/convex/websites/actions.ts
index 67ba2dea19..c4aac91d0d 100644
--- a/services/platform/convex/websites/actions.ts
+++ b/services/platform/convex/websites/actions.ts
@@ -4,6 +4,7 @@ import { internal } from '../_generated/api';
 import type { Id } from '../_generated/dataModel';
 import { action } from '../_generated/server';
 import { authComponent } from '../auth';
+import { orgSlugFromId } from '../lib/helpers/org_slug';
 import { toWebsiteDomain } from './create_website';
 import {
   deregisterDomainFromCrawler,
@@ -56,7 +57,12 @@ export const createWebsite = action({
     await ctx.scheduler.runAfter(
       0,
       internal.websites.internal_actions.registerAndSync,
-      { websiteId, domain, scanInterval: args.scanInterval },
+      {
+        websiteId,
+        domain,
+        scanInterval: args.scanInterval,
+        organizationId: args.organizationId,
+      },
     );
 
     return websiteId;
@@ -88,8 +94,9 @@ export const deleteWebsite = action({
       },
     );
 
+    const orgSlug = await orgSlugFromId(ctx, website.organizationId);
     // Deregister from crawler first — if this fails, the user can retry
-    await deregisterDomainFromCrawler(website.domain);
+    await deregisterDomainFromCrawler(orgSlug, website.domain);
 
     await ctx.runMutation(internal.websites.internal_mutations.deleteWebsite, {
       websiteId: args.websiteId,
@@ -130,8 +137,13 @@ export const updateWebsite = action({
 
     // Sync scan interval to crawler
     if (args.scanInterval && args.scanInterval !== website.scanInterval) {
+      const orgSlug = await orgSlugFromId(ctx, website.organizationId);
       try {
-        await updateCrawlerScanInterval(website.domain, args.scanInterval);
+        await updateCrawlerScanInterval(
+          orgSlug,
+          website.domain,
+          args.scanInterval,
+        );
       } catch (error) {
         if (
           error instanceof Error &&
@@ -236,13 +248,18 @@ export const fetchPages = action({
     await ctx.scheduler.runAfter(
       0,
       internal.websites.internal_actions.syncSingleWebsite,
-      { websiteId: args.websiteId, domain: website.domain },
+      {
+        websiteId: args.websiteId,
+        domain: website.domain,
+        organizationId: website.organizationId,
+      },
     );
 
     return await ctx.runAction(
       internal.websites.internal_actions.fetchWebsitePages,
       {
         domain: website.domain,
+        organizationId: website.organizationId,
         offset: args.offset,
         limit: args.limit,
       },
@@ -267,7 +284,11 @@ export const fetchChunks = action({
 
     return await ctx.runAction(
       internal.websites.internal_actions.fetchPageChunks,
-      { domain: website.domain, url: args.url },
+      {
+        domain: website.domain,
+        url: args.url,
+        organizationId: website.organizationId,
+      },
     );
   },
 });
@@ -290,7 +311,12 @@ export const searchContent = action({
 
     return await ctx.runAction(
       internal.websites.internal_actions.searchWebsiteContent,
-      { domain: website.domain, query: args.query, limit: args.limit },
+      {
+        domain: website.domain,
+        query: args.query,
+        organizationId: website.organizationId,
+        limit: args.limit,
+      },
     );
   },
 });
diff --git a/services/platform/convex/websites/internal_actions.ts b/services/platform/convex/websites/internal_actions.ts
index 7f0ea485d5..7b34b031b8 100644
--- a/services/platform/convex/websites/internal_actions.ts
+++ b/services/platform/convex/websites/internal_actions.ts
@@ -4,6 +4,7 @@ import { internal } from '../_generated/api';
 import type { Id } from '../_generated/dataModel';
 import { internalAction } from '../_generated/server';
 import { getCrawlerUrl } from '../documents/generate_document_helpers';
+import { orgSlugFromId } from '../lib/helpers/org_slug';
 import type {
   CrawlerChunksResponse,
   CrawlerPagesResponse,
@@ -14,16 +15,27 @@ import type {
 const CRAWLER_TIMEOUT_MS = 15_000;
 const SYNC_INTERVAL_MS = 60 * 60 * 1000; // 1 hour
 
+/**
+ * Wrap `fetch` with a timeout and inject the required `x-tale-org`
+ * header so every call to the crawler service routes to the correct
+ * org's provider catalog. Crawler enforces this header at the router
+ * level (`require_org_slug`); missing it returns HTTP 400.
+ */
 function fetchWithTimeout(
   url: string,
+  orgSlug: string,
   init?: RequestInit,
   timeoutMs = CRAWLER_TIMEOUT_MS,
 ): Promise<Response> {
   const controller = new AbortController();
   const timer = setTimeout(() => controller.abort(), timeoutMs);
-  return fetch(url, { ...init, signal: controller.signal }).finally(() =>
-    clearTimeout(timer),
-  );
+  const mergedHeaders = new Headers(init?.headers);
+  mergedHeaders.set('x-tale-org', orgSlug);
+  return fetch(url, {
+    ...init,
+    headers: mergedHeaders,
+    signal: controller.signal,
+  }).finally(() => clearTimeout(timer));
 }
 
 export function scanIntervalToSeconds(interval: string): number {
@@ -48,12 +60,14 @@ export function scanIntervalToSeconds(interval: string): number {
 }
 
 export async function registerDomainWithCrawler(
+  orgSlug: string,
   domain: string,
   scanInterval: string,
 ): Promise<CrawlerWebsiteInfo> {
   const crawlerUrl = getCrawlerUrl();
   const res = await fetchWithTimeout(
     `${crawlerUrl}/api/v1/websites`,
+    orgSlug,
     {
       method: 'POST',
       headers: { 'Content-Type': 'application/json' },
@@ -73,12 +87,14 @@ export async function registerDomainWithCrawler(
 }
 
 export async function updateCrawlerScanInterval(
+  orgSlug: string,
   domain: string,
   scanInterval: string,
 ): Promise<void> {
   const crawlerUrl = getCrawlerUrl();
   const res = await fetchWithTimeout(
     `${crawlerUrl}/api/v1/websites/${encodeURIComponent(domain)}`,
+    orgSlug,
     {
       method: 'PATCH',
       headers: { 'Content-Type': 'application/json' },
@@ -98,11 +114,13 @@ export async function updateCrawlerScanInterval(
 }
 
 export async function deregisterDomainFromCrawler(
+  orgSlug: string,
   domain: string,
 ): Promise<void> {
   const crawlerUrl = getCrawlerUrl();
   const res = await fetchWithTimeout(
     `${crawlerUrl}/api/v1/websites/${encodeURIComponent(domain)}`,
+    orgSlug,
     { method: 'DELETE' },
   );
   if (!res.ok && res.status !== 404) {
@@ -113,11 +131,13 @@ export async function deregisterDomainFromCrawler(
 }
 
 export async function fetchWebsiteInfo(
+  orgSlug: string,
   domain: string,
 ): Promise<CrawlerWebsiteInfo | null> {
   const crawlerUrl = getCrawlerUrl();
   const res = await fetchWithTimeout(
     `${crawlerUrl}/api/v1/websites/${encodeURIComponent(domain)}`,
+    orgSlug,
   );
   if (res.ok) {
     return await res.json();
@@ -136,11 +156,13 @@ interface WebsiteForSync {
 }
 
 async function fetchHomepageMetadata(
+  orgSlug: string,
   domain: string,
 ): Promise<{ title?: string; description?: string } | null> {
   const crawlerUrl = getCrawlerUrl();
   const res = await fetchWithTimeout(
     `${crawlerUrl}/api/v1/urls/fetch`,
+    orgSlug,
     {
       method: 'POST',
       headers: { 'Content-Type': 'application/json' },
@@ -170,9 +192,11 @@ export const fetchAndPatchHomepage = internalAction({
   args: {
     websiteId: v.id('websites'),
     domain: v.string(),
+    organizationId: v.string(),
   },
   handler: async (ctx, args): Promise<void> => {
-    const info = await fetchHomepageMetadata(args.domain);
+    const orgSlug = await orgSlugFromId(ctx, args.organizationId);
+    const info = await fetchHomepageMetadata(orgSlug, args.domain);
     if (!info) return;
 
     await ctx.runMutation(internal.websites.internal_mutations.patchWebsite, {
@@ -188,6 +212,7 @@ export const syncWebsiteStatuses = internalAction({
     organizationId: v.string(),
   },
   handler: async (ctx, args): Promise<void> => {
+    const orgSlug = await orgSlugFromId(ctx, args.organizationId);
     const websites: WebsiteForSync[] = await ctx.runQuery(
       internal.websites.internal_queries.listWebsitesForSync,
       { organizationId: args.organizationId },
@@ -202,7 +227,7 @@ export const syncWebsiteStatuses = internalAction({
       }
 
       try {
-        const websiteInfo = await fetchWebsiteInfo(website.domain);
+        const websiteInfo = await fetchWebsiteInfo(orgSlug, website.domain);
 
         if (websiteInfo) {
           await ctx.runMutation(
@@ -264,10 +289,12 @@ export const registerAndSync = internalAction({
     websiteId: v.id('websites'),
     domain: v.string(),
     scanInterval: v.string(),
+    organizationId: v.string(),
   },
   handler: async (ctx, args): Promise<void> => {
+    const orgSlug = await orgSlugFromId(ctx, args.organizationId);
     try {
-      await registerDomainWithCrawler(args.domain, args.scanInterval);
+      await registerDomainWithCrawler(orgSlug, args.domain, args.scanInterval);
     } catch (error) {
       const message = error instanceof Error ? error.message : String(error);
       console.error(
@@ -286,14 +313,22 @@ export const registerAndSync = internalAction({
     await ctx.scheduler.runAfter(
       0,
       internal.websites.internal_actions.fetchAndPatchHomepage,
-      { websiteId: args.websiteId, domain: args.domain },
+      {
+        websiteId: args.websiteId,
+        domain: args.domain,
+        organizationId: args.organizationId,
+      },
     );
 
     // Schedule a delayed sync to pick up scan results
     await ctx.scheduler.runAfter(
       600_000,
       internal.websites.internal_actions.syncSingleWebsite,
-      { websiteId: args.websiteId, domain: args.domain },
+      {
+        websiteId: args.websiteId,
+        domain: args.domain,
+        organizationId: args.organizationId,
+      },
     );
   },
 });
@@ -302,8 +337,10 @@ export const syncSingleWebsite = internalAction({
   args: {
     websiteId: v.id('websites'),
     domain: v.string(),
+    organizationId: v.string(),
   },
   handler: async (ctx, args): Promise<void> => {
+    const orgSlug = await orgSlugFromId(ctx, args.organizationId);
     const website = await ctx.runQuery(
       internal.websites.internal_queries.getWebsite,
       { websiteId: args.websiteId },
@@ -311,7 +348,7 @@ export const syncSingleWebsite = internalAction({
     if (!website) return;
 
     try {
-      const info = await fetchWebsiteInfo(args.domain);
+      const info = await fetchWebsiteInfo(orgSlug, args.domain);
 
       if (info) {
         await ctx.runMutation(
@@ -366,16 +403,19 @@ export const syncSingleWebsite = internalAction({
 export const fetchWebsitePages = internalAction({
   args: {
     domain: v.string(),
+    organizationId: v.string(),
     offset: v.optional(v.number()),
     limit: v.optional(v.number()),
   },
-  handler: async (_ctx, args) => {
+  handler: async (ctx, args) => {
+    const orgSlug = await orgSlugFromId(ctx, args.organizationId);
     const crawlerUrl = getCrawlerUrl();
     const offset = args.offset ?? 0;
     const limit = args.limit ?? 100;
 
     const res = await fetchWithTimeout(
       `${crawlerUrl}/api/v1/pages/${encodeURIComponent(args.domain)}?offset=${offset}&limit=${limit}`,
+      orgSlug,
     );
 
     if (!res.ok) {
@@ -396,12 +436,15 @@ export const fetchPageChunks = internalAction({
   args: {
     domain: v.string(),
     url: v.string(),
+    organizationId: v.string(),
   },
-  handler: async (_ctx, args) => {
+  handler: async (ctx, args) => {
+    const orgSlug = await orgSlugFromId(ctx, args.organizationId);
     const crawlerUrl = getCrawlerUrl();
 
     const res = await fetchWithTimeout(
       `${crawlerUrl}/api/v1/pages/${encodeURIComponent(args.domain)}/chunks?url=${encodeURIComponent(args.url)}`,
+      orgSlug,
     );
 
     if (!res.ok) {
@@ -421,14 +464,17 @@ export const searchWebsiteContent = internalAction({
   args: {
     domain: v.string(),
     query: v.string(),
+    organizationId: v.string(),
     limit: v.optional(v.number()),
   },
-  handler: async (_ctx, args) => {
+  handler: async (ctx, args) => {
+    const orgSlug = await orgSlugFromId(ctx, args.organizationId);
     const crawlerUrl = getCrawlerUrl();
     const limit = args.limit ?? 10;
 
     const res = await fetchWithTimeout(
       `${crawlerUrl}/api/v1/search/${encodeURIComponent(args.domain)}`,
+      orgSlug,
       {
         method: 'POST',
         headers: { 'Content-Type': 'application/json' },
diff --git a/services/platform/convex/websites/rest_api.ts b/services/platform/convex/websites/rest_api.ts
index b7c34b24b3..2c3a1a959a 100644
--- a/services/platform/convex/websites/rest_api.ts
+++ b/services/platform/convex/websites/rest_api.ts
@@ -76,6 +76,7 @@ export const createWebsite = withRestAuth('rest:api', async (rc, request) => {
     websiteId,
     domain,
     scanInterval: body.scanInterval,
+    organizationId: rc.org.organizationId,
   });
 
   return jsonCreated({ id: websiteId });
@@ -127,6 +128,7 @@ export const getWebsite = withRestAuth('rest:api', async (rc, request) => {
       internal.websites.internal_actions.fetchWebsitePages,
       {
         domain: website.domain,
+        organizationId: website.organizationId,
         offset,
         limit,
       },
@@ -231,6 +233,7 @@ export const websiteSubActions = withRestAuth(
         internal.websites.internal_actions.fetchWebsitePages,
         {
           domain: website.domain,
+          organizationId: website.organizationId,
           offset,
           limit,
         },
@@ -287,6 +290,7 @@ export const websitePostActions = withRestAuth(
         {
           domain: website.domain,
           query: body.query,
+          organizationId: website.organizationId,
           limit: body.limit,
         },
       );
diff --git a/services/platform/convex/workflow_engine/action_defs/crawler/crawler_action.ts b/services/platform/convex/workflow_engine/action_defs/crawler/crawler_action.ts
index f153033682..78d6bc7db3 100644
--- a/services/platform/convex/workflow_engine/action_defs/crawler/crawler_action.ts
+++ b/services/platform/convex/workflow_engine/action_defs/crawler/crawler_action.ts
@@ -1,6 +1,7 @@
 import { v } from 'convex/values';
 
 import { createDebugLog } from '../../../lib/debug_log';
+import { orgSlugFromId } from '../../../lib/helpers/org_slug';
 import type { ActionDefinition } from '../../helpers/nodes/action/types';
 import type {
   CrawlerActionParams,
@@ -51,17 +52,28 @@ export const crawlerAction: ActionDefinition<CrawlerActionParams> = {
     }),
   ),
 
-  async execute(_ctx, params) {
+  async execute(ctx, params, variables) {
     const serviceUrl = process.env.CRAWLER_URL || 'http://localhost:8002';
     const timeout = params.timeout || 1800000;
 
+    const organizationId =
+      typeof variables.organizationId === 'string'
+        ? variables.organizationId
+        : undefined;
+    if (!organizationId) {
+      throw new Error(
+        'crawler action requires organizationId in workflow _variables.',
+      );
+    }
+    const orgSlug = await orgSlugFromId(ctx, organizationId);
+
     switch (params.operation) {
       case 'discover_urls':
-        return await discoverUrls(params, serviceUrl, timeout);
+        return await discoverUrls(params, serviceUrl, orgSlug, timeout);
       case 'fetch_urls':
-        return await fetchUrls(params, serviceUrl, timeout);
+        return await fetchUrls(params, serviceUrl, orgSlug, timeout);
       case 'query_urls':
-        return await queryUrls(params, serviceUrl, timeout);
+        return await queryUrls(params, serviceUrl, orgSlug, timeout);
       default:
         throw new Error(
           `Unknown crawler operation: ${(params as { operation: string }).operation}`,
@@ -88,6 +100,7 @@ type QueryUrlsParams = Extract<
 async function discoverUrls(
   params: DiscoverUrlsParams,
   serviceUrl: string,
+  orgSlug: string,
   timeout: number,
 ): Promise<DiscoverUrlsResult> {
   let domain = params.domain;
@@ -119,6 +132,7 @@ async function discoverUrls(
     method: 'POST',
     headers: {
       'Content-Type': 'application/json',
+      'x-tale-org': orgSlug,
     },
     body: JSON.stringify(payload),
     signal: controller.signal,
@@ -158,6 +172,7 @@ async function discoverUrls(
 async function fetchUrls(
   params: FetchUrlsParams,
   serviceUrl: string,
+  orgSlug: string,
   timeout: number,
 ): Promise<FetchUrlsResult> {
   const payload = {
@@ -175,6 +190,7 @@ async function fetchUrls(
     method: 'POST',
     headers: {
       'Content-Type': 'application/json',
+      'x-tale-org': orgSlug,
     },
     body: JSON.stringify(payload),
     signal: controller.signal,
@@ -206,6 +222,7 @@ async function fetchUrls(
 async function queryUrls(
   params: QueryUrlsParams,
   serviceUrl: string,
+  orgSlug: string,
   timeout: number,
 ): Promise<QueryUrlsResult> {
   const searchParams = new URLSearchParams();
@@ -224,7 +241,10 @@ async function queryUrls(
 
   const response = await fetch(
     `${serviceUrl}/api/v1/websites/${encodeURIComponent(params.domain)}/urls?${searchParams}`,
-    { signal: controller.signal },
+    {
+      headers: { 'x-tale-org': orgSlug },
+      signal: controller.signal,
+    },
   );
 
   clearTimeout(timeoutId);
diff --git a/services/platform/convex/workflow_engine/action_defs/document/document_action.ts b/services/platform/convex/workflow_engine/action_defs/document/document_action.ts
index 9c49f699e8..42c567c370 100644
--- a/services/platform/convex/workflow_engine/action_defs/document/document_action.ts
+++ b/services/platform/convex/workflow_engine/action_defs/document/document_action.ts
@@ -582,7 +582,16 @@ export const documentAction: ActionDefinition<DocumentActionParams> = {
       }
 
       case 'extract_docx_structured': {
-        return await extractDocxStructured(ctx, params.fileId);
+        const organizationId =
+          typeof _variables.organizationId === 'string'
+            ? _variables.organizationId
+            : undefined;
+        if (!organizationId) {
+          throw new Error(
+            'extract_docx_structured requires organizationId in workflow _variables.',
+          );
+        }
+        return await extractDocxStructured(ctx, params.fileId, organizationId);
       }
 
       case 'apply_docx_structured': {
@@ -590,6 +599,11 @@ export const documentAction: ActionDefinition<DocumentActionParams> = {
           typeof _variables.organizationId === 'string'
             ? _variables.organizationId
             : undefined;
+        if (!organizationId) {
+          throw new Error(
+            'apply_docx_structured requires organizationId in workflow _variables.',
+          );
+        }
 
         return await applyDocxStructured(ctx, {
           templateFileId: params.templateFileId,
diff --git a/services/platform/convex/workflow_engine/action_defs/document/helpers/apply_docx_structured.ts b/services/platform/convex/workflow_engine/action_defs/document/helpers/apply_docx_structured.ts
index c4dddcf758..3ea45ce39f 100644
--- a/services/platform/convex/workflow_engine/action_defs/document/helpers/apply_docx_structured.ts
+++ b/services/platform/convex/workflow_engine/action_defs/document/helpers/apply_docx_structured.ts
@@ -19,6 +19,7 @@ import {
   getCrawlerUrl,
 } from '../../../../documents/generate_document_helpers';
 import { createDebugLog } from '../../../../lib/debug_log';
+import { orgSlugFromId } from '../../../../lib/helpers/org_slug';
 import { toId } from '../../../../lib/type_cast_helpers';
 
 const debugLog = createDebugLog('DEBUG_DOCUMENTS', '[Documents]');
@@ -53,9 +54,9 @@ export interface ApplyDocxStructuredArgs {
   sourceHash: string;
   modifications: Modification[];
   fileName: string;
+  organizationId: string;
   trackChanges?: boolean;
   author?: string;
-  organizationId?: string;
 }
 
 export interface ApplyDocxStructuredResult {
@@ -116,8 +117,11 @@ export async function applyDocxStructured(
   const controller = new AbortController();
   const timeoutId = setTimeout(() => controller.abort(), 300_000);
 
+  const orgSlug = await orgSlugFromId(ctx, args.organizationId);
+
   const response = await fetch(apiUrl, {
     method: 'POST',
+    headers: { 'x-tale-org': orgSlug },
     body: formData,
     signal: controller.signal,
   });
@@ -162,20 +166,18 @@ export async function applyDocxStructured(
     ? args.fileName
     : `${args.fileName}.docx`;
 
-  // Save file metadata if organizationId is available
-  if (args.organizationId) {
-    await ctx.runMutation(
-      internal.file_metadata.internal_mutations.saveFileMetadata,
-      {
-        organizationId: args.organizationId,
-        storageId,
-        fileName: finalFileName,
-        contentType: DOCX_CONTENT_TYPE,
-        size: docxBytes.length,
-        source: 'agent',
-      },
-    );
-  }
+  // Save file metadata so the file shows up in the org's library.
+  await ctx.runMutation(
+    internal.file_metadata.internal_mutations.saveFileMetadata,
+    {
+      organizationId: args.organizationId,
+      storageId,
+      fileName: finalFileName,
+      contentType: DOCX_CONTENT_TYPE,
+      size: docxBytes.length,
+      source: 'agent',
+    },
+  );
 
   const downloadUrl = buildDownloadUrl(storageId, finalFileName);
 
diff --git a/services/platform/convex/workflow_engine/action_defs/document/helpers/extract_docx_structured.ts b/services/platform/convex/workflow_engine/action_defs/document/helpers/extract_docx_structured.ts
index a057e9bffc..4a7d1fc5b1 100644
--- a/services/platform/convex/workflow_engine/action_defs/document/helpers/extract_docx_structured.ts
+++ b/services/platform/convex/workflow_engine/action_defs/document/helpers/extract_docx_structured.ts
@@ -11,6 +11,7 @@ import { fetchJson } from '../../../../../lib/utils/type-cast-helpers';
 import type { ActionCtx } from '../../../../_generated/server';
 import { getCrawlerUrl } from '../../../../documents/generate_document_helpers';
 import { createDebugLog } from '../../../../lib/debug_log';
+import { orgSlugFromId } from '../../../../lib/helpers/org_slug';
 import { toId } from '../../../../lib/type_cast_helpers';
 
 const debugLog = createDebugLog('DEBUG_DOCUMENTS', '[Documents]');
@@ -38,9 +39,11 @@ export interface ExtractDocxStructuredResult {
 export async function extractDocxStructured(
   ctx: ActionCtx,
   fileId: string,
+  organizationId: string,
 ): Promise<ExtractDocxStructuredResult> {
   const crawlerUrl = getCrawlerUrl();
   const apiUrl = `${crawlerUrl}/api/v1/docx/extract-structured`;
+  const orgSlug = await orgSlugFromId(ctx, organizationId);
 
   debugLog('extractDocxStructured start', { fileId });
 
@@ -59,6 +62,7 @@ export async function extractDocxStructured(
 
   const response = await fetch(apiUrl, {
     method: 'POST',
+    headers: { 'x-tale-org': orgSlug },
     body: formData,
     signal: controller.signal,
   });
diff --git a/services/platform/convex/workflow_engine/action_defs/rag/helpers/delete_document.ts b/services/platform/convex/workflow_engine/action_defs/rag/helpers/delete_document.ts
index b71a449288..29d033c86b 100644
--- a/services/platform/convex/workflow_engine/action_defs/rag/helpers/delete_document.ts
+++ b/services/platform/convex/workflow_engine/action_defs/rag/helpers/delete_document.ts
@@ -8,6 +8,7 @@ import {
   isRecord,
 } from '../../../../../lib/utils/type-guards';
 import { internalAction } from '../../../../_generated/server';
+import { UpstreamHttpError } from '../../../../lib/errors/upstream_http_error';
 import { ragFetch } from '../../../../lib/helpers/rag_config';
 import type { RagDeleteResult } from './types';
 
@@ -54,7 +55,12 @@ export async function deleteDocumentById({
 
     if (!response.ok) {
       const errorText = await response.text();
-      throw new Error(`RAG service error: ${response.status} ${errorText}`);
+      throw UpstreamHttpError.fromResponse(
+        'rag',
+        response,
+        errorText,
+        `/api/v1/documents/${fileId}`,
+      );
     }
 
     const rawResult: unknown = await response.json();
diff --git a/services/platform/convex/workflow_engine/action_defs/rag/helpers/upload_file_direct.test.ts b/services/platform/convex/workflow_engine/action_defs/rag/helpers/upload_file_direct.test.ts
index 6124589290..25473ca191 100644
--- a/services/platform/convex/workflow_engine/action_defs/rag/helpers/upload_file_direct.test.ts
+++ b/services/platform/convex/workflow_engine/action_defs/rag/helpers/upload_file_direct.test.ts
@@ -141,20 +141,33 @@ describe('uploadFile', () => {
     expect(calledUrl).toBe(`${RAG_URL}/api/v1/documents/upload`);
   });
 
-  it('throws on non-ok response with status info', async () => {
+  it('throws UpstreamHttpError with sanitized body snippet on non-ok response', async () => {
     mockFetchError(500, 'Internal Server Error', 'something broke');
 
-    await expect(uploadFile(defaultArgs())).rejects.toThrow(
-      'RAG service error: 500 Internal Server Error - something broke',
+    const err = await uploadFile(defaultArgs()).then(
+      () => null,
+      (e: unknown) => e,
     );
+    expect(err).toBeInstanceOf(Error);
+    expect((err as Error).name).toBe('UpstreamHttpError');
+    // Engineer-facing .message embeds the safe-summary + sanitized body.
+    expect((err as Error).message).toMatch(/HTTP 500/);
+    expect((err as Error).message).toMatch(/something broke/);
+    // Retryable for 5xx — caller can decide whether to bounce.
+    expect((err as { retryable?: boolean }).retryable).toBe(true);
   });
 
-  it('throws on non-ok response without body', async () => {
+  it('throws UpstreamHttpError on non-ok response with empty body', async () => {
     mockFetchError(502, 'Bad Gateway');
 
-    await expect(uploadFile(defaultArgs())).rejects.toThrow(
-      'RAG service error: 502 Bad Gateway',
+    const err = await uploadFile(defaultArgs()).then(
+      () => null,
+      (e: unknown) => e,
     );
+    expect(err).toBeInstanceOf(Error);
+    expect((err as Error).name).toBe('UpstreamHttpError');
+    expect((err as Error).message).toMatch(/HTTP 502/);
+    expect((err as { retryable?: boolean }).retryable).toBe(true);
   });
 
   it('returns correct RagUploadResult shape on success', async () => {
diff --git a/services/platform/convex/workflow_engine/action_defs/rag/helpers/upload_file_direct.ts b/services/platform/convex/workflow_engine/action_defs/rag/helpers/upload_file_direct.ts
index d3d0b5b62c..3444b127fe 100644
--- a/services/platform/convex/workflow_engine/action_defs/rag/helpers/upload_file_direct.ts
+++ b/services/platform/convex/workflow_engine/action_defs/rag/helpers/upload_file_direct.ts
@@ -1,3 +1,4 @@
+import { UpstreamHttpError } from '../../../../lib/errors/upstream_http_error';
 import { ragFetch } from '../../../../lib/helpers/rag_config';
 import type { RagUploadResult } from './types';
 
@@ -61,9 +62,7 @@ export async function uploadFile({
 
   if (!response.ok) {
     const errorText = await response.text();
-    throw new Error(
-      `RAG service error: ${response.status} ${response.statusText}${errorText ? ` - ${errorText}` : ''}`,
-    );
+    throw UpstreamHttpError.fromResponse('rag', response, errorText, path);
   }
 
   // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- typed response
diff --git a/services/platform/convex/workflow_engine/action_defs/rag/rag_action.ts b/services/platform/convex/workflow_engine/action_defs/rag/rag_action.ts
index 7a41435aa7..d9a036d942 100644
--- a/services/platform/convex/workflow_engine/action_defs/rag/rag_action.ts
+++ b/services/platform/convex/workflow_engine/action_defs/rag/rag_action.ts
@@ -5,6 +5,7 @@ import { internal } from '../../../_generated/api';
 import type { ActionCtx } from '../../../_generated/server';
 import type { SearchResponse } from '../../../agent_tools/rag/format_search_results';
 import { fetchDocumentChunks } from '../../../agent_tools/rag/helpers/fetch_document_chunks';
+import { UpstreamHttpError } from '../../../lib/errors/upstream_http_error';
 import { orgSlugFromId } from '../../../lib/helpers/org_slug';
 import { ragFetch } from '../../../lib/helpers/rag_config';
 import { toId } from '../../../lib/type_cast_helpers';
@@ -124,8 +125,11 @@ export const ragAction: ActionDefinition<RagActionParams> = {
 
           if (!response.ok) {
             const errorText = await response.text().catch(() => '');
-            throw new Error(
-              `RAG search error (${response.status}): ${errorText || 'Unknown error'}`,
+            throw UpstreamHttpError.fromResponse(
+              'rag',
+              response,
+              errorText,
+              '/api/v1/search',
             );
           }
 
diff --git a/services/platform/lib/config-watcher.ts b/services/platform/lib/config-watcher.ts
index 1ee647ed3a..b9ac31862e 100644
--- a/services/platform/lib/config-watcher.ts
+++ b/services/platform/lib/config-watcher.ts
@@ -18,10 +18,25 @@ const ATOMIC_WRITE_TMP_RE = /\.\d+\.[a-f0-9]{8}\.tmp$/;
 // Must match validateOrgSlug at services/platform/convex/lib/file_io.ts.
 const ORG_SLUG_REGEX = /^[a-z0-9][a-z0-9_-]*$/;
 
+/**
+ * Tail-debounce window for SSE invalidations: events arriving within this
+ * window for the same (type, orgSlug, slug) key collapse to a single
+ * delivery. Bulk operations (org migrations, mass `git mv`) would
+ * otherwise fan out one SSE frame per file × per connected client.
+ */
+const EMIT_DEBOUNCE_MS = 100;
+
 /**
  * Parse a relative path within the config directory into a structured event,
  * under the uniform org-first layout `${TALE_CONFIG_DIR}/<orgSlug>/<domain>/...`.
  *
+ * Per-domain file filter (a write must match the domain's content shape;
+ * otherwise the event is dropped):
+ *   - agents / workflows / providers / branding / integrations: `.json` only
+ *   - skills: any file (`SKILL.md`, `scripts/*.py`, assets) — skill query
+ *     keys are invalidated at slug granularity, so any write under the slug
+ *     dir must emit.
+ *
  * Examples (with `default` as one possible orgSlug):
  *   default/agents/my-agent.json           → { type: 'agents', orgSlug: 'default', slug: 'my-agent' }
  *   acme/agents/my-agent.json              → { type: 'agents', orgSlug: 'acme', slug: 'my-agent' }
@@ -29,11 +44,16 @@ const ORG_SLUG_REGEX = /^[a-z0-9][a-z0-9_-]*$/;
  *   default/integrations/slack/config.json → { type: 'integrations', orgSlug: 'default', slug: 'slack' }
  *   default/branding/branding.json         → { type: 'branding', orgSlug: 'default' }
  *   default/skills/code-reviewer/SKILL.md  → { type: 'skills', orgSlug: 'default', slug: 'code-reviewer' }
+ *   default/skills/code-reviewer/scripts/x.py → { type: 'skills', orgSlug: 'default', slug: 'code-reviewer' }
  *
  * Returns null for paths that don't fit the `<org>/<domain>/<rest>` shape
- * (org slug must validate; domain must be recognized).
+ * (org slug must validate; domain must be recognized; per-domain filter must
+ * pass; secret sidecars dropped).
  */
 function parseConfigChange(relativePath: string): ConfigChangeEvent | null {
+  // Secret sidecars are written by operators only; never broadcast.
+  if (relativePath.endsWith('.secrets.json')) return null;
+
   const parts = relativePath.split('/');
   if (parts.length < 2) return null;
 
@@ -45,6 +65,7 @@ function parseConfigChange(relativePath: string): ConfigChangeEvent | null {
   if (domain === 'branding') {
     // Branding is default-only on the read side, but still emit per-org so
     // future per-org branding (or operator inspection) sees the event.
+    if (!relativePath.endsWith('.json')) return null;
     return { type: 'branding', orgSlug };
   }
 
@@ -63,24 +84,28 @@ function parseConfigChange(relativePath: string): ConfigChangeEvent | null {
   if (rest.length === 0) return null;
 
   if (type === 'agents') {
+    if (!relativePath.endsWith('.json')) return null;
     // <org>/agents/<name>.json
     const filename = rest[0];
     return { type, orgSlug, slug: filename.replace(/\.json$/, '') };
   }
 
   if (type === 'workflows') {
+    if (!relativePath.endsWith('.json')) return null;
     // <org>/workflows/[folder/]name.json — slug is the path without extension
     const slug = rest.join('/').replace(/\.json$/, '');
     return { type, orgSlug, slug };
   }
 
   if (type === 'integrations') {
+    if (!relativePath.endsWith('.json')) return null;
     // <org>/integrations/<slug>/config.json (or other bundle files)
     const slug = rest[0];
     return { type, orgSlug, slug };
   }
 
   if (type === 'providers') {
+    if (!relativePath.endsWith('.json')) return null;
     // <org>/providers/<name>.json
     const filename = rest[0];
     return { type, orgSlug, slug: filename.replace(/\.json$/, '') };
@@ -113,27 +138,42 @@ export function createConfigWatcher(configDir: string): ConfigWatcher {
     ],
   });
 
+  // Per-key tail debounce: collapses bursts of events for the same
+  // (type, orgSlug, slug) so a bulk operation (e.g. mass migration)
+  // doesn't fan out one SSE frame per file per connected client.
+  const pending = new Map<string, ReturnType<typeof setTimeout>>();
+
+  const emitDebounced = (event: ConfigChangeEvent) => {
+    const key = `${event.type}:${event.orgSlug ?? ''}:${event.slug ?? ''}`;
+    const existing = pending.get(key);
+    if (existing) clearTimeout(existing);
+    pending.set(
+      key,
+      setTimeout(() => {
+        pending.delete(key);
+        for (const cb of callbacks) {
+          cb(event);
+        }
+      }, EMIT_DEBOUNCE_MS),
+    );
+  };
+
   watcher.on('all', (_eventName, filePath) => {
     const rel = relative(configDir, filePath);
-
-    // Only react to JSON file changes; ignore secret sidecar files
-    if (!rel.endsWith('.json')) return;
-    if (rel.endsWith('.secrets.json')) return;
-
     const event = parseConfigChange(rel);
     if (!event) return;
-
-    for (const cb of callbacks) {
-      cb(event);
-    }
+    emitDebounced(event);
   });
 
   return {
     onChange(callback) {
       callbacks.push(callback);
     },
-    close() {
-      return watcher.close();
+    async close() {
+      // Drop any pending debounced events so we don't emit after close.
+      for (const t of pending.values()) clearTimeout(t);
+      pending.clear();
+      await watcher.close();
     },
   };
 }
diff --git a/services/platform/lib/shared/constants/reserved-org-slugs.ts b/services/platform/lib/shared/constants/reserved-org-slugs.ts
new file mode 100644
index 0000000000..e2a84342a6
--- /dev/null
+++ b/services/platform/lib/shared/constants/reserved-org-slugs.ts
@@ -0,0 +1,19 @@
+/**
+ * Org slugs that the platform reserves and refuses to assign to
+ * user-created organizations.
+ *
+ * `default` is reserved because the platform pins several global
+ * resources to it (branding, retention defaults, scaffold seed
+ * target). If a user could claim that slug they'd inherit those
+ * globals, including the ability to mutate platform branding via
+ * `isCallerAdmin` (see `convex/branding/internal_queries.ts`).
+ *
+ * Importable from both Convex (`convex/auth.ts`) and the React
+ * organization form — kept in `lib/shared/constants/` so it stays
+ * Node-runtime-neutral.
+ */
+export const RESERVED_ORG_SLUGS: ReadonlySet<string> = new Set(['default']);
+
+export function isReservedOrgSlug(slug: string): boolean {
+  return RESERVED_ORG_SLUGS.has(slug.toLowerCase());
+}
diff --git a/services/rag/app/models.py b/services/rag/app/models.py
index fa021223ef..a824f1e4a3 100644
--- a/services/rag/app/models.py
+++ b/services/rag/app/models.py
@@ -19,13 +19,16 @@ class HealthResponse(BaseModel):
 
 
 class ConfigResponse(BaseModel):
-    """Configuration response (non-sensitive values only)."""
+    """Configuration response (non-sensitive values only).
+
+    LLM-specific fields (model name, embedding model) require an
+    `org_slug` to resolve in the multi-org world and are intentionally
+    omitted from this endpoint; query the per-org config separately.
+    """
 
     host: str
     port: int
     log_level: str
-    openai_model: str
-    openai_embedding_model: str
     chunk_size: int
     chunk_overlap: int
     top_k: int
diff --git a/services/rag/app/routers/health.py b/services/rag/app/routers/health.py
index 0b5c686969..24a7a08689 100644
--- a/services/rag/app/routers/health.py
+++ b/services/rag/app/routers/health.py
@@ -13,7 +13,7 @@
 
 from typing import Any
 
-from fastapi import APIRouter, HTTPException, status
+from fastapi import APIRouter
 from loguru import logger
 
 from .. import __version__
@@ -87,22 +87,13 @@ async def get_config():
 
     Auth-gated via the protected router; before round-2 v15 this leaked
     deployment fingerprints (model names, host/port, chunking params)
-    to any caller with reach to the RAG port.
+    to any caller with reach to the RAG port. LLM/embedding model names
+    require an `org_slug` to resolve and are omitted here.
     """
-    try:
-        llm_config = settings.get_llm_config()
-    except ValueError as exc:
-        raise HTTPException(
-            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
-            detail="LLM configuration not available",
-        ) from exc
-
     return ConfigResponse(
         host=settings.host,
         port=settings.port,
         log_level=settings.log_level,
-        openai_model=llm_config.get("model", ""),
-        openai_embedding_model=llm_config.get("embedding_model", ""),
         chunk_size=settings.chunk_size,
         chunk_overlap=settings.chunk_overlap,
         top_k=settings.top_k,
diff --git a/tools/cli/src/commands/deploy/index.ts b/tools/cli/src/commands/deploy/index.ts
index c7d9816dd5..b96da2b814 100644
--- a/tools/cli/src/commands/deploy/index.ts
+++ b/tools/cli/src/commands/deploy/index.ts
@@ -60,9 +60,13 @@ export function createDeployCommand(): Command {
         // (typical: a new `SANDBOX_TOKEN` for an existing deployment),
         // force-recreate the running services so their in-memory env
         // refreshes to the new value rather than keeping the stale null.
+        // Also force-recreate on --override-all so the reseed action
+        // runs against the new binary, not a stale container that the
+        // image/config-unchanged path would have left running.
         const forceRecreate =
-          regeneratedAutoSecrets !== undefined &&
-          regeneratedAutoSecrets.length > 0;
+          (regeneratedAutoSecrets !== undefined &&
+            regeneratedAutoSecrets.length > 0) ||
+          (options.overrideAll ?? false);
         const env = loadEnv(projectDir);
 
         const version = pkg.version.includes('-dev') ? 'latest' : pkg.version;
diff --git a/tools/cli/src/lib/actions/deploy.ts b/tools/cli/src/lib/actions/deploy.ts
index 3166dde853..82ee5fcec0 100644
--- a/tools/cli/src/lib/actions/deploy.ts
+++ b/tools/cli/src/lib/actions/deploy.ts
@@ -145,8 +145,22 @@ export async function deploy(options: DeployOptions): Promise<void> {
       const prefix = dryRun ? '[DRY-RUN] ' : '';
       logger.header(`${prefix}Deploying Tale ${version}`);
 
-      // (Auto-migration framework removed — `tale migrate config-layout` is
-      // the only opt-in, manually-run migration now.)
+      // Auto-migration framework removed — `tale migrate config-layout` is
+      // the only opt-in, manually-run migration now. Fail fast (before
+      // pulling images / rolling services) if the project still has the
+      // pre-refactor flat layout at the root; otherwise a no-op deploy
+      // could complete while the host config silently never reaches the
+      // container.
+      {
+        const { legacyDirs } = await findOrgDirs(env.DEPLOY_DIR);
+        if (legacyDirs.length > 0) {
+          throw new Error(
+            `Legacy flat layout detected at project root (${legacyDirs.join(', ')}/). ` +
+              `Run 'tale migrate config-layout' then 'tale deploy --override-all -y' ` +
+              `(see docs/self-hosted/operate/upgrades.md).`,
+          );
+        }
+      }
 
       // Check if this is a first-time deployment
       const currentColor = await getCurrentColor(env.DEPLOY_DIR);
@@ -572,7 +586,10 @@ export async function deploy(options: DeployOptions): Promise<void> {
           `${prefix}Dry-run complete! Would deploy version ${version}`,
         );
       } else {
-        logger.success(`Deployment complete! Version ${version} is now live`);
+        // Containers are now rolled. Don't print "Deployment complete!"
+        // yet — that announces success over the wire, but sync + reseed
+        // still run below and either can abort the deploy.
+        logger.info(`${prefix}Services updated to version ${version}.`);
       }
 
       // Sync project files to the convex container (owns convex-data volume rw)
@@ -594,6 +611,10 @@ export async function deploy(options: DeployOptions): Promise<void> {
           assumeYes: options.assumeYes ?? false,
         });
       }
+
+      if (!dryRun) {
+        logger.success(`Deployment complete! Version ${version} is now live`);
+      }
     });
   } finally {
     process.removeListener('SIGINT', onInterrupt);
@@ -681,14 +702,11 @@ async function syncProjectFiles(
   const { orgDirs, legacyDirs } = await findOrgDirs(projectDir);
 
   if (legacyDirs.length > 0) {
-    logger.error(
-      `${prefix}Legacy flat layout detected at project root (${legacyDirs.join(', ')}/).`,
+    throw new Error(
+      `Legacy flat layout detected at project root (${legacyDirs.join(', ')}/). ` +
+        `Run 'tale migrate config-layout' then 'tale deploy --override-all -y' ` +
+        `(see docs/self-hosted/operate/upgrades.md).`,
     );
-    logger.info(
-      `${prefix}  Move config under 'default/<domain>/' (or run 'tale init --force' to rescaffold).`,
-    );
-    logger.info(`${prefix}  Aborting --override push.`);
-    return;
   }
 
   if (orgDirs.length === 0) {
diff --git a/tools/cli/src/lib/actions/start.ts b/tools/cli/src/lib/actions/start.ts
index 2f86fa701e..51e2e56572 100644
--- a/tools/cli/src/lib/actions/start.ts
+++ b/tools/cli/src/lib/actions/start.ts
@@ -223,6 +223,7 @@ export async function start(options: StartOptions): Promise<void> {
     { version, registry: env.GHCR_REGISTRY },
     hostAlias,
     port,
+    { projectDir },
   );
 
   const overrideFile = findComposeOverride(projectDir);

From b2db3161f133466a98ea96b33af6805d635e236e Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Thu, 28 May 2026 15:01:17 +0800
Subject: [PATCH 04/41] fix(platform,cli,crawler,rag,docs): close P0/P1 gaps
 from second-round review
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two-round multi-agent review of refactor/uniform-org-first-config-layout
surfaced 5 P0s and ~30 P1s. This commit closes all of them.

P0 — cross-tenant isolation

- workflow_engine RAG delete_document, document extract/apply_docx_structured:
  add verifyStorageIdsBelongToOrg guards (mirror compare/retrieve pattern).
- crawler vision: refactor VisionClient + process_pages_with_llm to per-org
  _org_states / _chat_states keyed by get_active_org(); previously a 15s
  TTL singleton leaked org A's API key to org B's request within the
  window. llm_cache (OCR/desc/LLM) entries are now org-scoped via
  _scoped_key so the same content from two orgs never collides.
- New test_vision_isolation.py locks the invariant down (6 cases).

P1 — abstractions and CLI/upgrade flow

- UpstreamHttpError: .message = safeMessage only (snippet kept on
  .bodySnippet so it doesn't cross the Convex client boundary as a
  default toast); parse Retry-After into retryAfterMs; endpoint
  defaults to response.url; new toConvexError() carries structured
  fields across the wire; 9 new tests covering 401/403/404/429
  carve-outs, Retry-After parsing, ConvexError marshalling.
- Migrate 8 raw `new Error` sites to UpstreamHttpError.fromResponse
  (crawler_action ×3, file_metadata, fetch_document_comparison 4xx
  paths, web fetch_and_extract, docx extract/apply structured helpers).
- delete_document re-throws retryable upstream errors instead of
  folding them into {success:false}, so action retries can recover.
- rag_search_tool search-path now mirrors retrieve-path: returns
  safe summary instead of throwing past the agent runtime.

P1 — shared utilities (dedup)

- New lib/shared/constants/org-slug.ts owns ORG_SLUG_REGEX,
  isValidOrgSlug, assertValidOrgSlug — replaces 3 inline copies
  (file_io.ts, config-watcher.ts, reseed_all_orgs.ts) and tightens
  the bash regex in migrate-config-layout/script.sh.
- lib/file_io.ts gains getConfigRoot(area?) and safeJoinWithinDir
  helpers; 6 file_utils.ts files + config_store/store.ts drop ~80
  lines of copy-pasted path-traversal guards.
- organizations/resolve_org_slug.ts now re-exports orgSlugFromId
  (single implementation across ~46 callers).

P1 — CLI upgrade flow alignment

- migrate-config-layout/script.sh: pre-scan dst-collisions, SKIP
  notices to stdout (only ERROR on stderr), invalid org slug
  surfaces as conflict+error not silent skip.
- start.ts: import LEGACY_DOMAIN_DIR_NAMES from deploy.ts (closes
  the missing 'retention' drift), hard-fail on legacy layout
  (consistent with deploy), ensureEnv unconditional (matches deploy
  semantics for auto-secret refresh).
- migrate-config-layout.ts: actionable error when convex container
  isn't running; help text says "byte-for-byte" not "sha256"
  (matches cmp -s implementation).
- Three-locale docs/upgrades.md: drop "(and other config)" overpromise,
  reflect deploy hard-fail (not "starts up empty"), document old-
  container-must-be-running prereq for step 1, fix DE/FR grammar
  ("du läufst" → "du ausführst", "neu walkst" → "neu durchgehst",
  "re-walks" → "reparcours").

P1 — reseed CLI robustness

- reseed-all-orgs.ts: line-aware trailing-JSON parser (replaces
  fragile lastIndexOf('{')); grep `|| true` so grep zero-matches
  don't poison pipefail; failure branch parses payload too so
  failed-slug detail reaches CI logs; timeout-124 exit gets a
  distinct "timed out, safe to re-run" message.
- reseed_all_orgs.ts: invalid betterAuth slugs flow into results;
  pagination guards against stuck cursor + 1000-page cap.

P1 — RAG internal concurrency

- search() returns (results, usage) tuple — drops the mutable
  self.last_search_usage singleton that mis-attributed tokens
  under concurrent calls.
- Module-level _pin_dim_lock serializes the first _pinned_dims
  write across orgs (was racing past `if dims is None`).
- _org_locks LRU-capped at 256 to bound memory if a caller ever
  sprays distinct slugs.
- shutdown() drains _background_tasks before close_pool().

P1 — crawler data correctness

- DELETE FROM chunks: add `AND domain = $2` so same URL path on
  two domains doesn't over-delete.
- delete_page_chunks now accepts optional domain arg.
- pg_website_store: parse asyncpg DELETE tag as integer (was
  literal string compare against "DELETE 0").

P1 — branding hardening

- requireBrandingAdmin: trusted-headers branch no longer short-
  circuits past isCallerAdmin's default-org check.
- safeGetUrl in getLegacyBranding now logs instead of swallowing.
- saveImage/deleteImage readdir errors: distinguish ENOENT from
  EACCES/EISDIR.
- server.ts branding route: explicit Content-Type allowlist + sep-
  bounded prefix check (defense in depth over the existing
  filename validator).

P1 — docker entrypoint + 2026-03-28 migration script

- FORCE_SEED default ("false") so script stays correct under any
  future set -u audit.
- $data_dir single source of truth — drops /app/data hardcodes
  that diverged from $TALE_CONFIG_DIR.
- chown -R replaced with `find ! -user app -exec chown app:app`
  so large volumes don't re-walk every startup.
- POSTGRES_URL parsing handles bracketed IPv6 ([::1]) and URL-
  encoded password segments (pure-bash, no python dependency).
- mkdir + atomic_cp chained with `&&` instead of `;` so a failed
  mkdir doesn't cause a misattributed copy diagnostic.
- 2026-03-28 migration: drop `2>/dev/null` on cp so I/O errors
  surface; keep `|| true` only for the empty-glob case.

P1 — file_metadata retry classification

- extractFileMetadata uses isUpstreamHttpError to distinguish
  transient (5xx/408/429 → retry) from permanent (4xx, org-slug
  lookup failure → markFailed). Earlier retried permanent errors
  N times burning scheduler slots.

P1 — auth + org form

- beforeCreateOrganization: lowercase-normalize slug BEFORE
  reservation + uniqueness checks (closes Default/default cased
  bypass); assertValidOrgSlug on entry.
- New beforeUpdateOrganization hook: same guards on rename so
  owners can't claim reserved slugs post-creation.
- organization-form.tsx: extract deriveOrgSlug helper (was
  inlined three places); route Zod refine messages through
  useT (was hardcoded English); add three-locale i18n keys.

P1 — scaffold test coverage

- Add tests for invalid-slug skipped:true return, retention
  override on/off, strict:true aggregated throw, non-strict
  aggregated result.

Verification

- bun run check: all lint + type + test suites pass.
- Platform: 274 test files, 70941 assertions green.
- Crawler: 487 tests, RAG: 298 tests.
---
 docs/de/self-hosted/operate/upgrades.md       |  22 ++-
 docs/en/self-hosted/operate/upgrades.md       |  17 +-
 docs/fr/self-hosted/operate/upgrades.md       |  22 ++-
 scripts/2026-03-28-migrate-convex-data.sh     |   8 +-
 services/convex/docker-entrypoint.sh          |  63 ++++++-
 .../crawler/app/services/indexing_service.py  |  21 ++-
 .../crawler/app/services/pg_website_store.py  |  17 +-
 services/crawler/app/services/vision/cache.py |  58 ++++--
 .../app/services/vision/openai_client.py      | 162 +++++++++++-----
 services/crawler/tests/conftest.py            |  18 ++
 .../crawler/tests/test_vision_isolation.py    | 174 ++++++++++++++++++
 .../components/organization-form.tsx          |  61 +++---
 .../settings/governance/hooks/mutations.ts    |   2 +-
 .../fetch_document_comparison.test.ts         |  10 +-
 .../documents/fetch_document_content.test.ts  |  14 +-
 .../helpers/fetch_document_comparison.ts      |  17 +-
 .../agent_tools/rag/query_rag_context.ts      |  36 +++-
 .../convex/agent_tools/rag/rag_search_tool.ts |  22 ++-
 .../web/helpers/fetch_and_extract.ts          |  17 +-
 .../web/helpers/query_web_context.ts          |   5 +-
 .../agent_tools/web/helpers/search_pages.ts   |  20 +-
 services/platform/convex/agents/file_utils.ts |  31 +---
 services/platform/convex/auth.ts              |  66 ++++++-
 .../platform/convex/branding/file_actions.ts  |  29 ++-
 .../platform/convex/branding/file_utils.ts    |  42 +----
 .../convex/branding/internal_queries.ts       |  11 +-
 .../convex/file_metadata/internal_actions.ts  |  29 ++-
 .../convex/integrations/file_utils.ts         |  31 +---
 .../platform/convex/lib/config_store/store.ts |  38 ++--
 .../__tests__/upstream_http_error.test.ts     | 110 ++++++++++-
 .../convex/lib/errors/upstream_http_error.ts  |  99 +++++++++-
 services/platform/convex/lib/file_io.ts       |  51 ++++-
 .../platform/convex/lib/helpers/rag_config.ts |  12 +-
 .../convex/organizations/reseed_all_orgs.ts   |  43 ++++-
 .../organizations/resolve_org_slug.test.ts    |   4 +-
 .../convex/organizations/resolve_org_slug.ts  |  32 +---
 .../convex/organizations/scaffold.test.ts     |  96 +++++++++-
 .../platform/convex/providers/file_utils.ts   |  44 ++---
 services/platform/convex/skills/file_utils.ts |  41 ++---
 .../action_defs/crawler/crawler_action.ts     |  28 ++-
 .../action_defs/document/document_action.ts   |  21 +++
 .../document/helpers/apply_docx_structured.ts |   8 +-
 .../helpers/extract_docx_structured.ts        |   8 +-
 .../rag/helpers/delete_document.test.ts       |  18 +-
 .../rag/helpers/delete_document.ts            |  35 +++-
 .../rag/helpers/upload_file_direct.test.ts    |   8 +-
 .../action_defs/rag/rag_action.ts             |   3 +
 .../platform/convex/workflows/file_utils.ts   |  34 ++--
 services/platform/lib/config-watcher.ts       |   4 +-
 .../platform/lib/shared/constants/org-slug.ts |  32 ++++
 services/platform/messages/de.json            |   3 +
 services/platform/messages/en.json            |   3 +
 services/platform/messages/fr.json            |   3 +
 services/platform/server.ts                   |  31 +++-
 services/rag/app/routers/search.py            |   5 +-
 services/rag/app/services/rag_service.py      |  90 ++++++---
 services/rag/tests/test_rag_service.py        |  54 ++++--
 tools/cli/src/commands/migrate.ts             |   4 +-
 tools/cli/src/lib/actions/deploy.ts           |  14 +-
 .../src/lib/actions/migrate-config-layout.ts  |  19 +-
 tools/cli/src/lib/actions/reseed-all-orgs.ts  |  95 +++++++---
 tools/cli/src/lib/actions/start.ts            |  60 +++---
 .../src/lib/migrate-config-layout/script.sh   |  49 ++++-
 63 files changed, 1668 insertions(+), 556 deletions(-)
 create mode 100644 services/crawler/tests/test_vision_isolation.py
 create mode 100644 services/platform/lib/shared/constants/org-slug.ts

diff --git a/docs/de/self-hosted/operate/upgrades.md b/docs/de/self-hosted/operate/upgrades.md
index db5eee2a08..887b37d01e 100644
--- a/docs/de/self-hosted/operate/upgrades.md
+++ b/docs/de/self-hosted/operate/upgrades.md
@@ -18,7 +18,7 @@ Zwei Dinge sind es wert, zuerst zu bestätigen:
 
 ## Die zwei Kommandos
 
-`tale upgrade` aktualisiert das CLI-Binary selbst. Die deployte Plattform-Version stimmt mit der Version des CLI überein — diese Kopplung ist Absicht, damit das CLI, das du läufst, nicht eine Version deployen kann, die es nicht kennt.
+`tale upgrade` aktualisiert das CLI-Binary selbst. Die deployte Plattform-Version stimmt mit der Version des CLI überein — diese Kopplung ist Absicht, damit das CLI, das du ausführst, nicht eine Version deployen kann, die es nicht kennt.
 
 ```bash
 # Bewege das CLI auf das letzte Release
@@ -79,18 +79,21 @@ Minor-Versionen zu überspringen (von 0.9 auf 0.11 zu gehen) ist unterstützt, s
 
 ## Wo das hingehört
 
-Der Upgrade-Flow knüpft jede andere Operate-Seite an — Backups sind das, was ein gescheitertes Upgrade wiederherstellbar macht, Observability ist das, was dir sagt, dass die neue Farbe healthy ist, Hardening ist das, was du nach einer Major-Version neu walkst. Setzt du das CLI zum ersten Mal auf, deckt [Tale-CLI installieren](/de/self-hosted/install/cli-install) das workstationseitige Setup ab; nimmst du den Pager mitten im Rollout auf, nennt [Troubleshooting](/de/self-hosted/operate/observability/troubleshooting) die Symptome.
+Der Upgrade-Flow knüpft jede andere Operate-Seite an — Backups sind das, was ein gescheitertes Upgrade wiederherstellbar macht, Observability ist das, was dir sagt, dass die neue Farbe healthy ist, Hardening ist das, was du nach einer Major-Version neu durchgehst. Setzt du das CLI zum ersten Mal auf, deckt [Tale-CLI installieren](/de/self-hosted/install/cli-install) das workstationseitige Setup ab; nimmst du den Pager mitten im Rollout auf, nennt [Troubleshooting](/de/self-hosted/operate/observability/troubleshooting) die Symptome.
 
 ## Migration auf das Org-first-Config-Layout
 
 Ältere Tale-Releases haben Config in einem flachen Baum im Workspace-Root abgelegt (`agents/`, `workflows/`, `integrations/`, `branding/`, `providers/`, `skills/`). Aktuelles Tale nutzt ein **Org-first**-Layout, in dem jede Org — auch die kanonische `default` — ihren eigenen Unterbaum besitzt: `<root>/<org>/<domain>/...`. Die Migration ist opt-in und läuft einmal pro Workspace. Die neue Plattform liest die alten Pfade nicht mehr; bis du migrierst, liegen Provider-Secrets und Anpassungen in Verzeichnissen, die das Runtime nicht mehr anschaut.
 
-Die Migration sind drei Kommandos:
+Die Migration sind drei Kommandos. Für Schritt 1 muss der Convex-Container vom **alten** Image noch laufen — halt die Plattform auf der alten Version online und führe Schritt 1 gegen diesen laufenden Container aus, bevor du upgradest.
 
 ```bash
-# 1. Provider-Secrets (und andere Config) aus dem flachen Layout nach
-#    `default/<domain>/...` kopieren. cp statt mv, damit die alten Pfade
-#    für einen möglichen Rollback intakt bleiben.
+# 1. Provider-Secrets aus dem flachen Layout nach
+#    `default/providers/...` kopieren. cp statt mv, damit die alten
+#    Pfade für einen möglichen Rollback intakt bleiben. Scope sind
+#    ausschließlich Provider-Secrets; alle anderen Domains (agents,
+#    workflows, integrations, skills, branding, retention) werden in
+#    Schritt 2 server-seitig aus dem Builtin-Katalog re-seedet.
 tale migrate config-layout
 
 # 2. Convex-Container gegen das Org-first-Volume-Layout neu erstellen
@@ -100,8 +103,9 @@ tale migrate config-layout
 tale deploy --override-all -y
 
 # 3. Wenn du das neue Layout verifiziert hast, alte Pfade entfernen.
-#    sha-verifiziert, dass die neue Datei der alten entspricht, bevor
-#    unlink; bei Mismatch wird das Löschen verweigert.
+#    Verifiziert byte-für-byte, dass die neue Datei der alten
+#    entspricht, bevor unlink; bei Mismatch wird das Löschen
+#    verweigert.
 tale migrate config-layout --cleanup-old
 ```
 
@@ -117,4 +121,4 @@ Nach Schritt 3 (`--cleanup-old`) sind die alten Pfade weg. Downgrade re-seedet d
 
 ### Was, wenn ich Schritt 1 überspringe?
 
-Der Convex-Container erkennt beim Start die übrig gebliebenen flachen Layout-Dirs und schreibt eine Warnung in seine Logs, die die Verzeichnisse benennt und auf dieses Runbook zeigt. Das Deployment startet, aber Reads aus diesen Verzeichnissen liefern leer, und Writes gehen in die neuen (leeren) Org-first-Pfade. Die Korrektur sind weiterhin Schritt 1 + 2 — sie nach der Warnung laufen zu lassen funktioniert genauso wie sie im Voraus laufen zu lassen.
+`tale deploy` und `tale start` verweigern beide den Start, wenn sie übrig gebliebene flache Layout-Dirs (`agents/`, `workflows/`, `integrations/`, `branding/`, `providers/`, `skills/`, `retention/`) im Workspace-Root finden. Der Fehler nennt die betroffenen Verzeichnisse und verweist auf dieses Runbook. Die Korrektur sind Schritt 1 + 2 in dieser Reihenfolge; es gibt keinen "trotzdem deployen und Legacy-Pfade ignorieren"-Modus — die Runtime-Resolver lesen diese Pfade nicht, ein Boot ohne Migration würde die Plattform also mit leerer Config zurücklassen.
diff --git a/docs/en/self-hosted/operate/upgrades.md b/docs/en/self-hosted/operate/upgrades.md
index 20d8c20dee..b2c8864ecd 100644
--- a/docs/en/self-hosted/operate/upgrades.md
+++ b/docs/en/self-hosted/operate/upgrades.md
@@ -85,12 +85,15 @@ The upgrade flow ties together every other operate page — backups are what mak
 
 Older Tale releases stored config in a flat tree at the workspace root (`agents/`, `workflows/`, `integrations/`, `branding/`, `providers/`, `skills/`). Current Tale uses an **org-first** layout where every org — including the canonical `default` — owns its own subtree: `<root>/<org>/<domain>/...`. The migration is opt-in and runs once per workspace. The new platform refuses to read the legacy paths; until you migrate, your provider secrets and customizations live in directories the runtime no longer looks at.
 
-The migration is three commands:
+The migration is three commands. The convex container from the **old** image must still be running for step 1 — keep the platform up on the old version, then run step 1 against that running container before upgrading.
 
 ```bash
-# 1. Copy provider secrets (and other config) from the flat layout into
-#    `default/<domain>/...`. cp not mv, so the old paths stay intact in
-#    case you need to roll back.
+# 1. Copy provider secrets from the flat layout into
+#    `default/providers/...`. cp not mv, so the old paths stay intact
+#    in case you need to roll back. Scope is provider secrets only;
+#    every other domain (agents, workflows, integrations, skills,
+#    branding, retention) is re-seeded server-side by step 2 from the
+#    builtin catalog.
 tale migrate config-layout
 
 # 2. Recreate the Convex container against the org-first volume layout
@@ -100,8 +103,8 @@ tale migrate config-layout
 tale deploy --override-all -y
 
 # 3. Once you have verified the new layout is intact, remove the legacy
-#    paths. sha-verifies that the new file matches the old before
-#    unlinking; refuses to delete on any mismatch.
+#    paths. Verifies that the new file matches the old byte-for-byte
+#    before unlinking; refuses to delete on any mismatch.
 tale migrate config-layout --cleanup-old
 ```
 
@@ -117,4 +120,4 @@ After step 3 (`--cleanup-old`), the legacy paths are gone. Downgrade still re-se
 
 ### What if I skip step 1?
 
-The Convex container will detect leftover flat-layout dirs on boot and print a warning to its logs naming the directories and pointing at this runbook. The deployment will start up, but reads from those directories return empty and writes go to the new (empty) org-first paths. The fix is still steps 1 + 2 — running them after the warning works exactly the same as running them up front.
+`tale deploy` and `tale start` both refuse to run when they detect leftover flat-layout dirs (`agents/`, `workflows/`, `integrations/`, `branding/`, `providers/`, `skills/`, `retention/`) at the workspace root. The error names the offending directories and points at this runbook. The fix is steps 1 + 2 in order; there is no "deploy anyway and ignore the legacy paths" mode — the runtime resolvers do not read those paths, so booting without migrating would leave the platform with empty config.
diff --git a/docs/fr/self-hosted/operate/upgrades.md b/docs/fr/self-hosted/operate/upgrades.md
index 0881491c2d..9ce64839f3 100644
--- a/docs/fr/self-hosted/operate/upgrades.md
+++ b/docs/fr/self-hosted/operate/upgrades.md
@@ -79,19 +79,22 @@ Sauter des versions mineures (passer de 0.9 à 0.11) est supporté tant que les
 
 ## Où cela s'inscrit
 
-Le flow de montée de version noue chaque autre page d'exploitation — les backups sont ce qui rend une montée de version échouée récupérable, l'observabilité est ce qui te dit que la nouvelle couleur est saine, le durcissement est ce que tu re-walks après une version majeure. Si tu mets en place la CLI pour la première fois, [Installer la CLI tale](/fr/self-hosted/install/cli-install) couvre le setup côté workstation ; si tu prends le pager en plein rollout, [Dépannage](/fr/self-hosted/operate/observability/troubleshooting) nomme les symptômes.
+Le flow de montée de version noue chaque autre page d'exploitation — les backups sont ce qui rend une montée de version échouée récupérable, l'observabilité est ce qui te dit que la nouvelle couleur est saine, le durcissement est ce que tu reparcours après une version majeure. Si tu mets en place la CLI pour la première fois, [Installer la CLI tale](/fr/self-hosted/install/cli-install) couvre le setup côté workstation ; si tu prends le pager en plein rollout, [Dépannage](/fr/self-hosted/operate/observability/troubleshooting) nomme les symptômes.
 
 ## Migration vers la disposition de config org-first
 
 Les anciennes versions de Tale stockaient la config dans une arborescence plate à la racine du workspace (`agents/`, `workflows/`, `integrations/`, `branding/`, `providers/`, `skills/`). La version actuelle utilise une disposition **org-first** où chaque org — y compris la canonique `default` — possède son propre sous-arbre : `<root>/<org>/<domain>/...`. La migration est opt-in et tourne une seule fois par workspace. La nouvelle plateforme refuse de lire les anciens chemins ; tant que tu n'as pas migré, tes secrets de provider et personnalisations vivent dans des répertoires que le runtime ne regarde plus.
 
-La migration tient en trois commandes :
+La migration tient en trois commandes. Pour l'étape 1, le conteneur Convex de l'**ancienne** image doit encore tourner — garde la plateforme en ligne sur l'ancienne version et lance l'étape 1 contre ce conteneur en cours avant de monter de version.
 
 ```bash
-# 1. Copier les secrets de provider (et autres configs) depuis la
-#    disposition plate vers `default/<domain>/...`. cp et non mv, donc
-#    les anciens chemins restent intacts au cas où un rollback serait
-#    nécessaire.
+# 1. Copier les secrets de provider depuis la disposition plate vers
+#    `default/providers/...`. cp et non mv, donc les anciens chemins
+#    restent intacts au cas où un rollback serait nécessaire. Le scope
+#    couvre uniquement les secrets de provider ; tous les autres
+#    domaines (agents, workflows, integrations, skills, branding,
+#    retention) sont re-seedés côté serveur à l'étape 2 depuis le
+#    catalogue builtin.
 tale migrate config-layout
 
 # 2. Recréer le conteneur Convex contre la disposition de volume org-first
@@ -100,8 +103,9 @@ tale migrate config-layout
 tale deploy --override-all -y
 
 # 3. Une fois la nouvelle disposition vérifiée intacte, supprimer les
-#    anciens chemins. Vérifie via sha que le nouveau fichier correspond
-#    à l'ancien avant unlink ; refuse de supprimer en cas de mismatch.
+#    anciens chemins. Vérifie byte-à-byte que le nouveau fichier
+#    correspond à l'ancien avant unlink ; refuse de supprimer en cas
+#    de mismatch.
 tale migrate config-layout --cleanup-old
 ```
 
@@ -117,4 +121,4 @@ Après l'étape 3 (`--cleanup-old`), les anciens chemins sont partis. Le downgra
 
 ### Et si je saute l'étape 1 ?
 
-Le conteneur Convex détectera au démarrage les répertoires restants de la disposition plate et écrira un warning dans ses logs en nommant les répertoires et pointant vers ce runbook. Le déploiement démarre, mais les reads sur ces répertoires reviennent vides et les writes vont vers les nouveaux chemins (vides) org-first. La correction reste étapes 1 + 2 — les lancer après le warning fonctionne exactement comme les lancer en amont.
+`tale deploy` et `tale start` refusent tous les deux de démarrer s'ils détectent des répertoires restants de la disposition plate (`agents/`, `workflows/`, `integrations/`, `branding/`, `providers/`, `skills/`, `retention/`) à la racine du workspace. L'erreur nomme les répertoires concernés et pointe vers ce runbook. La correction reste les étapes 1 + 2 dans cet ordre ; il n'existe pas de mode « déploie quand même et ignore les chemins legacy » — les résolveurs runtime ne lisent pas ces chemins, donc démarrer sans migrer laisserait la plateforme avec une config vide.
diff --git a/scripts/2026-03-28-migrate-convex-data.sh b/scripts/2026-03-28-migrate-convex-data.sh
index 7ad741cf29..b377e605d7 100755
--- a/scripts/2026-03-28-migrate-convex-data.sh
+++ b/scripts/2026-03-28-migrate-convex-data.sh
@@ -72,7 +72,13 @@ if [ "$old_exists" = true ]; then
         mkdir -p "$dst"
 
         before=$(ls "$dst" 2>/dev/null | wc -l)
-        cp -rn "$src/"* "$dst/" 2>/dev/null || true
+        # `cp -rn` is no-clobber, so re-runs are no-ops on already-
+        # copied trees. Earlier this swallowed stderr unconditionally,
+        # which hid disk-full / permission-denied as "0 new items".
+        # `|| true` is kept only to tolerate the "no files to copy"
+        # edge case (matched glob with no entries) without aborting
+        # `set -e`; real I/O errors now surface on stderr.
+        cp -rn "$src/"* "$dst/" || true
         after=$(ls "$dst" | wc -l)
         added=$((after - before))
 
diff --git a/services/convex/docker-entrypoint.sh b/services/convex/docker-entrypoint.sh
index 5bc69b5923..19924b1fb6 100755
--- a/services/convex/docker-entrypoint.sh
+++ b/services/convex/docker-entrypoint.sh
@@ -6,6 +6,13 @@
 # noise it would catch.
 set -eo pipefail
 
+# Default for the seed-force flag. The script references `$FORCE_SEED`
+# in several places (`[ "$FORCE_SEED" = "true" ]`); without this
+# default it works only because `set -u` is intentionally off — any
+# future audit that enables nounset would break startup. Pin the
+# default here so the script stays correct under both modes.
+FORCE_SEED="${FORCE_SEED:-false}"
+
 # ============================================================================
 # Tale Convex Service Entrypoint
 # ----------------------------------------------------------------------------
@@ -44,7 +51,13 @@ if [ "$(id -u)" = '0' ]; then
   # org seed target) up front; per-domain dirs are created on-demand by
   # `run_seed` and `scaffoldNewOrganization`.
   mkdir -p "$data_dir/convex" "$data_dir/default"
-  chown -R app:app "$data_dir"
+  # Only chown files NOT already owned by `app:app`. On large volumes
+  # (RAG uploads, Convex storage) the prior unconditional `chown -R`
+  # walked every inode every boot, adding tens of seconds and racing
+  # with backend writes during fast restart loops. `find ... -exec
+  # chown {} +` is idempotent and short-circuits once the volume is
+  # consistent.
+  find "$data_dir" \! -user app -exec chown app:app {} +
 
   # ----------------------------------------------------------------------------
   # SSRF egress firewall (defense-in-depth)
@@ -240,8 +253,32 @@ wait_for_http() {
 }
 
 # Extract DB host:port from POSTGRES_URL for a TCP probe.
-db_host=$(echo "$POSTGRES_URL" | sed -E 's#^postgres(ql)?://([^@/]+@)?([^:/?]+).*#\3#')
-db_port=$(echo "$POSTGRES_URL" | sed -nE 's#^postgres(ql)?://([^@/]+@)?[^:/?]+:([0-9]+).*#\3#p')
+#
+# Strip the scheme + optional `user:pass@` userinfo (greedy match up to
+# the LAST `@`, which handles passwords containing `@` correctly), then
+# special-case the bracketed IPv6 form `[::1]:5432` before falling
+# through to the bare `host:port` form.
+hostport="${POSTGRES_URL#*://}"
+case "$hostport" in
+  *@*) hostport="${hostport##*@}" ;;
+esac
+hostport="${hostport%%/*}"
+hostport="${hostport%%\?*}"
+case "$hostport" in
+  '['*']'*)
+    db_host="${hostport#[}"; db_host="${db_host%%]*}"
+    tail="${hostport#*]}"
+    db_port="${tail#:}"
+    ;;
+  *:*)
+    db_host="${hostport%%:*}"
+    db_port="${hostport##*:}"
+    ;;
+  *)
+    db_host="$hostport"
+    db_port=""
+    ;;
+esac
 db_port="${db_port:-5432}"
 if [ -n "$db_host" ]; then
   wait_for_port "$db_host" "$db_port" 60 "PostgreSQL" || exit 1
@@ -250,8 +287,13 @@ fi
 # ============================================================================
 # Prepare working directories
 # ============================================================================
-mkdir -p /app/data/convex
-export TMPDIR=/app/data/convex/tmp
+# Single source of truth — every path below derives from `data_dir` so
+# an operator who sets `TALE_CONFIG_DIR` to a non-default mount gets
+# consistent behavior. Previously this block hardcoded `/app/data/...`
+# despite the root-priv chown loop above respecting TALE_CONFIG_DIR.
+data_dir="${TALE_CONFIG_DIR:-/app/data}"
+mkdir -p "$data_dir/convex"
+export TMPDIR="$data_dir/convex/tmp"
 mkdir -p "$TMPDIR"
 
 # Orphan video-link tmp dirs from crashed/killed ingest_video_link.ts actions.
@@ -287,8 +329,8 @@ fi
 # an older binary that doesn't recognize this marker re-seeds (idempotently)
 # into its expected old paths on a hypothetical downgrade.
 # ----------------------------------------------------------------------------
-seed_marker="/app/data/.seeded-${TALE_VERSION:-dev}-orgfirst"
-data_dir="/app/data"
+seed_marker="$data_dir/.seeded-${TALE_VERSION:-dev}-orgfirst"
+# `data_dir` already set above (single source of truth); no re-assign.
 
 # Crash-safe file copy: write to a sibling tmp file then rename to dest.
 # `cp` itself is non-atomic; the value is that an interrupted run leaves
@@ -345,13 +387,16 @@ run_seed() {
       local history_dir="$workflows_dir/.history/$flat_slug"
 
       if [ "$FORCE_SEED" = "true" ]; then
-        mkdir -p "$dest_dir"; atomic_cp "$src" "$dest"; echo "   ✓ Seeded workflow $rel_path (forced)"; continue
+        # `&&` (not `;`) so a failed mkdir aborts the copy attempt
+        # — otherwise atomic_cp runs against a missing dir and the
+        # diagnostic attributes the fault to the copy.
+        mkdir -p "$dest_dir" && atomic_cp "$src" "$dest" && echo "   ✓ Seeded workflow $rel_path (forced)"; continue
       fi
       if [ -f "$dest" ]; then echo "   ⏭ Skipping workflow $rel_path (already exists)"; continue; fi
       if [ -d "$history_dir" ] && [ "$(ls -A "$history_dir" 2>/dev/null)" ]; then
         echo "   ⏭ Skipping workflow $rel_path (user has modifications in .history)"; continue
       fi
-      mkdir -p "$dest_dir"; atomic_cp "$src" "$dest"; echo "   ✓ Seeded workflow $rel_path"
+      mkdir -p "$dest_dir" && atomic_cp "$src" "$dest" && echo "   ✓ Seeded workflow $rel_path"
     done
   fi
 
diff --git a/services/crawler/app/services/indexing_service.py b/services/crawler/app/services/indexing_service.py
index b3f1cc67d2..131dc36cf3 100644
--- a/services/crawler/app/services/indexing_service.py
+++ b/services/crawler/app/services/indexing_service.py
@@ -162,7 +162,11 @@ async def _hash_update(conn: asyncpg.Connection) -> tuple[dict[str, int], asyncp
 
         async def _store_chunks(conn: asyncpg.Connection) -> None:
             await conn.execute(_UPSERT_WEBSITE_URL, domain, url, title, content_hash, filtered_hash)
-            await conn.execute("DELETE FROM chunks WHERE url = $1", url)
+            # Scope by domain too: chunks PK is (domain, url, chunk_index)
+            # so two different domains hosting the same URL path
+            # (e.g. `/about`) would over-delete each other's chunks
+            # without this filter.
+            await conn.execute("DELETE FROM chunks WHERE domain = $1 AND url = $2", domain, url)
             for i in range(0, len(chunk_rows), _EXECUTEMANY_BATCH_SIZE):
                 await conn.executemany(_chunk_insert, chunk_rows[i : i + _EXECUTEMANY_BATCH_SIZE])
 
@@ -250,8 +254,19 @@ async def _index_one(row: asyncpg.Record) -> dict:
             "total_chunks": total_chunks,
         }
 
-    async def delete_page_chunks(self, url: str) -> int:
+    async def delete_page_chunks(self, url: str, domain: str | None = None) -> int:
+        # `domain` is optional for backwards compatibility — existing
+        # callers that don't pass it get the wider (URL-only) delete.
+        # New callers should pass it so two domains sharing a path
+        # don't over-delete each other's chunks.
         async with acquire_with_retry(self._pool) as conn:
-            result = await conn.execute("DELETE FROM chunks WHERE url = $1", url)
+            if domain is None:
+                result = await conn.execute("DELETE FROM chunks WHERE url = $1", url)
+            else:
+                result = await conn.execute(
+                    "DELETE FROM chunks WHERE domain = $1 AND url = $2",
+                    domain,
+                    url,
+                )
             count = int(result.split()[-1]) if result else 0
             return count
diff --git a/services/crawler/app/services/pg_website_store.py b/services/crawler/app/services/pg_website_store.py
index 75ee0e8ded..8a675ad6e3 100644
--- a/services/crawler/app/services/pg_website_store.py
+++ b/services/crawler/app/services/pg_website_store.py
@@ -352,8 +352,21 @@ async def begin_delete(self, domain: str, org_slug: str) -> dict:
                 domain,
                 org_slug,
             )
-            # asyncpg returns "DELETE N" as the tag; "DELETE 0" means no row matched.
-            removed_membership = deleted != "DELETE 0"
+            # asyncpg returns "DELETE N" as the documented command tag.
+            # Parse the integer rather than comparing the literal string
+            # so a future tag-format change (e.g. extra whitespace, OID
+            # column on older Postgres) doesn't silently flip the flag.
+            try:
+                removed_membership = int(deleted.rsplit(" ", 1)[-1]) > 0
+            except (ValueError, AttributeError):
+                # Defensive — should be unreachable given asyncpg's
+                # contract — but failing loud is better than silently
+                # mis-classifying.
+                logger.warning(
+                    "[begin_delete] unexpected command tag from asyncpg: %r",
+                    deleted,
+                )
+                removed_membership = False
             remaining = await conn.fetchval(
                 "SELECT COUNT(*) FROM website_org_memberships WHERE domain = $1",
                 domain,
diff --git a/services/crawler/app/services/vision/cache.py b/services/crawler/app/services/vision/cache.py
index 3c1cecd0d8..40425169b7 100644
--- a/services/crawler/app/services/vision/cache.py
+++ b/services/crawler/app/services/vision/cache.py
@@ -5,9 +5,12 @@
 content multiple times.
 
 Cache strategy:
-- Uses SHA-256 hash as cache key (image bytes for vision, text for LLM)
+- Uses SHA-256 hash + active org slug as cache key
 - In-memory LRU cache for fast access (O(1) operations via OrderedDict)
 - Separate caches for OCR, image description, and LLM processing results
+- Cache entries are scoped per org: two orgs hitting the same input do
+  NOT share cached output (different providers/prompts could yield
+  different results, and the result text itself may be sensitive).
 """
 
 import hashlib
@@ -15,6 +18,8 @@
 
 from loguru import logger
 
+from app.org_context import get_active_org
+
 CACHE_SIZE = 5000
 
 
@@ -28,6 +33,17 @@ def compute_text_hash(text: str) -> str:
     return hashlib.sha256(text.encode()).hexdigest()
 
 
+def _scoped_key(content_hash: str) -> str:
+    """Prepend active org slug to a content hash so cache entries do not
+    leak between orgs.
+
+    The org slug is required for any cache lookup; if it cannot be
+    resolved (caller forgot to set the ContextVar) `get_active_org`
+    raises and the caller never gets a cross-org hit by accident.
+    """
+    return f"{get_active_org()}:{content_hash}"
+
+
 class LlmCache:
     """Cache for Vision API results.
 
@@ -46,47 +62,61 @@ def _evict_if_needed(self, cache: OrderedDict[str, str]) -> None:
 
     def get_ocr(self, image_bytes: bytes) -> tuple[str | None, str]:
         """Get cached OCR result."""
-        image_hash = compute_image_hash(image_bytes)
+        image_hash = _scoped_key(compute_image_hash(image_bytes))
         if image_hash in self._ocr_cache:
             self._ocr_cache.move_to_end(image_hash)
-            logger.debug(f"Cache HIT (OCR): {image_hash[:16]}...")
+            logger.debug(f"Cache HIT (OCR): {image_hash[:24]}...")
             return self._ocr_cache[image_hash], image_hash
         return None, image_hash
 
     def set_ocr(self, image_hash: str, result: str) -> None:
-        """Store OCR result in cache."""
+        """Store OCR result in cache.
+
+        `image_hash` must be the value returned by `get_ocr` (already
+        org-scoped).
+        """
         self._evict_if_needed(self._ocr_cache)
         self._ocr_cache[image_hash] = result
         self._ocr_cache.move_to_end(image_hash)
 
     def get_description(self, image_bytes: bytes) -> tuple[str | None, str]:
         """Get cached image description."""
-        image_hash = compute_image_hash(image_bytes)
+        image_hash = _scoped_key(compute_image_hash(image_bytes))
         if image_hash in self._description_cache:
             self._description_cache.move_to_end(image_hash)
-            logger.debug(f"Cache HIT (description): {image_hash[:16]}...")
+            logger.debug(f"Cache HIT (description): {image_hash[:24]}...")
             return self._description_cache[image_hash], image_hash
         return None, image_hash
 
     def set_description(self, image_hash: str, result: str) -> None:
-        """Store image description in cache."""
+        """Store image description in cache.
+
+        `image_hash` must be the value returned by `get_description`.
+        """
         self._evict_if_needed(self._description_cache)
         self._description_cache[image_hash] = result
         self._description_cache.move_to_end(image_hash)
 
     def get_llm(self, cache_key: str) -> str | None:
-        """Get cached LLM processing result."""
-        if cache_key in self._llm_cache:
-            self._llm_cache.move_to_end(cache_key)
-            logger.debug(f"Cache HIT (LLM): {cache_key[:16]}...")
-            return self._llm_cache[cache_key]
+        """Get cached LLM processing result.
+
+        `cache_key` is treated as caller-supplied content; the active
+        org slug is prepended internally so the same `(chunk, prompt,
+        model)` tuple from two orgs never collides.
+        """
+        scoped = _scoped_key(cache_key)
+        if scoped in self._llm_cache:
+            self._llm_cache.move_to_end(scoped)
+            logger.debug(f"Cache HIT (LLM): {scoped[:24]}...")
+            return self._llm_cache[scoped]
         return None
 
     def set_llm(self, cache_key: str, result: str) -> None:
         """Store LLM processing result in cache."""
+        scoped = _scoped_key(cache_key)
         self._evict_if_needed(self._llm_cache)
-        self._llm_cache[cache_key] = result
-        self._llm_cache.move_to_end(cache_key)
+        self._llm_cache[scoped] = result
+        self._llm_cache.move_to_end(scoped)
 
     def get_stats(self) -> dict[str, int]:
         """Get cache statistics."""
diff --git a/services/crawler/app/services/vision/openai_client.py b/services/crawler/app/services/vision/openai_client.py
index e6c0cba81c..0f46a89f39 100644
--- a/services/crawler/app/services/vision/openai_client.py
+++ b/services/crawler/app/services/vision/openai_client.py
@@ -77,6 +77,36 @@ def _detect_mime_type(image_bytes: bytes) -> str:
 Be extremely concise - omit minor details."""
 
 
+_CONFIG_CHECK_INTERVAL = 15  # seconds
+
+
+class _OrgVisionState:
+    __slots__ = ("client", "config", "last_check")
+
+    def __init__(
+        self,
+        client: AsyncOpenAI,
+        config: tuple,
+        last_check: float,
+    ) -> None:
+        self.client = client
+        self.config = config
+        self.last_check = last_check
+
+
+# Per-org cached AsyncOpenAI clients for vision config. Keyed by org slug
+# so two orgs' requests never share `_client` / `_client_config` (which
+# would route org B's traffic through org A's API key when within the
+# TTL — the bug this refactor fixes).
+_vision_states: dict[str, _OrgVisionState] = {}
+
+# Same shape for chat config (used by `process_pages_with_llm`). Two
+# orgs may legitimately have different chat providers; without an
+# explicit per-org cache, the prior code rebuilt the client on every
+# call and leaked the httpx pool.
+_chat_states: dict[str, _OrgVisionState] = {}
+
+
 async def _safe_close_client(client: AsyncOpenAI) -> None:
     """Close an old client after a grace period for in-flight requests."""
     await asyncio.sleep(30)
@@ -86,50 +116,93 @@ async def _safe_close_client(client: AsyncOpenAI) -> None:
         logger.opt(exception=True).warning("Failed to close old vision client")
 
 
-class VisionClient:
-    """Async client for OpenAI Vision API calls."""
-
-    _CONFIG_CHECK_INTERVAL = 15  # seconds
-
-    def __init__(self) -> None:
-        self._client: AsyncOpenAI | None = None
-        self._client_config: tuple | None = None
-        self._last_config_check: float = 0
+def _get_or_build_client(
+    states: dict[str, _OrgVisionState],
+    org_slug: str,
+    config_getter,
+    *,
+    timeout: float,
+    label: str,
+) -> AsyncOpenAI:
+    """Look up or build the per-org AsyncOpenAI client.
+
+    Mirrors `embedding_service.get_embedding_service` so behavior is
+    consistent across crawler services:
+      - Within TTL: return cached client without re-reading config.
+      - Config read fails: keep the existing client; never silently
+        downgrade to an empty key.
+      - Config changed: build a new client, schedule the old one to
+        close after a grace period so in-flight calls finish.
+    """
+    state = states.get(org_slug)
+    now = time.monotonic()
+    if state is not None and (now - state.last_check) < _CONFIG_CHECK_INTERVAL:
+        return state.client
 
-    def _get_client(self) -> AsyncOpenAI:
-        """Get or create the OpenAI client, rebuilding if config changed."""
-        now = time.monotonic()
-        if self._client is not None and (now - self._last_config_check) < self._CONFIG_CHECK_INTERVAL:
-            return self._client
+    try:
+        config = config_getter(org_slug)  # (base_url, api_key, model)
+    except (ValueError, OSError):
+        if state is not None:
+            logger.opt(exception=True).warning(
+                "Config read failed for org '{}', keeping current {} client",
+                org_slug,
+                label,
+            )
+            state.last_check = now
+            return state.client
+        raise
+
+    if state is not None and config == state.config:
+        state.last_check = now
+        return state.client
+
+    base_url, api_key, model = config
+
+    # Never downgrade to empty key
+    if not api_key and state is not None:
+        logger.warning(
+            "Skipping {} reload for org '{}': new config has empty API key",
+            label,
+            org_slug,
+        )
+        state.last_check = now
+        return state.client
+
+    old_client = state.client if state is not None else None
+    new_client = AsyncOpenAI(api_key=api_key, base_url=base_url, timeout=timeout)
+    states[org_slug] = _OrgVisionState(
+        client=new_client,
+        config=config,
+        last_check=now,
+    )
 
-        self._last_config_check = now
-        try:
-            config = settings.get_vision_config(get_active_org())  # (base_url, api_key, model)
-        except (ValueError, OSError):
-            if self._client is not None:
-                logger.opt(exception=True).warning("Config read failed, keeping current vision client")
-                return self._client
-            raise
+    if old_client is not None:
+        logger.info("{} rebuilt for org '{}': model={}", label, org_slug, model)
+        with contextlib.suppress(RuntimeError):
+            asyncio.get_running_loop().create_task(_safe_close_client(old_client))
+    else:
+        logger.info("{} created for org '{}': model={}", label, org_slug, model)
 
-        if config == self._client_config and self._client is not None:
-            return self._client
+    return new_client
 
-        base_url, api_key, _model = config
 
-        # Never downgrade to empty key
-        if not api_key and self._client is not None:
-            logger.warning("Skipping vision client reload: new config has empty API key")
-            return self._client
+class VisionClient:
+    """Async client for OpenAI Vision API calls.
 
-        old = self._client
-        self._client = AsyncOpenAI(api_key=api_key, base_url=base_url, timeout=120.0)
-        self._client_config = config
+    Stateless wrapper: per-org AsyncOpenAI instances live in the
+    module-level `_vision_states` dict, looked up on every call via
+    `get_active_org()`. This prevents the previous singleton from
+    handing org A's client to org B's request inside the TTL window.
+    """
 
-        if old is not None:
-            logger.info("Vision client rebuilt: model={}", _model)
-            with contextlib.suppress(RuntimeError):
-                asyncio.get_running_loop().create_task(_safe_close_client(old))
-        return self._client
+    def _get_client(self) -> AsyncOpenAI:
+        return _get_or_build_client(
+            _vision_states,
+            get_active_org(),
+            settings.get_vision_config,
+            timeout=120.0,
+            label="vision client",
+        )
 
     async def ocr_image(
         self,
@@ -370,13 +443,18 @@ async def process_pages_with_llm(
 
     logger.info(f"LLM processing: {total_chars} chars total, chunking at {max_chars_per_chunk} chars")
 
-    base_url, api_key, chat_model = settings.get_chat_config(get_active_org())
-    client = AsyncOpenAI(
-        api_key=api_key,
-        base_url=base_url,
+    org_slug = get_active_org()
+    client = _get_or_build_client(
+        _chat_states,
+        org_slug,
+        settings.get_chat_config,
         timeout=300.0,
+        label="chat client",
     )
-    resolved_model = model or chat_model
+    # `resolved_model` is read from the freshly-cached config to ensure it
+    # matches the client we just got back from the per-org cache.
+    cached_chat_model = _chat_states[org_slug].config[2]
+    resolved_model = model or cached_chat_model
     semaphore = asyncio.Semaphore(max_concurrent)
 
     chunks = _chunk_by_chars(full_text, max_chars_per_chunk)
diff --git a/services/crawler/tests/conftest.py b/services/crawler/tests/conftest.py
index 55be3f34c3..4e700ec368 100644
--- a/services/crawler/tests/conftest.py
+++ b/services/crawler/tests/conftest.py
@@ -26,4 +26,22 @@ def _bind_test_active_org() -> Iterator[None]:
         _active_org.reset(token)
 
 
+@pytest.fixture(autouse=True)
+def _reset_vision_state() -> Iterator[None]:
+    """Clear per-org AsyncOpenAI caches in vision/openai_client so a
+    mock patched in test A does not get reused by test B."""
+    from app.services.vision.cache import llm_cache
+    from app.services.vision.openai_client import _chat_states, _vision_states
+
+    _chat_states.clear()
+    _vision_states.clear()
+    llm_cache.clear()
+    try:
+        yield
+    finally:
+        _chat_states.clear()
+        _vision_states.clear()
+        llm_cache.clear()
+
+
 __all__ = ["_bind_test_active_org", "set_active_org"]
diff --git a/services/crawler/tests/test_vision_isolation.py b/services/crawler/tests/test_vision_isolation.py
new file mode 100644
index 0000000000..36945bc3cc
--- /dev/null
+++ b/services/crawler/tests/test_vision_isolation.py
@@ -0,0 +1,174 @@
+"""Cross-org isolation for the vision pipeline.
+
+Two regression suites:
+
+1. `VisionClient._get_client` (and the chat-config variant used by
+   `process_pages_with_llm`) must NOT reuse another org's
+   `AsyncOpenAI` instance. Earlier code held a single module-level
+   client + config tuple, so within a 15s TTL org B's request would
+   reuse org A's API key + base_url.
+
+2. `llm_cache` (OCR / description / LLM) must NOT serve org A's
+   cached output to org B. Earlier code keyed the cache by
+   `sha256(content)` only.
+
+These tests bypass the autouse `test-org` binding via `set_active_org`
+to simulate two distinct orgs landing on the same shared crawler
+process.
+"""
+
+from __future__ import annotations
+
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+from app.org_context import set_active_org
+from app.services.vision.cache import llm_cache
+from app.services.vision.openai_client import (
+    VisionClient,
+    _chat_states,
+    _vision_states,
+    process_pages_with_llm,
+)
+
+
+class TestVisionClientPerOrg:
+    @patch("app.services.vision.openai_client.settings")
+    @patch("app.services.vision.openai_client.AsyncOpenAI")
+    def test_two_orgs_get_separate_clients(self, mock_openai_cls: MagicMock, mock_settings: MagicMock) -> None:
+        # Each org sees its own provider config.
+        configs = {
+            "org-a": ("https://a.example", "key-a", "model-a"),
+            "org-b": ("https://b.example", "key-b", "model-b"),
+        }
+        mock_settings.get_vision_config.side_effect = lambda slug: configs[slug]
+
+        # Two distinct AsyncOpenAI instances on the two constructor calls.
+        client_a = MagicMock(name="client_a")
+        client_b = MagicMock(name="client_b")
+        mock_openai_cls.side_effect = [client_a, client_b]
+
+        client = VisionClient()
+
+        set_active_org("org-a")
+        first = client._get_client()
+        set_active_org("org-b")
+        second = client._get_client()
+
+        assert first is client_a
+        assert second is client_b
+        assert _vision_states["org-a"].client is client_a
+        assert _vision_states["org-b"].client is client_b
+
+        # The constructor was called with each org's own api_key — proving
+        # the singleton-reuse leak is gone.
+        kwargs_seen = [call.kwargs for call in mock_openai_cls.call_args_list]
+        assert {kw["api_key"] for kw in kwargs_seen} == {"key-a", "key-b"}
+
+    @patch("app.services.vision.openai_client.settings")
+    @patch("app.services.vision.openai_client.AsyncOpenAI")
+    def test_org_a_request_does_not_get_org_b_client_within_ttl(
+        self, mock_openai_cls: MagicMock, mock_settings: MagicMock
+    ) -> None:
+        configs = {
+            "org-a": ("https://a.example", "key-a", "model-a"),
+            "org-b": ("https://b.example", "key-b", "model-b"),
+        }
+        mock_settings.get_vision_config.side_effect = lambda slug: configs[slug]
+
+        client_a = MagicMock(name="client_a")
+        client_b = MagicMock(name="client_b")
+        mock_openai_cls.side_effect = [client_a, client_b]
+
+        client = VisionClient()
+        set_active_org("org-a")
+        client._get_client()
+        # Org B in the same process, even right after org A — must build
+        # its own client, not reuse the cached one.
+        set_active_org("org-b")
+        result = client._get_client()
+        assert result is client_b
+
+
+class TestProcessPagesWithLlmPerOrg:
+    @pytest.mark.asyncio
+    @patch("app.services.vision.openai_client.settings")
+    @patch("app.services.vision.openai_client.AsyncOpenAI")
+    async def test_two_orgs_each_build_their_own_chat_client(
+        self, mock_openai_cls: MagicMock, mock_settings: MagicMock
+    ) -> None:
+        configs = {
+            "org-a": ("https://a.example", "key-a", "chat-a"),
+            "org-b": ("https://b.example", "key-b", "chat-b"),
+        }
+        mock_settings.get_chat_config.side_effect = lambda slug: configs[slug]
+
+        # Two distinct AsyncOpenAI instances; each one returns a tiny
+        # canned chat completion.
+        def make_client(label: str) -> MagicMock:
+            client = AsyncMock(name=f"client_{label}")
+            response = MagicMock()
+            response.choices = [MagicMock()]
+            response.choices[0].message.content = f"out-{label}"
+            response.usage = None
+            client.chat.completions.create = AsyncMock(return_value=response)
+            return client
+
+        client_a = make_client("a")
+        client_b = make_client("b")
+        mock_openai_cls.side_effect = [client_a, client_b]
+
+        set_active_org("org-a")
+        out_a = await process_pages_with_llm(["hello"], "extract")
+        set_active_org("org-b")
+        out_b = await process_pages_with_llm(["hello"], "extract")
+
+        assert out_a == ["out-a"]
+        assert out_b == ["out-b"]
+        # Each org built its own AsyncOpenAI with its own api_key.
+        api_keys = [call.kwargs["api_key"] for call in mock_openai_cls.call_args_list]
+        assert set(api_keys) == {"key-a", "key-b"}
+        assert _chat_states["org-a"].client is client_a
+        assert _chat_states["org-b"].client is client_b
+
+
+class TestLlmCacheOrgIsolation:
+    def test_ocr_cache_miss_across_orgs(self) -> None:
+        image = b"PNG-like-bytes"
+        set_active_org("org-a")
+        _, hash_a = llm_cache.get_ocr(image)
+        llm_cache.set_ocr(hash_a, "text from A")
+        assert llm_cache.get_ocr(image)[0] == "text from A"
+
+        # Same image bytes, different org: must miss.
+        set_active_org("org-b")
+        cached, hash_b = llm_cache.get_ocr(image)
+        assert cached is None
+        assert hash_a != hash_b
+
+    def test_description_cache_miss_across_orgs(self) -> None:
+        image = b"another-image"
+        set_active_org("org-a")
+        _, hash_a = llm_cache.get_description(image)
+        llm_cache.set_description(hash_a, "desc from A")
+        assert llm_cache.get_description(image)[0] == "desc from A"
+
+        set_active_org("org-b")
+        cached, _ = llm_cache.get_description(image)
+        assert cached is None
+
+    def test_llm_cache_miss_across_orgs(self) -> None:
+        set_active_org("org-a")
+        llm_cache.set_llm("shared-key", "result A")
+        assert llm_cache.get_llm("shared-key") == "result A"
+
+        set_active_org("org-b")
+        assert llm_cache.get_llm("shared-key") is None
+        llm_cache.set_llm("shared-key", "result B")
+        # Org B's value, not org A's.
+        assert llm_cache.get_llm("shared-key") == "result B"
+
+        # And org A still sees its own value.
+        set_active_org("org-a")
+        assert llm_cache.get_llm("shared-key") == "result A"
diff --git a/services/platform/app/features/organization/components/organization-form.tsx b/services/platform/app/features/organization/components/organization-form.tsx
index 559c2a1a7b..0dc1d23938 100644
--- a/services/platform/app/features/organization/components/organization-form.tsx
+++ b/services/platform/app/features/organization/components/organization-form.tsx
@@ -26,6 +26,25 @@ import { useInitializeDefaultWorkflows } from '../hooks/actions';
 
 type FormData = { name: string };
 
+/**
+ * Derive the on-disk slug from a free-form display name.
+ *
+ * Three call sites used to inline the same chain; the helper keeps the
+ * derivation rule in one place so the live preview, the Zod refine,
+ * and the submit payload can never drift.
+ *
+ * Must produce a slug that matches
+ * `services/platform/lib/shared/constants/org-slug.ts` ORG_SLUG_REGEX —
+ * see `assertValidOrgSlug`.
+ */
+function deriveOrgSlug(name: string): string {
+  return name
+    .trim()
+    .toLowerCase()
+    .replace(/[^a-z0-9]+/g, '-')
+    .replace(/^-+|-+$/g, '');
+}
+
 export function OrganizationForm() {
   const navigate = useNavigate();
   const queryClient = useQueryClient();
@@ -36,11 +55,11 @@ export function OrganizationForm() {
   const { t } = useT('settings');
   const { t: tCommon } = useT('common');
 
-  // slug is derived from name via lowercasing + replacing non-alphanumerics
-  // with hyphens; it's used as a filesystem path component (/examples/{slug}/)
-  // and must match file_io.ts ORG_SLUG_REGEX: /^[a-z0-9][a-z0-9_-]*$/.
-  // So the name must contain at least one ASCII letter or digit; pure-CJK or
-  // pure-symbol names would produce an empty slug and fail at creation.
+  // slug is derived from name via `deriveOrgSlug`; it's used as a
+  // filesystem path component (`$TALE_CONFIG_DIR/<slug>/...`) and
+  // must match the canonical ORG_SLUG_REGEX. Pure-CJK / pure-symbol
+  // names would produce an empty slug and fail at creation; the
+  // regex check below rejects them up front.
   const formSchema = useMemo(
     () =>
       z.object({
@@ -49,19 +68,11 @@ export function OrganizationForm() {
           .min(1, t('organization.companyNameRequired'))
           .regex(
             /^[A-Za-z0-9][A-Za-z0-9 _-]*$/,
-            'Use letters, digits, spaces, hyphens, and underscores only, starting with a letter or digit.',
+            t('organization.companyNameCharacterError'),
           )
-          .refine(
-            (name) => {
-              const derived = name
-                .trim()
-                .toLowerCase()
-                .replace(/[^a-z0-9]+/g, '-')
-                .replace(/^-+|-+$/g, '');
-              return !isReservedOrgSlug(derived);
-            },
-            { message: 'This name is reserved by the platform.' },
-          ),
+          .refine((name) => !isReservedOrgSlug(deriveOrgSlug(name)), {
+            message: t('organization.nameReserved'),
+          }),
       }),
     [t],
   );
@@ -75,11 +86,7 @@ export function OrganizationForm() {
   });
 
   const nameValue = form.watch('name');
-  const slugPreview = nameValue
-    .trim()
-    .toLowerCase()
-    .replace(/[^a-z0-9]+/g, '-')
-    .replace(/^-+|-+$/g, '');
+  const slugPreview = deriveOrgSlug(nameValue);
 
   const { mutateAsync: initializeDefaultWorkflows } =
     useInitializeDefaultWorkflows();
@@ -90,11 +97,7 @@ export function OrganizationForm() {
     }
 
     try {
-      const slug = data.name
-        .trim()
-        .toLowerCase()
-        .replace(/[^a-z0-9]+/g, '-')
-        .replace(/^-+|-+$/g, '');
+      const slug = deriveOrgSlug(data.name);
 
       const result = await authClient.organization.create({
         name: data.name.trim(),
@@ -165,7 +168,9 @@ export function OrganizationForm() {
               disabled={form.formState.isSubmitting}
               errorMessage={form.formState.errors.name?.message}
               description={
-                slugPreview ? `Identifier: ${slugPreview}` : undefined
+                slugPreview
+                  ? t('organization.identifierPreview', { slug: slugPreview })
+                  : undefined
               }
             />
             <Button
diff --git a/services/platform/app/features/settings/governance/hooks/mutations.ts b/services/platform/app/features/settings/governance/hooks/mutations.ts
index 601f894769..367c49268b 100644
--- a/services/platform/app/features/settings/governance/hooks/mutations.ts
+++ b/services/platform/app/features/settings/governance/hooks/mutations.ts
@@ -21,7 +21,7 @@ export function useCancelPendingDsarPolicyChange() {
 /**
  * Retention is the one policy type that can't go through the generic
  * `upsertPolicy` mutation: bounds validation needs to read the per-org
- * file at `$TALE_CONFIG_DIR/retention/{orgSlug}.json`, which only the
+ * file at `$TALE_CONFIG_DIR/<orgSlug>/retention.json`, which only the
  * Node-side action layer can do. The V8 action wrapper validates and
  * then calls an internal mutation for the actual write.
  */
diff --git a/services/platform/convex/agent_tools/documents/fetch_document_comparison.test.ts b/services/platform/convex/agent_tools/documents/fetch_document_comparison.test.ts
index 1e880b455e..632c1513a3 100644
--- a/services/platform/convex/agent_tools/documents/fetch_document_comparison.test.ts
+++ b/services/platform/convex/agent_tools/documents/fetch_document_comparison.test.ts
@@ -205,7 +205,7 @@ describe('fetchDocumentComparison', () => {
     );
   });
 
-  it('throws on RAG 404', async () => {
+  it('throws UpstreamHttpError "not found" on RAG 404', async () => {
     globalThis.fetch = Object.assign(
       vi
         .fn()
@@ -213,12 +213,14 @@ describe('fetchDocumentComparison', () => {
       { preconnect: vi.fn() },
     );
 
+    // safeMessageFor maps 404 to a "returned not found" summary; the
+    // upstream body lives only on `.bodySnippet`.
     await expect(
       fetchDocumentComparison(BASE_FILE_ID, COMP_FILE_ID),
-    ).rejects.toThrow('Document not found during comparison');
+    ).rejects.toThrow(/not found/);
   });
 
-  it('throws on RAG 400', async () => {
+  it('throws UpstreamHttpError with HTTP 400 summary on RAG 400', async () => {
     globalThis.fetch = Object.assign(
       vi
         .fn()
@@ -230,7 +232,7 @@ describe('fetchDocumentComparison', () => {
 
     await expect(
       fetchDocumentComparison(BASE_FILE_ID, COMP_FILE_ID),
-    ).rejects.toThrow('Invalid comparison request');
+    ).rejects.toThrow(/HTTP 400/);
   });
 
   it('throws with status on RAG 500', async () => {
diff --git a/services/platform/convex/agent_tools/documents/fetch_document_content.test.ts b/services/platform/convex/agent_tools/documents/fetch_document_content.test.ts
index a5282d378a..5e9475c278 100644
--- a/services/platform/convex/agent_tools/documents/fetch_document_content.test.ts
+++ b/services/platform/convex/agent_tools/documents/fetch_document_content.test.ts
@@ -202,13 +202,23 @@ describe('fetchDocumentContent', () => {
     await expect(fetchDocumentContent(FILE_ID)).rejects.toThrow(/HTTP 500/);
   });
 
-  it('includes error body text in non-ok error message', async () => {
+  it('throws an UpstreamHttpError shaped from the response on non-ok', async () => {
     globalThis.fetch = Object.assign(
       vi.fn().mockResolvedValue(new Response('Rate limited', { status: 429 })),
       { preconnect: vi.fn() },
     );
 
-    await expect(fetchDocumentContent(FILE_ID)).rejects.toThrow('Rate limited');
+    // `.message` carries the safe summary (status + endpoint); the
+    // raw "Rate limited" body now lives only on `.bodySnippet`.
+    const err = await fetchDocumentContent(FILE_ID).then(
+      () => null,
+      (e: unknown) => e,
+    );
+    expect(err).toBeInstanceOf(Error);
+    expect((err as Error).message).toMatch(/HTTP 429|throttling/);
+    expect((err as { bodySnippet?: string }).bodySnippet).toMatch(
+      /Rate limited/,
+    );
   });
 
   it('wraps non-JSON response parse error', async () => {
diff --git a/services/platform/convex/agent_tools/documents/helpers/fetch_document_comparison.ts b/services/platform/convex/agent_tools/documents/helpers/fetch_document_comparison.ts
index abe10b45c2..e62e18fdd1 100644
--- a/services/platform/convex/agent_tools/documents/helpers/fetch_document_comparison.ts
+++ b/services/platform/convex/agent_tools/documents/helpers/fetch_document_comparison.ts
@@ -130,18 +130,11 @@ export async function fetchDocumentComparison(
       timeoutMs: FETCH_TIMEOUT_MS,
     });
 
-    if (response.status === 404) {
-      const errorText = await response.text().catch(() => '');
-      throw new Error(
-        `Document not found during comparison: ${errorText || 'unknown document'}`,
-      );
-    }
-
-    if (response.status === 400) {
-      const errorText = await response.text().catch(() => '');
-      throw new Error(`Invalid comparison request: ${errorText}`);
-    }
-
+    // All non-2xx paths now route through UpstreamHttpError so the
+    // (potentially body-embedded) upstream error text gets sanitized
+    // and truncated. The status-specific messaging is already encoded
+    // in `safeMessageFor` (404 → "returned not found", 4xx → "returned
+    // HTTP …", 5xx → "is unavailable").
     if (!response.ok) {
       const errorText = await response.text().catch(() => '');
       throw UpstreamHttpError.fromResponse(
diff --git a/services/platform/convex/agent_tools/rag/query_rag_context.ts b/services/platform/convex/agent_tools/rag/query_rag_context.ts
index ea10b46eb6..23dd44321e 100644
--- a/services/platform/convex/agent_tools/rag/query_rag_context.ts
+++ b/services/platform/convex/agent_tools/rag/query_rag_context.ts
@@ -11,6 +11,7 @@
 
 import { fetchJson } from '../../../lib/utils/type-cast-helpers';
 import { createDebugLog } from '../../lib/debug_log';
+import { UpstreamHttpError } from '../../lib/errors/upstream_http_error';
 import { getRagConfig, ragFetch } from '../../lib/helpers/rag_config';
 import {
   extractCitationsFromSearchResults,
@@ -133,7 +134,8 @@ export interface RagContextOptions {
   /**
    * Org slug for the X-Tale-Org header. Required by the RAG service's
    * `/api/v1/search` endpoint (it picks the org's provider catalog to
-   * embed the query). Omitting will yield HTTP 400.
+   * embed the query). Empty / missing yields HTTP 400 from RAG and is
+   * surfaced (not silently swallowed) so the caller sees the bug.
    */
   orgSlug: string;
 }
@@ -156,7 +158,11 @@ export async function queryRagContext(
   similarityThreshold: number = DEFAULT_SIMILARITY_THRESHOLD,
   signal?: AbortSignal,
   recentMessages?: RecentMessage[],
-  options?: RagContextOptions,
+  // Required: callers must always pass `orgSlug` (and usually fileIds).
+  // Previously this was `options?: RagContextOptions`, which made the
+  // declared-required `orgSlug` field reachable as `undefined` at
+  // runtime — a type-vs-runtime mismatch that this signature fixes.
+  options: RagContextOptions = { orgSlug: '' },
 ): Promise<RagContextResult | undefined> {
   try {
     const ragServiceUrl = getRagConfig().serviceUrl;
@@ -188,7 +194,7 @@ export async function queryRagContext(
         include_metadata: true,
       };
 
-      if (!options?.fileIds || options.fileIds.length === 0) {
+      if (!options.fileIds || options.fileIds.length === 0) {
         debugLog('No file IDs provided, skipping RAG query');
         // Without this, the controller fires `controller.abort()`
         // RAG_REQUEST_TIMEOUT_MS later against no in-flight fetch — a
@@ -203,19 +209,35 @@ export async function queryRagContext(
         method: 'POST',
         headers: { 'Content-Type': 'application/json' },
         body: JSON.stringify(requestPayload),
-        orgSlug: options?.orgSlug,
+        orgSlug: options.orgSlug,
         signal: fetchSignal,
       });
 
       clearTimeout(timeoutId);
 
       if (!response.ok) {
-        const errorText = await response.text();
-        console.error('[rag_query] RAG service error', {
+        const errorText = await response.text().catch(() => '');
+        // 4xx is a caller/config bug: missing or bad X-Tale-Org,
+        // schema-rejected query, etc. Surfacing as a thrown
+        // UpstreamHttpError gives the agent runtime a clear signal
+        // (and prevents the agent from treating "auth misconfigured"
+        // as "knowledge base is empty"). 5xx remains silent fallback
+        // — RAG outage shouldn't break chat completely.
+        if (response.status >= 400 && response.status < 500) {
+          throw UpstreamHttpError.fromResponse(
+            'rag',
+            response,
+            errorText,
+            '/api/v1/search',
+          );
+        }
+        console.error('[rag_query] RAG service unavailable', {
           status: response.status,
+          // errorText logged engineer-side only; caller gets a graceful
+          // empty-context return so chat continues without RAG.
           error: errorText,
         });
-        return undefined; // Gracefully degrade if RAG is unavailable
+        return undefined;
       }
 
       const result = await fetchJson<SearchResponse>(response);
diff --git a/services/platform/convex/agent_tools/rag/rag_search_tool.ts b/services/platform/convex/agent_tools/rag/rag_search_tool.ts
index aa0e834aa4..e427d576e8 100644
--- a/services/platform/convex/agent_tools/rag/rag_search_tool.ts
+++ b/services/platform/convex/agent_tools/rag/rag_search_tool.ts
@@ -21,7 +21,10 @@ import { fetchJson } from '../../../lib/utils/type-cast-helpers';
 import { internal } from '../../_generated/api';
 import { stripReservedPromptTags } from '../../lib/agent_response/sanitize_prompt';
 import { createDebugLog } from '../../lib/debug_log';
-import { UpstreamHttpError } from '../../lib/errors/upstream_http_error';
+import {
+  isUpstreamHttpError,
+  UpstreamHttpError,
+} from '../../lib/errors/upstream_http_error';
 import { orgSlugFromId } from '../../lib/helpers/org_slug';
 import { ragFetch } from '../../lib/helpers/rag_config';
 import { toId } from '../../lib/type_cast_helpers';
@@ -575,6 +578,23 @@ RESPONSE (list_indexed):
           }),
         };
       } catch (error) {
+        // Mirror the retrieve path (see line 297) — return the safe
+        // summary instead of throwing so the agent can recover with a
+        // user-presentable message. Throwing here used to propagate
+        // `error.message` (which once contained the unsanitized body
+        // snippet) into the agent runtime and onward to the UI toast.
+        if (isUpstreamHttpError(error)) {
+          console.error('[tool:rag_search] upstream error', {
+            query: args.query,
+            status: error.status,
+            endpoint: error.endpoint,
+            safeMessage: error.safeMessage,
+            // Engineer-only: include the scrubbed body excerpt in logs
+            // so triage still has the upstream's reason.
+            bodySnippet: error.bodySnippet,
+          });
+          return { success: false, response: error.safeMessage };
+        }
         console.error('[tool:rag_search] error', {
           query: args.query,
           error: error instanceof Error ? error.message : String(error),
diff --git a/services/platform/convex/agent_tools/web/helpers/fetch_and_extract.ts b/services/platform/convex/agent_tools/web/helpers/fetch_and_extract.ts
index 4e63e05221..9b497a024c 100644
--- a/services/platform/convex/agent_tools/web/helpers/fetch_and_extract.ts
+++ b/services/platform/convex/agent_tools/web/helpers/fetch_and_extract.ts
@@ -9,6 +9,7 @@ import type { ToolCtx } from '@convex-dev/agent';
 
 import { fetchJson } from '../../../../lib/utils/type-cast-helpers';
 import { createDebugLog } from '../../../lib/debug_log';
+import { UpstreamHttpError } from '../../../lib/errors/upstream_http_error';
 import { orgSlugFromId } from '../../../lib/helpers/org_slug';
 import { getCrawlerServiceUrl } from './get_crawler_service_url';
 import type { WebFetchUrlResult, WebFetchExtractApiResponse } from './types';
@@ -30,7 +31,6 @@ export async function fetchAndExtract(
   if (!ctx.organizationId) {
     throw new Error('fetch_and_extract requires organizationId in ToolCtx.');
   }
-  const orgSlug = await orgSlugFromId(ctx, ctx.organizationId);
 
   debugLog('tool:web:fetch_and_extract start', {
     url: args.url,
@@ -38,6 +38,12 @@ export async function fetchAndExtract(
   });
 
   try {
+    // Resolve the slug INSIDE the try so a lookup failure folds into
+    // the same `{ success: false, error }` shape every other failure
+    // path returns. Earlier this happened outside the try, which
+    // threw raw Error past the tool's contract.
+    const orgSlug = await orgSlugFromId(ctx, ctx.organizationId);
+
     const controller = new AbortController();
     const timeoutId = setTimeout(() => controller.abort(), 300_000);
 
@@ -58,8 +64,13 @@ export async function fetchAndExtract(
     clearTimeout(timeoutId);
 
     if (!response.ok) {
-      const errorText = await response.text();
-      throw new Error(`Crawler service error: ${response.status} ${errorText}`);
+      const errorText = await response.text().catch(() => '');
+      throw UpstreamHttpError.fromResponse(
+        'crawler',
+        response,
+        errorText,
+        '/api/v1/web/fetch-and-extract',
+      );
     }
 
     const result = await fetchJson<WebFetchExtractApiResponse>(response);
diff --git a/services/platform/convex/agent_tools/web/helpers/query_web_context.ts b/services/platform/convex/agent_tools/web/helpers/query_web_context.ts
index ae155c7e52..3bf2a75905 100644
--- a/services/platform/convex/agent_tools/web/helpers/query_web_context.ts
+++ b/services/platform/convex/agent_tools/web/helpers/query_web_context.ts
@@ -64,8 +64,11 @@ export async function queryWebContext(
   query: string,
   limit = DEFAULT_LIMIT,
 ): Promise<WebContextResult | undefined> {
-  const orgSlug = await orgSlugFromId(ctx, organizationId);
   try {
+    // Resolve the slug INSIDE the try so an org-lookup failure folds
+    // into the documented `undefined`-on-failure contract instead of
+    // throwing past the caller (`generate_response.ts`).
+    const orgSlug = await orgSlugFromId(ctx, organizationId);
     debugLog('Querying web context', {
       query: query.slice(0, 100),
       limit,
diff --git a/services/platform/convex/agent_tools/web/helpers/search_pages.ts b/services/platform/convex/agent_tools/web/helpers/search_pages.ts
index 24cb608dda..78ba77e891 100644
--- a/services/platform/convex/agent_tools/web/helpers/search_pages.ts
+++ b/services/platform/convex/agent_tools/web/helpers/search_pages.ts
@@ -9,6 +9,7 @@ import type { ToolCtx } from '@convex-dev/agent';
 
 import { internal } from '../../../_generated/api';
 import { createDebugLog } from '../../../lib/debug_log';
+import { UpstreamHttpError } from '../../../lib/errors/upstream_http_error';
 import { orgSlugFromId } from '../../../lib/helpers/org_slug';
 import { formatWebResults } from './format_web_results';
 import { formatWebsiteSummaries } from './format_website_summaries';
@@ -65,7 +66,13 @@ async function fetchSearch(
   });
 
   if (!response.ok) {
-    throw new Error(`Search API returned ${response.status}`);
+    const errorText = await response.text().catch(() => '');
+    throw UpstreamHttpError.fromResponse(
+      'crawler',
+      response,
+      errorText,
+      endpoint.replace(crawlerUrl, ''),
+    );
   }
 
   return response.json();
@@ -141,9 +148,9 @@ export async function searchPages(
   if (!results || results.length === 0) {
     debugLog('web:search_pages no results', { query: args.query });
 
-    const summaryText = ctx.organizationId
-      ? await formatWebsiteSummaries(ctx, ctx.organizationId)
-      : undefined;
+    // `ctx.organizationId` was already asserted truthy at line 123
+    // (the throw above), so the ternary's `: undefined` arm is dead.
+    const summaryText = await formatWebsiteSummaries(ctx, ctx.organizationId);
 
     if (summaryText) {
       return {
@@ -195,9 +202,8 @@ export async function searchPages(
   }));
 
   if (domainFallback) {
-    const summaryText = ctx.organizationId
-      ? await formatWebsiteSummaries(ctx, ctx.organizationId)
-      : undefined;
+    // ctx.organizationId is asserted truthy at line 123 — dead ternary removed.
+    const summaryText = await formatWebsiteSummaries(ctx, ctx.organizationId);
     const availableNote = summaryText
       ? `\n\nAvailable websites in the knowledge base:\n${summaryText}`
       : '';
diff --git a/services/platform/convex/agents/file_utils.ts b/services/platform/convex/agents/file_utils.ts
index b6ea26927c..08269e5801 100644
--- a/services/platform/convex/agents/file_utils.ts
+++ b/services/platform/convex/agents/file_utils.ts
@@ -13,7 +13,13 @@ import {
   agentJsonSchema,
   type SkillBindingResolvedEntry,
 } from '../../lib/shared/schemas/agents';
-import { serializeJson, sha256, validateOrgSlug } from '../lib/file_io';
+import {
+  getConfigRoot,
+  safeJoinWithinDir,
+  serializeJson,
+  sha256,
+  validateOrgSlug,
+} from '../lib/file_io';
 import { validateAgentName } from './validators';
 
 export { sha256, validateAgentName };
@@ -125,21 +131,11 @@ export function parseAgentJson(content: string): AgentJsonConfig {
   return result.data;
 }
 
-function getConfigRoot(): string {
-  const configDir = process.env.TALE_CONFIG_DIR;
-  if (configDir) return configDir;
-  throw new Error(
-    'TALE_CONFIG_DIR environment variable is not set. ' +
-      'Set it to the root config directory ' +
-      '(e.g., TALE_CONFIG_DIR=/path/to/tale/examples).',
-  );
-}
-
 export function resolveAgentsDir(orgSlug: string): string {
   if (!validateOrgSlug(orgSlug)) {
     throw new Error(`Invalid org slug: ${orgSlug}`);
   }
-  return path.join(getConfigRoot(), orgSlug, 'agents');
+  return path.join(getConfigRoot('agents'), orgSlug, 'agents');
 }
 
 export function resolveAgentFilePath(
@@ -149,16 +145,7 @@ export function resolveAgentFilePath(
   if (!validateAgentName(agentName)) {
     throw new Error(`Invalid agent name: ${agentName}`);
   }
-  const dir = resolveAgentsDir(orgSlug);
-  const resolved = path.resolve(dir, `${agentName}.json`);
-  const expectedPrefix = path.resolve(dir);
-  if (
-    !resolved.startsWith(expectedPrefix + path.sep) &&
-    resolved !== expectedPrefix
-  ) {
-    throw new Error(`Path traversal detected: ${agentName}`);
-  }
-  return resolved;
+  return safeJoinWithinDir(resolveAgentsDir(orgSlug), `${agentName}.json`);
 }
 
 export function resolveHistoryDir(orgSlug: string, agentName: string): string {
diff --git a/services/platform/convex/auth.ts b/services/platform/convex/auth.ts
index 37b5102648..8158ba349b 100644
--- a/services/platform/convex/auth.ts
+++ b/services/platform/convex/auth.ts
@@ -12,6 +12,7 @@ import {
   ownerAc,
 } from 'better-auth/plugins/organization/access';
 
+import { assertValidOrgSlug } from '../lib/shared/constants/org-slug';
 import { isReservedOrgSlug } from '../lib/shared/constants/reserved-org-slugs';
 import { isRecord, getString } from '../lib/utils/type-guards';
 import { components, internal } from './_generated/api';
@@ -576,13 +577,24 @@ export const getAuthOptions = (ctx: GenericCtx<DataModel>) => {
           beforeCreateOrganization: async (data) => {
             const slug = data.organization.slug;
             if (!slug) return;
+            // Normalize to lowercase BEFORE both the reservation and
+            // uniqueness checks. Convex `eq` is byte-equal, so without
+            // normalization a caller could pass `Default` to bypass
+            // the reservation set (which lowercases) while also
+            // bypassing the unique-slug `eq` lookup (case-sensitive).
+            const normalizedSlug = slug.toLowerCase();
+            // Reject anything that doesn't fit the canonical slug shape
+            // so users can't smuggle invalid filesystem characters or
+            // length-cap-busting strings past the auth boundary.
+            assertValidOrgSlug(normalizedSlug);
+
             // Refuse reserved slugs ("default") that the platform pins
             // global resources to (branding, retention defaults).
             // Without this, an open-signup user could claim "default"
             // before the platform seed runs and inherit branding-admin.
             // Exception: the platform's own first-run seed creates
             // `default` when no orgs exist yet — let that one through.
-            if (isReservedOrgSlug(slug)) {
+            if (isReservedOrgSlug(normalizedSlug)) {
               const anyOrg = await ctx.runQuery(
                 components.betterAuth.adapter.findMany,
                 {
@@ -593,7 +605,7 @@ export const getAuthOptions = (ctx: GenericCtx<DataModel>) => {
               );
               if (anyOrg && anyOrg.page.length > 0) {
                 throw new APIError('BAD_REQUEST', {
-                  message: `Organization slug "${slug}" is reserved by the platform.`,
+                  message: `Organization slug "${normalizedSlug}" is reserved by the platform.`,
                 });
               }
             }
@@ -603,14 +615,60 @@ export const getAuthOptions = (ctx: GenericCtx<DataModel>) => {
               components.betterAuth.adapter.findOne,
               {
                 model: 'organization',
-                where: [{ field: 'slug', value: slug, operator: 'eq' }],
+                where: [
+                  { field: 'slug', value: normalizedSlug, operator: 'eq' },
+                ],
               },
             );
             if (existing) {
               throw new APIError('BAD_REQUEST', {
-                message: `Organization slug "${slug}" is already taken.`,
+                message: `Organization slug "${normalizedSlug}" is already taken.`,
+              });
+            }
+            // Project the normalized slug back so the persisted row
+            // matches what the checks just used. If the field is
+            // read-only on `data.organization`, this is a defensive
+            // no-op — callers are still expected to submit lowercase.
+            try {
+              data.organization.slug = normalizedSlug;
+            } catch {
+              /* read-only field — caller-supplied slug stands */
+            }
+          },
+          beforeUpdateOrganization: async (data) => {
+            // Re-run the create-time guards on update: without this
+            // hook, an org owner could rename their org to a reserved
+            // slug after creation and inherit branding-admin. Pulled
+            // through a `Record<string, unknown>` view so the field
+            // shape matches Better Auth's loose update payload type.
+            const orgPatch = data.organization as Record<string, unknown>;
+            const rawSlug = orgPatch.slug;
+            if (typeof rawSlug !== 'string') return;
+            const normalizedSlug = rawSlug.toLowerCase();
+            assertValidOrgSlug(normalizedSlug);
+            if (isReservedOrgSlug(normalizedSlug)) {
+              throw new APIError('BAD_REQUEST', {
+                message: `Organization slug "${normalizedSlug}" is reserved by the platform.`,
+              });
+            }
+            const collision = await ctx.runQuery(
+              components.betterAuth.adapter.findOne,
+              {
+                model: 'organization',
+                where: [
+                  { field: 'slug', value: normalizedSlug, operator: 'eq' },
+                ],
+              },
+            );
+            if (collision) {
+              throw new APIError('BAD_REQUEST', {
+                message: `Organization slug "${normalizedSlug}" is already taken.`,
               });
             }
+            // Project the normalized slug back onto the loose patch
+            // shape; assignment is safe whether or not Better Auth
+            // ends up re-validating it server-side.
+            orgPatch.slug = normalizedSlug;
           },
           afterCreateOrganization: async (data) => {
             const slug = data.organization.slug;
diff --git a/services/platform/convex/branding/file_actions.ts b/services/platform/convex/branding/file_actions.ts
index 5eb6dd94cc..542678ce02 100644
--- a/services/platform/convex/branding/file_actions.ts
+++ b/services/platform/convex/branding/file_actions.ts
@@ -58,11 +58,14 @@ async function requireBrandingAdmin(ctx: ActionCtx): Promise<void> {
   if (!authUser) throw new Error('Unauthenticated');
 
   const trustedData = await getTrustedAuthData(ctx);
-  if (trustedData) {
-    if (!isAdmin(trustedData.trustedRole)) {
-      throw new Error('Only admins can modify branding');
-    }
-    return;
+  // Trusted-headers mode: `trustedRole` is a JWT claim sourced from an
+  // upstream IdP. A user marked as admin in SOME org (or globally)
+  // would previously short-circuit past the per-org membership check
+  // and mutate global branding. Branding is pinned to the `default`
+  // org's admin set, so the trusted-role fast-fail must still defer
+  // to `isCallerAdmin` for the actual membership lookup.
+  if (trustedData && !isAdmin(trustedData.trustedRole)) {
+    throw new Error('Only admins can modify branding');
   }
 
   const isUserAdmin = await ctx.runQuery(
@@ -232,7 +235,9 @@ export const saveImage = action({
     const imagesDir = resolveImagesDir('default');
     await mkdir(imagesDir, { recursive: true });
 
-    // Remove any existing file for this image type (may have different extension)
+    // Remove any existing file for this image type (may have different
+    // extension). Tolerate ENOENT (first-write); log everything else
+    // so permission/IO bugs don't leak stale image files unnoticed.
     try {
       const existing = await readdir(imagesDir);
       for (const entry of existing) {
@@ -240,8 +245,10 @@ export const saveImage = action({
           await unlink(path.join(imagesDir, entry));
         }
       }
-    } catch {
-      // Directory may not exist yet
+    } catch (err) {
+      if (errnoCode(err) !== 'ENOENT') {
+        console.warn(`[saveImage] readdir ${imagesDir} failed:`, err);
+      }
     }
 
     const filePath = resolveImagePath('default', filename);
@@ -271,8 +278,10 @@ export const deleteImage = action({
           await unlink(path.join(imagesDir, entry));
         }
       }
-    } catch {
-      // Directory may not exist
+    } catch (err) {
+      if (errnoCode(err) !== 'ENOENT') {
+        console.warn(`[deleteImage] readdir ${imagesDir} failed:`, err);
+      }
     }
 
     return null;
diff --git a/services/platform/convex/branding/file_utils.ts b/services/platform/convex/branding/file_utils.ts
index f9ce6d5266..da2b009061 100644
--- a/services/platform/convex/branding/file_utils.ts
+++ b/services/platform/convex/branding/file_utils.ts
@@ -13,7 +13,13 @@ import {
   brandingJsonSchema,
   type BrandingJsonConfig,
 } from '../../lib/shared/schemas/branding';
-import { serializeJson, sha256, validateOrgSlug } from '../lib/file_io';
+import {
+  getConfigRoot,
+  safeJoinWithinDir,
+  serializeJson,
+  sha256,
+  validateOrgSlug,
+} from '../lib/file_io';
 
 export type { BrandingJsonConfig };
 
@@ -36,16 +42,6 @@ export type BrandingReadResult =
       message: string;
     };
 
-function getConfigRoot(): string {
-  const configDir = process.env.TALE_CONFIG_DIR;
-  if (configDir) return configDir;
-  throw new Error(
-    'TALE_CONFIG_DIR environment variable is not set. ' +
-      'Set it to the root config directory ' +
-      '(e.g., TALE_CONFIG_DIR=/path/to/tale/examples).',
-  );
-}
-
 /**
  * Resolve the branding directory for an organization. Org-first:
  * `${TALE_CONFIG_DIR}/<orgSlug>/branding/`. Read-side currently hardcodes
@@ -56,20 +52,11 @@ export function resolveBrandingDir(orgSlug: string): string {
   if (!validateOrgSlug(orgSlug)) {
     throw new Error(`Invalid org slug: ${orgSlug}`);
   }
-  return path.join(getConfigRoot(), orgSlug, 'branding');
+  return path.join(getConfigRoot('branding'), orgSlug, 'branding');
 }
 
 export function resolveBrandingFilePath(orgSlug: string): string {
-  const dir = resolveBrandingDir(orgSlug);
-  const resolved = path.resolve(dir, BRANDING_FILE_NAME);
-  const expectedPrefix = path.resolve(dir);
-  if (
-    !resolved.startsWith(expectedPrefix + path.sep) &&
-    resolved !== expectedPrefix
-  ) {
-    throw new Error('Path traversal detected');
-  }
-  return resolved;
+  return safeJoinWithinDir(resolveBrandingDir(orgSlug), BRANDING_FILE_NAME);
 }
 
 export function resolveHistoryDir(orgSlug: string): string {
@@ -135,16 +122,7 @@ export function resolveImagePath(orgSlug: string, filename: string): string {
   if (!validateImageFilename(filename)) {
     throw new Error(`Invalid image filename: ${filename}`);
   }
-  const dir = resolveImagesDir(orgSlug);
-  const resolved = path.resolve(dir, filename);
-  const expectedPrefix = path.resolve(dir);
-  if (
-    !resolved.startsWith(expectedPrefix + path.sep) &&
-    resolved !== expectedPrefix
-  ) {
-    throw new Error('Path traversal detected');
-  }
-  return resolved;
+  return safeJoinWithinDir(resolveImagesDir(orgSlug), filename);
 }
 
 export { ALLOWED_IMAGE_EXTENSIONS, MAX_FILE_SIZE_BYTES, MAX_HISTORY_ENTRIES };
diff --git a/services/platform/convex/branding/internal_queries.ts b/services/platform/convex/branding/internal_queries.ts
index 6c1269ce33..45d2cf5efa 100644
--- a/services/platform/convex/branding/internal_queries.ts
+++ b/services/platform/convex/branding/internal_queries.ts
@@ -59,7 +59,16 @@ export const getLegacyBranding = internalQuery({
       try {
         const url = await ctx.storage.getUrl(storageId);
         return url ? toPublicUrl(url) : null;
-      } catch {
+      } catch (error) {
+        // Symmetry with `getBindingsWithUrls.safeGetUrl` below — surface
+        // the storage-resolve failure at warn level instead of silently
+        // returning null. An empty catch here used to hide stale
+        // storage references that would have been visible in logs.
+        console.warn(
+          '[Branding] legacy storage URL resolve failed',
+          storageId,
+          error,
+        );
         return null;
       }
     }
diff --git a/services/platform/convex/file_metadata/internal_actions.ts b/services/platform/convex/file_metadata/internal_actions.ts
index 2fb5e85fa4..9c62488688 100644
--- a/services/platform/convex/file_metadata/internal_actions.ts
+++ b/services/platform/convex/file_metadata/internal_actions.ts
@@ -7,6 +7,10 @@ import { isRecord, getNumber } from '../../lib/utils/type-guards';
 import { internal } from '../_generated/api';
 import { internalAction } from '../_generated/server';
 import { getCrawlerUrl } from '../documents/generate_document_helpers';
+import {
+  isUpstreamHttpError,
+  UpstreamHttpError,
+} from '../lib/errors/upstream_http_error';
 import { orgSlugFromId } from '../lib/helpers/org_slug';
 import { ragAction } from '../workflow_engine/action_defs/rag/rag_action';
 
@@ -133,8 +137,11 @@ export const extractFileMetadata = internalAction({
 
         if (!metadataResponse.ok) {
           const errorText = await metadataResponse.text().catch(() => '');
-          throw new Error(
-            `Crawler extract-metadata returned ${metadataResponse.status}: ${errorText}`,
+          throw UpstreamHttpError.fromResponse(
+            'crawler',
+            metadataResponse,
+            errorText,
+            `/api/v1/${ext}/extract-metadata`,
           );
         }
 
@@ -187,11 +194,25 @@ export const extractFileMetadata = internalAction({
         }
       } catch (error) {
         const message = error instanceof Error ? error.message : String(error);
+        // Classify the failure: only schedule retries when the upstream
+        // (crawler) reported a status the abstraction marks retryable
+        // (5xx / 408 / 429). 4xx classes (org-slug lookup failure,
+        // missing file, malformed payload) are permanent — retrying
+        // burns scheduler slots without progress.
+        const isRetryable = isUpstreamHttpError(error) && error.retryable;
+        // Non-UpstreamHttpError throws (e.g. network reset, blob fetch
+        // failure before we even hit the crawler) are also assumed
+        // transient — we have no signal otherwise.
+        const isTransient = isRetryable || !isUpstreamHttpError(error);
         console.error(
-          `[extractFileMetadata] Error for file ${args.storageId} (attempt ${attempt}): ${message}`,
+          `[extractFileMetadata] Error for file ${args.storageId} (attempt ${attempt}, transient=${isTransient}): ${message}`,
         );
 
-        if (attempt < EXTRACT_METADATA_RETRY_DELAYS.length) {
+        if (!isTransient) {
+          console.warn(
+            `[extractFileMetadata] Permanent failure for file ${args.storageId}; not retrying: ${message}`,
+          );
+        } else if (attempt < EXTRACT_METADATA_RETRY_DELAYS.length) {
           const retryDelay = EXTRACT_METADATA_RETRY_DELAYS[attempt];
           await ctx.scheduler.runAfter(
             retryDelay,
diff --git a/services/platform/convex/integrations/file_utils.ts b/services/platform/convex/integrations/file_utils.ts
index e9fde32f0e..d541e10136 100644
--- a/services/platform/convex/integrations/file_utils.ts
+++ b/services/platform/convex/integrations/file_utils.ts
@@ -13,7 +13,13 @@ import {
   integrationJsonSchema,
   type IntegrationJsonConfig,
 } from '../../lib/shared/schemas/integrations';
-import { serializeJson, sha256, validateOrgSlug } from '../lib/file_io';
+import {
+  getConfigRoot,
+  safeJoinWithinDir,
+  serializeJson,
+  sha256,
+  validateOrgSlug,
+} from '../lib/file_io';
 
 export { sha256 };
 
@@ -44,16 +50,6 @@ export function validateIntegrationSlug(slug: string): boolean {
   return INTEGRATION_SLUG_REGEX.test(slug);
 }
 
-function getConfigRoot(): string {
-  const configDir = process.env.TALE_CONFIG_DIR;
-  if (configDir) return configDir;
-  throw new Error(
-    'TALE_CONFIG_DIR environment variable is not set. ' +
-      'Set it to the root config directory ' +
-      '(e.g., TALE_CONFIG_DIR=/path/to/tale/examples).',
-  );
-}
-
 /**
  * Resolve the integrations directory for an organization. Org-first:
  * `${TALE_CONFIG_DIR}/<orgSlug>/integrations/`.
@@ -62,7 +58,7 @@ export function resolveIntegrationsDir(orgSlug: string): string {
   if (!validateOrgSlug(orgSlug)) {
     throw new Error(`Invalid org slug: ${orgSlug}`);
   }
-  return path.join(getConfigRoot(), orgSlug, 'integrations');
+  return path.join(getConfigRoot('integrations'), orgSlug, 'integrations');
 }
 
 /**
@@ -72,16 +68,7 @@ export function resolveIntegrationDir(orgSlug: string, slug: string): string {
   if (!validateIntegrationSlug(slug)) {
     throw new Error(`Invalid integration slug: ${slug}`);
   }
-  const dir = resolveIntegrationsDir(orgSlug);
-  const resolved = path.resolve(dir, slug);
-  const expectedPrefix = path.resolve(dir);
-  if (
-    !resolved.startsWith(expectedPrefix + path.sep) &&
-    resolved !== expectedPrefix
-  ) {
-    throw new Error(`Path traversal detected: ${slug}`);
-  }
-  return resolved;
+  return safeJoinWithinDir(resolveIntegrationsDir(orgSlug), slug);
 }
 
 export function resolveConfigPath(orgSlug: string, slug: string): string {
diff --git a/services/platform/convex/lib/config_store/store.ts b/services/platform/convex/lib/config_store/store.ts
index 1cbb51c6af..8a54ea91f7 100644
--- a/services/platform/convex/lib/config_store/store.ts
+++ b/services/platform/convex/lib/config_store/store.ts
@@ -31,7 +31,13 @@ import path from 'node:path';
 
 import type { z } from 'zod/v4';
 
-import { atomicWrite, readJsonFile, validateOrgSlug } from '../file_io';
+import {
+  atomicWrite,
+  getConfigRoot,
+  readJsonFile,
+  safeJoinWithinDir,
+  validateOrgSlug,
+} from '../file_io';
 
 const MAX_FILE_SIZE_BYTES = 256 * 1024;
 
@@ -48,17 +54,19 @@ export interface ConfigStore<T> {
   list(): Promise<Array<{ orgSlug: string }>>;
 }
 
-function getConfigRoot(area: string): string {
-  const configDir = process.env.TALE_CONFIG_DIR;
-  if (!configDir) {
+// Restrict `area` to a flat lowercase identifier so it cannot contain
+// path separators / `..` and silently escape the per-org subdir. This is
+// a factory-time invariant: callers of `createFileConfigStore` hard-code
+// the area string (e.g. `'retention'`), so any failure here is a
+// developer-time misuse, never user-supplied input.
+const AREA_REGEX = /^[a-z][a-z0-9_-]*$/;
+
+function assertValidArea(area: string): void {
+  if (!AREA_REGEX.test(area)) {
     throw new Error(
-      `TALE_CONFIG_DIR environment variable is not set. ` +
-        `Set TALE_CONFIG_DIR in .env to the root config directory ` +
-        `(e.g., TALE_CONFIG_DIR=/path/to/tale/examples) so ${area} ` +
-        `can be resolved.`,
+      `Invalid config_store area "${area}". Must match ${AREA_REGEX.source}.`,
     );
   }
-  return configDir;
 }
 
 function resolveFilePath(area: string, orgSlug: string): string {
@@ -67,16 +75,7 @@ function resolveFilePath(area: string, orgSlug: string): string {
   }
   const root = getConfigRoot(area);
   const dir = path.join(root, orgSlug);
-  const fileName = `${area}.json`;
-  const resolved = path.resolve(dir, fileName);
-  const expectedPrefix = path.resolve(dir);
-  if (
-    !resolved.startsWith(expectedPrefix + path.sep) &&
-    resolved !== expectedPrefix
-  ) {
-    throw new Error(`Path traversal detected: ${orgSlug}`);
-  }
-  return resolved;
+  return safeJoinWithinDir(dir, `${area}.json`);
 }
 
 /**
@@ -88,6 +87,7 @@ export function createFileConfigStore<T>(
   area: string,
   schema: z.ZodType<T>,
 ): ConfigStore<T> {
+  assertValidArea(area);
   const parse = (content: string): T => {
     // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- raw JSON before Zod validation
     const parsed = JSON.parse(content) as unknown;
diff --git a/services/platform/convex/lib/errors/__tests__/upstream_http_error.test.ts b/services/platform/convex/lib/errors/__tests__/upstream_http_error.test.ts
index 1e784fb7f1..c805781c37 100644
--- a/services/platform/convex/lib/errors/__tests__/upstream_http_error.test.ts
+++ b/services/platform/convex/lib/errors/__tests__/upstream_http_error.test.ts
@@ -1,14 +1,31 @@
+import { ConvexError } from 'convex/values';
 import { describe, expect, it } from 'vitest';
 
 import {
   isRetryableStatus,
   isUpstreamHttpError,
+  parseRetryAfterMs,
   UpstreamHttpError,
 } from '../upstream_http_error';
 
-function makeResponse(status: number): Response {
-  // Minimal Response stand-in — UpstreamHttpError.fromResponse only reads `.status`.
-  return new Response(null, { status });
+function makeResponse(
+  status: number,
+  init: { headers?: Record<string, string>; url?: string } = {},
+): Response {
+  // UpstreamHttpError.fromResponse reads .status, .headers, and (when
+  // endpoint is omitted) .url. Response's `url` is read-only on
+  // construction, so simulate it via a thin proxy that lets us override.
+  const res = new Response(null, {
+    status,
+    headers: init.headers,
+  });
+  if (init.url !== undefined) {
+    Object.defineProperty(res, 'url', {
+      value: init.url,
+      configurable: true,
+    });
+  }
+  return res;
 }
 
 describe('UpstreamHttpError', () => {
@@ -24,8 +41,10 @@ describe('UpstreamHttpError', () => {
     expect(err.bodySnippet).not.toMatch(/sk-abcdefgh/);
     expect(err.bodySnippet).not.toMatch(/Bearer\s+sk-/);
     expect(err.bodySnippet).toMatch(/REDACTED/);
-    // Engineer-facing message still embeds the (now-scrubbed) snippet for triage.
-    expect(err.message).toMatch(/REDACTED/);
+    // `.message` carries safeMessage only — snippet stays out so it
+    // does not cross the Convex client boundary as a default toast.
+    expect(err.message).not.toMatch(/REDACTED/);
+    expect(err.message).toBe(err.safeMessage);
     // Safe message is clean of any body content.
     expect(err.safeMessage).not.toMatch(/REDACTED/);
     expect(err.safeMessage).toMatch(/RAG/);
@@ -92,4 +111,85 @@ describe('UpstreamHttpError', () => {
     expect(isUpstreamHttpError(null)).toBe(false);
     expect(isUpstreamHttpError('string')).toBe(false);
   });
+
+  it('produces distinct safeMessage branches for 401 / 403 / 404 / 429', () => {
+    const e401 = UpstreamHttpError.fromResponse(
+      'rag',
+      makeResponse(401),
+      '',
+      '/api/v1/search',
+    );
+    expect(e401.safeMessage).toMatch(/authentication failed/i);
+    expect(e401.safeMessage).toMatch(/401/);
+
+    const e403 = UpstreamHttpError.fromResponse(
+      'rag',
+      makeResponse(403),
+      '',
+      '/api/v1/search',
+    );
+    expect(e403.safeMessage).toMatch(/authentication failed/i);
+    expect(e403.safeMessage).toMatch(/403/);
+
+    const e404 = UpstreamHttpError.fromResponse(
+      'rag',
+      makeResponse(404),
+      '',
+      '/api/v1/docs/123',
+    );
+    expect(e404.safeMessage).toMatch(/not found/i);
+    expect(e404.safeMessage).toMatch(/404/);
+
+    const e429 = UpstreamHttpError.fromResponse(
+      'rag',
+      makeResponse(429),
+      '',
+      '/api/v1/search',
+    );
+    expect(e429.safeMessage).toMatch(/throttling/i);
+    expect(e429.safeMessage).toMatch(/429/);
+  });
+
+  it('parses Retry-After header into retryAfterMs', () => {
+    expect(parseRetryAfterMs('30')).toBe(30000);
+    expect(parseRetryAfterMs('0')).toBe(0);
+    expect(parseRetryAfterMs('')).toBeUndefined();
+    expect(parseRetryAfterMs(null)).toBeUndefined();
+    expect(parseRetryAfterMs('not-a-number')).toBeUndefined();
+
+    const err = UpstreamHttpError.fromResponse(
+      'rag',
+      makeResponse(429, { headers: { 'retry-after': '30' } }),
+      '',
+      '/api/v1/search',
+    );
+    expect(err.retryAfterMs).toBe(30000);
+  });
+
+  it('defaults endpoint to response.url when caller omits it', () => {
+    const err = UpstreamHttpError.fromResponse(
+      'rag',
+      makeResponse(500, { url: 'http://rag/api/v1/search' }),
+      '',
+    );
+    expect(err.endpoint).toBe('http://rag/api/v1/search');
+    expect(err.safeMessage).toContain('http://rag/api/v1/search');
+  });
+
+  it('toConvexError marshals structured fields for the client boundary', () => {
+    const err = UpstreamHttpError.fromResponse(
+      'rag',
+      makeResponse(429, { headers: { 'retry-after': '5' } }),
+      'rate limited',
+      '/api/v1/search',
+    );
+    const cv = err.toConvexError();
+    expect(cv).toBeInstanceOf(ConvexError);
+    expect(cv.data.code).toBe('upstream_http');
+    expect(cv.data.service).toBe('rag');
+    expect(cv.data.status).toBe(429);
+    expect(cv.data.retryable).toBe(true);
+    expect(cv.data.retryAfterMs).toBe(5000);
+    expect(cv.data.safeMessage).toMatch(/429/);
+  });
 });
diff --git a/services/platform/convex/lib/errors/upstream_http_error.ts b/services/platform/convex/lib/errors/upstream_http_error.ts
index 6157044622..7cb3e2b9f9 100644
--- a/services/platform/convex/lib/errors/upstream_http_error.ts
+++ b/services/platform/convex/lib/errors/upstream_http_error.ts
@@ -7,13 +7,21 @@
  *   fragments never reach a thrown Error message.
  * - `retryable` flag derived from status, so callers can decide
  *   without re-parsing the message.
+ * - `Retry-After` parsing — set on `retryAfterMs` when the upstream
+ *   provided one and we should honor it before retrying.
  * - A `safeMessage` field with a user-presentable one-liner that
- *   omits the body snippet entirely; UI surfaces should prefer this.
+ *   omits the body snippet entirely. `.message` equals `.safeMessage`
+ *   so that the default error surfacing across the Convex client
+ *   boundary (which only carries `error.message`) does not leak the
+ *   raw body to UI toasts. Engineers reading server logs should
+ *   inspect `.bodySnippet` for the scrubbed body excerpt.
  *
  * Use the static factory `UpstreamHttpError.fromResponse(...)`; raw
  * `new UpstreamHttpError({...})` is reserved for tests.
  */
 
+import { ConvexError } from 'convex/values';
+
 import { sanitizeError } from '../utils/sanitize_secrets';
 
 export type UpstreamService = 'rag' | 'crawler';
@@ -27,6 +35,7 @@ export interface UpstreamErrorInit {
   bodySnippet: string;
   retryable: boolean;
   safeMessage: string;
+  retryAfterMs?: number;
 }
 
 /** Status codes the platform should retry on (transient upstream). */
@@ -34,13 +43,35 @@ export function isRetryableStatus(status: number): boolean {
   return status === 408 || status === 429 || (status >= 500 && status < 600);
 }
 
+/**
+ * Parse the upstream `Retry-After` header into milliseconds. Supports
+ * both the integer-seconds and HTTP-date forms per RFC 9110 §10.2.3.
+ * Returns `undefined` when the header is missing or unparseable so
+ * callers can fall back to a default backoff.
+ */
+export function parseRetryAfterMs(value: string | null): number | undefined {
+  if (!value) return undefined;
+  const trimmed = value.trim();
+  if (!trimmed) return undefined;
+  const asInt = Number(trimmed);
+  if (Number.isFinite(asInt) && asInt >= 0) {
+    return Math.round(asInt * 1000);
+  }
+  const asDate = Date.parse(trimmed);
+  if (!Number.isNaN(asDate)) {
+    const delta = asDate - Date.now();
+    return delta > 0 ? delta : 0;
+  }
+  return undefined;
+}
+
 function safeMessageFor(
   service: UpstreamService,
   status: number,
   endpoint: string,
 ): string {
   // User-facing summary: never includes body, never includes secrets.
-  // Operators get the full picture from logs + the thrown Error message.
+  // Operators get the full picture from logs + .bodySnippet.
   const where = `${service.toUpperCase()} ${endpoint}`;
   if (status === 401 || status === 403) {
     return `${where} authentication failed (HTTP ${status}).`;
@@ -64,13 +95,14 @@ export class UpstreamHttpError extends Error {
   readonly bodySnippet: string;
   readonly retryable: boolean;
   readonly safeMessage: string;
+  readonly retryAfterMs?: number;
 
   constructor(init: UpstreamErrorInit) {
-    // Engineer-facing message: includes the scrubbed snippet for log
-    // triage. UI code MUST read `.safeMessage` instead of `.message`
-    // to keep this snippet out of user-visible surfaces.
-    const snippet = init.bodySnippet ? ` — ${init.bodySnippet}` : '';
-    super(`${init.safeMessage}${snippet}`);
+    // `.message` carries the safe summary only. The body snippet lives
+    // in a separate field that server-side logs can include explicitly.
+    // This keeps the snippet out of the Convex client-boundary error
+    // shape, which only preserves `error.message`.
+    super(init.safeMessage);
     this.name = 'UpstreamHttpError';
     this.service = init.service;
     this.status = init.status;
@@ -78,28 +110,75 @@ export class UpstreamHttpError extends Error {
     this.bodySnippet = init.bodySnippet;
     this.retryable = init.retryable;
     this.safeMessage = init.safeMessage;
+    if (init.retryAfterMs !== undefined) this.retryAfterMs = init.retryAfterMs;
   }
 
   /**
    * Build an UpstreamHttpError from a non-2xx Response and its already-read
    * body text. Callers should always `await response.text()` first (don't
    * pass the unread Response — single-use body).
+   *
+   * `endpoint` defaults to `response.url` so callers no longer have to
+   * pass the URL twice.
    */
   static fromResponse(
     service: UpstreamService,
     response: Response,
     bodyText: string,
-    endpoint: string,
+    endpoint?: string,
   ): UpstreamHttpError {
+    const ep = endpoint ?? response.url;
+    // `response.headers` is always set on a real `Response`, but unit
+    // tests mock the response shape as a bare object and may omit
+    // headers. Defensive `?.` so the helper still produces a usable
+    // error in those cases (retryAfterMs stays undefined).
+    const retryAfter =
+      typeof response.headers?.get === 'function'
+        ? response.headers.get('retry-after')
+        : null;
     return new UpstreamHttpError({
       service,
       status: response.status,
-      endpoint,
+      endpoint: ep,
       bodySnippet: sanitizeError(bodyText, BODY_SNIPPET_MAX),
       retryable: isRetryableStatus(response.status),
-      safeMessage: safeMessageFor(service, response.status, endpoint),
+      safeMessage: safeMessageFor(service, response.status, ep),
+      retryAfterMs: parseRetryAfterMs(retryAfter),
     });
   }
+
+  /**
+   * Convert to a `ConvexError` that carries the structured fields
+   * across the Convex client boundary. Plain Error subclasses lose
+   * their `.bodySnippet` / `.retryable` / `.status` on the wire —
+   * Convex only marshals `ConvexError.data`. Use this when throwing
+   * from an action that an end-user-facing flow consumes.
+   */
+  toConvexError(): ConvexError<{
+    code: 'upstream_http';
+    service: UpstreamService;
+    status: number;
+    retryable: boolean;
+    safeMessage: string;
+    retryAfterMs?: number;
+  }> {
+    const data: {
+      code: 'upstream_http';
+      service: UpstreamService;
+      status: number;
+      retryable: boolean;
+      safeMessage: string;
+      retryAfterMs?: number;
+    } = {
+      code: 'upstream_http',
+      service: this.service,
+      status: this.status,
+      retryable: this.retryable,
+      safeMessage: this.safeMessage,
+    };
+    if (this.retryAfterMs !== undefined) data.retryAfterMs = this.retryAfterMs;
+    return new ConvexError(data);
+  }
 }
 
 /** Narrow `unknown` to UpstreamHttpError for catch-block branching. */
diff --git a/services/platform/convex/lib/file_io.ts b/services/platform/convex/lib/file_io.ts
index 3161e10e25..c2746b0e35 100644
--- a/services/platform/convex/lib/file_io.ts
+++ b/services/platform/convex/lib/file_io.ts
@@ -22,7 +22,8 @@ import {
 } from 'node:fs/promises';
 import path from 'node:path';
 
-const ORG_SLUG_REGEX = /^[a-z0-9][a-z0-9_-]*$/;
+import { isValidOrgSlug as sharedIsValidOrgSlug } from '../../lib/shared/constants/org-slug';
+
 const TIMESTAMP_REGEX = /^\d{13,}(-[a-f0-9]+)?$/;
 
 export type FileReadResult<T> =
@@ -70,7 +71,53 @@ export async function isSymlink(filePath: string): Promise<boolean> {
 }
 
 export function validateOrgSlug(orgSlug: string): boolean {
-  return orgSlug === 'default' || ORG_SLUG_REGEX.test(orgSlug);
+  return sharedIsValidOrgSlug(orgSlug);
+}
+
+/**
+ * Resolve the on-disk root for all org-scoped config from the
+ * `TALE_CONFIG_DIR` env var. Each domain module used to inline its own
+ * copy of this; centralizing prevents the error-message drift previous
+ * reviews caught.
+ *
+ * Optional `area` suffix is included in the error message when the env
+ * var is missing, so the operator sees which catalog they were trying
+ * to access ("agents", "providers", etc.).
+ */
+export function getConfigRoot(area?: string): string {
+  const configDir = process.env.TALE_CONFIG_DIR;
+  if (configDir) return configDir;
+  const suffix = area ? ` so ${area} can be resolved` : '';
+  throw new Error(
+    `TALE_CONFIG_DIR environment variable is not set. ` +
+      `Set it to the root config directory ` +
+      `(e.g., TALE_CONFIG_DIR=/path/to/tale/examples)${suffix}.`,
+  );
+}
+
+/**
+ * Join `name` onto `dir` and refuse anything that escapes `dir`.
+ *
+ * Catches `..`-style traversal as well as absolute-path injection.
+ * Centralized so every domain module's resolver gets the same guard
+ * with the same error shape — previous review found this block
+ * copy-pasted in 9 places.
+ *
+ * Use this for the leaf-name leg only (after the org-slug has been
+ * validated and joined). Pass a pre-validated `name` whose shape is
+ * already restricted by a per-domain regex; this helper is a
+ * defense-in-depth backstop, not the primary validator.
+ */
+export function safeJoinWithinDir(dir: string, name: string): string {
+  const resolved = path.resolve(dir, name);
+  const expectedPrefix = path.resolve(dir);
+  if (
+    !resolved.startsWith(expectedPrefix + path.sep) &&
+    resolved !== expectedPrefix
+  ) {
+    throw new Error(`Path traversal detected: ${name}`);
+  }
+  return resolved;
 }
 
 export function validateTimestamp(ts: string): boolean {
diff --git a/services/platform/convex/lib/helpers/rag_config.ts b/services/platform/convex/lib/helpers/rag_config.ts
index c4ab9ade96..f2838f8033 100644
--- a/services/platform/convex/lib/helpers/rag_config.ts
+++ b/services/platform/convex/lib/helpers/rag_config.ts
@@ -250,7 +250,17 @@ export async function ragFetch(
   // directly. When omitted, the RAG endpoint either runs org-agnostic
   // (status/delete/content/compare-by-id) or returns 400 from its
   // `Depends(require_org_slug)` dep (search/generate/upload/compare-files).
-  if (init.orgSlug) {
+  //
+  // Distinguish "caller deliberately passed empty/blank slug" (a bug —
+  // fail fast, don't silently strip the header) from "caller omitted
+  // the field entirely" (the org-agnostic endpoint path). Earlier the
+  // truthy check folded both into the same silent-omit branch.
+  if (init.orgSlug !== undefined) {
+    if (!init.orgSlug.trim()) {
+      throw new Error(
+        'ragFetch: orgSlug was provided but is empty; refusing to call RAG without a valid X-Tale-Org header',
+      );
+    }
     headers.set('x-tale-org', init.orgSlug);
   }
 
diff --git a/services/platform/convex/organizations/reseed_all_orgs.ts b/services/platform/convex/organizations/reseed_all_orgs.ts
index 19361126b5..c5b422ed9f 100644
--- a/services/platform/convex/organizations/reseed_all_orgs.ts
+++ b/services/platform/convex/organizations/reseed_all_orgs.ts
@@ -29,18 +29,11 @@
 
 import { v } from 'convex/values';
 
+import { isValidOrgSlug } from '../../lib/shared/constants/org-slug';
 import { getString, isRecord } from '../../lib/utils/type-guards';
 import { components, internal } from '../_generated/api';
 import { internalAction } from '../_generated/server';
 
-// Inlined to avoid importing from convex/lib/file_io.ts (which has 'use node'
-// and would force this orchestration action into the Node runtime). Keep in
-// sync with `validateOrgSlug` at services/platform/convex/lib/file_io.ts.
-const ORG_SLUG_REGEX = /^[a-z0-9][a-z0-9_-]*$/;
-function isValidOrgSlug(slug: string): boolean {
-  return slug === 'default' || ORG_SLUG_REGEX.test(slug);
-}
-
 type OrgReseedResult =
   | { slug: string; status: 'ok' }
   | { slug: string; status: 'error'; error: string };
@@ -65,9 +58,28 @@ export const reseedAllOrgsFromBuiltin = internalAction({
   handler: async (ctx) => {
     const slugSet = new Set<string>(['default']);
 
+    const invalidSlugs: string[] = [];
     let cursor: string | null = null;
+    let prevCursor: string | null | undefined;
     let isDone = false;
+    // Defensive cap: if the betterAuth adapter ever returned
+    // `{ isDone: false }` with a non-advancing cursor, the loop would
+    // spin forever within the Convex action wall-clock. Break out so
+    // the operator gets a real error instead of a 30-minute timeout.
+    const MAX_PAGES = 1000;
+    let pages = 0;
     while (!isDone) {
+      if (pages++ >= MAX_PAGES) {
+        throw new Error(
+          `reseedAllOrgs: betterAuth pagination did not terminate within ${MAX_PAGES} pages`,
+        );
+      }
+      if (prevCursor !== undefined && cursor === prevCursor) {
+        throw new Error(
+          'reseedAllOrgs: betterAuth pagination cursor did not advance; aborting to avoid infinite loop',
+        );
+      }
+      prevCursor = cursor;
       const res: unknown = await ctx.runQuery(
         components.betterAuth.adapter.findMany,
         {
@@ -85,6 +97,10 @@ export const reseedAllOrgsFromBuiltin = internalAction({
           console.warn(
             `[reseedAllOrgs] skipping invalid slug "${slug}" returned by betterAuth`,
           );
+          // Surface invalid slugs through the structured `results`
+          // payload so the CLI summary and CI logs see them — earlier
+          // they were dropped silently and operators had no signal.
+          invalidSlugs.push(slug);
           continue;
         }
         slugSet.add(slug);
@@ -100,6 +116,17 @@ export const reseedAllOrgsFromBuiltin = internalAction({
     const slugs = Array.from(slugSet).sort();
     const results: OrgReseedResult[] = [];
 
+    // Surface any invalid-slug rows seen during pagination as
+    // structured error results. They never get reseeded (the orgs are
+    // unreachable until renamed), but the CLI summary now lists them.
+    for (const bad of invalidSlugs) {
+      results.push({
+        slug: bad,
+        status: 'error',
+        error: 'invalid slug shape; cannot reseed',
+      });
+    }
+
     for (const slug of slugs) {
       try {
         await ctx.runAction(
diff --git a/services/platform/convex/organizations/resolve_org_slug.test.ts b/services/platform/convex/organizations/resolve_org_slug.test.ts
index 201f6c9330..911500feac 100644
--- a/services/platform/convex/organizations/resolve_org_slug.test.ts
+++ b/services/platform/convex/organizations/resolve_org_slug.test.ts
@@ -21,14 +21,14 @@ describe('resolveOrgSlug', () => {
   it('throws when the organization is not found', async () => {
     const ctx = makeCtx(null);
     await expect(resolveOrgSlug(ctx as never, 'org_missing')).rejects.toThrow(
-      /Organization org_missing not found or missing slug/,
+      /no organization row found for id .*org_missing/,
     );
   });
 
   it('throws when the organization row is missing a slug field', async () => {
     const ctx = makeCtx({ _id: 'org_abc' });
     await expect(resolveOrgSlug(ctx as never, 'org_abc')).rejects.toThrow(
-      /Organization org_abc not found or missing slug/,
+      /organization .*org_abc.* has no slug/,
     );
   });
 });
diff --git a/services/platform/convex/organizations/resolve_org_slug.ts b/services/platform/convex/organizations/resolve_org_slug.ts
index fb3b1041da..a7419e934e 100644
--- a/services/platform/convex/organizations/resolve_org_slug.ts
+++ b/services/platform/convex/organizations/resolve_org_slug.ts
@@ -1,22 +1,10 @@
-import { isRecord, getString } from '../../lib/utils/type-guards';
-import { components } from '../_generated/api';
-import type { ActionCtx, MutationCtx, QueryCtx } from '../_generated/server';
-
-type AnyCtx = QueryCtx | MutationCtx | ActionCtx;
-
-export async function resolveOrgSlug(
-  ctx: AnyCtx,
-  organizationId: string,
-): Promise<string> {
-  const org = await ctx.runQuery(components.betterAuth.adapter.findOne, {
-    model: 'organization',
-    where: [{ field: '_id', value: organizationId, operator: 'eq' }],
-  });
-
-  const orgRecord = isRecord(org) ? org : undefined;
-  const slug = orgRecord ? getString(orgRecord, 'slug') : undefined;
-  if (!slug) {
-    throw new Error(`Organization ${organizationId} not found or missing slug`);
-  }
-  return slug;
-}
+/**
+ * @deprecated Re-export of `orgSlugFromId` from `lib/helpers/org_slug`.
+ *
+ * This module used to host its own implementation; that body has been
+ * removed and the function now delegates to the canonical helper so
+ * there is one source of truth. Existing callers continue to import
+ * `resolveOrgSlug` from here; new code should prefer
+ * `import { orgSlugFromId } from '../lib/helpers/org_slug'`.
+ */
+export { orgSlugFromId as resolveOrgSlug } from '../lib/helpers/org_slug';
diff --git a/services/platform/convex/organizations/scaffold.test.ts b/services/platform/convex/organizations/scaffold.test.ts
index 5392eac0a3..3844930578 100644
--- a/services/platform/convex/organizations/scaffold.test.ts
+++ b/services/platform/convex/organizations/scaffold.test.ts
@@ -26,8 +26,12 @@ const { scaffoldNewOrganization, cleanupOrgFilesystem } =
 type ActionConfig = {
   handler: (
     ctx: never,
-    args: { orgSlug: string; override?: boolean },
-  ) => Promise<unknown>;
+    args: { orgSlug: string; override?: boolean; strict?: boolean },
+  ) => Promise<{
+    ok: boolean;
+    skipped: boolean;
+    results: Array<{ domain: string; ok: boolean; error?: string }>;
+  }>;
 };
 const scaffoldHandler = (scaffoldNewOrganization as unknown as ActionConfig)
   .handler;
@@ -387,6 +391,94 @@ describe('scaffoldNewOrganization (org-first)', () => {
       ),
     ).toBe('{"name":"existing"}');
   });
+
+  it('refuses invalid org slugs with skipped:true (no fs writes)', async () => {
+    process.env.TALE_CONFIG_BUILTIN_DIR = catalogRoot;
+    // Populate something the scaffolder would normally seed so we can
+    // be sure the refusal happens BEFORE any writes.
+    await writeText(catSrc('agents', 'a.json'), '{}');
+
+    const result = await scaffoldHandler({} as never, {
+      orgSlug: '../escape',
+    });
+
+    expect(result.ok).toBe(false);
+    expect(result.skipped).toBe(true);
+    expect(result.results).toEqual([]);
+    // Nothing under the (invalid) slug should exist on disk.
+    expect(existsSync(orgDst('../escape'))).toBe(false);
+  });
+
+  it('seedRetention override:true overwrites; override:false skips existing', async () => {
+    process.env.TALE_CONFIG_BUILTIN_DIR = catalogRoot;
+    await writeText(catSrc('retention.json'), '{"defaults":"new"}');
+    // Pre-existing per-org retention.json simulates an operator edit.
+    await writeText(
+      orgDst('acme', 'retention.json'),
+      '{"defaults":"existing"}',
+    );
+
+    // override:false → operator file survives.
+    await scaffoldHandler({} as never, {
+      orgSlug: 'acme',
+      override: false,
+    });
+    expect(await readFile(orgDst('acme', 'retention.json'), 'utf-8')).toBe(
+      '{"defaults":"existing"}',
+    );
+
+    // override:true → catalog file wins.
+    await scaffoldHandler({} as never, {
+      orgSlug: 'acme',
+      override: true,
+    });
+    expect(await readFile(orgDst('acme', 'retention.json'), 'utf-8')).toBe(
+      '{"defaults":"new"}',
+    );
+  });
+
+  it('strict:true throws with aggregated per-domain failure detail', async () => {
+    process.env.TALE_CONFIG_BUILTIN_DIR = catalogRoot;
+    // Make the catalog's agents/ source unreadable by replacing the
+    // expected directory with a regular file — the scaffolder's
+    // per-domain copy will fail and the strict gate aggregates it.
+    await writeText(catSrc('agents'), 'not-a-directory');
+    await writeText(catSrc('workflows', 'general', 'a.json'), '{"ok":true}');
+
+    let threw: Error | null = null;
+    try {
+      await scaffoldHandler({} as never, {
+        orgSlug: 'acme',
+        strict: true,
+      });
+    } catch (err) {
+      threw = err as Error;
+    }
+
+    expect(threw).not.toBeNull();
+    // Aggregated message must name the failing domain so operators
+    // can act on it without trawling logs. Non-strict mode (covered
+    // below) folds the same shape into a result without throwing.
+    expect(threw?.message ?? '').toMatch(/scaffold "acme"/);
+    expect(threw?.message ?? '').toMatch(/agents/);
+  });
+
+  it('non-strict aggregates failures into result without throwing', async () => {
+    process.env.TALE_CONFIG_BUILTIN_DIR = catalogRoot;
+    await writeText(catSrc('agents'), 'not-a-directory');
+    await writeText(catSrc('workflows', 'general', 'a.json'), '{"ok":true}');
+
+    const result = await scaffoldHandler({} as never, {
+      orgSlug: 'acme',
+      // strict defaults to false — caller gets the result object back.
+    });
+
+    expect(result.ok).toBe(false);
+    const failedDomains = result.results
+      .filter((r) => !r.ok)
+      .map((r) => r.domain);
+    expect(failedDomains).toContain('agents');
+  });
 });
 
 describe('cleanupOrgFilesystem (symlink + traversal defense)', () => {
diff --git a/services/platform/convex/providers/file_utils.ts b/services/platform/convex/providers/file_utils.ts
index a55c7fd2e0..3d5a740f70 100644
--- a/services/platform/convex/providers/file_utils.ts
+++ b/services/platform/convex/providers/file_utils.ts
@@ -10,7 +10,13 @@ import {
   providerJsonSchema,
   providerSecretsSchema,
 } from '../../lib/shared/schemas/providers';
-import { serializeJson, sha256, validateOrgSlug } from '../lib/file_io';
+import {
+  getConfigRoot,
+  safeJoinWithinDir,
+  serializeJson,
+  sha256,
+  validateOrgSlug,
+} from '../lib/file_io';
 import { validateProviderName } from './validators';
 
 export { sha256, validateProviderName };
@@ -92,16 +98,10 @@ export function parseProviderSecrets(
   return result.data;
 }
 
-function getConfigRoot(): string {
-  const configDir = process.env.TALE_CONFIG_DIR;
-  if (configDir) return configDir;
-  throw new Error('TALE_CONFIG_DIR environment variable is not set.');
-}
-
 export function resolveProvidersDir(orgSlug: string): string {
   if (!validateOrgSlug(orgSlug))
     throw new Error(`Invalid org slug: ${orgSlug}`);
-  return path.join(getConfigRoot(), orgSlug, 'providers');
+  return path.join(getConfigRoot('providers'), orgSlug, 'providers');
 }
 
 export function resolveProviderFilePath(
@@ -110,16 +110,10 @@ export function resolveProviderFilePath(
 ): string {
   if (!validateProviderName(providerName))
     throw new Error(`Invalid provider name: ${providerName}`);
-  const dir = resolveProvidersDir(orgSlug);
-  const resolved = path.resolve(dir, `${providerName}.json`);
-  const expectedPrefix = path.resolve(dir);
-  if (
-    !resolved.startsWith(expectedPrefix + path.sep) &&
-    resolved !== expectedPrefix
-  ) {
-    throw new Error(`Path traversal detected: ${providerName}`);
-  }
-  return resolved;
+  return safeJoinWithinDir(
+    resolveProvidersDir(orgSlug),
+    `${providerName}.json`,
+  );
 }
 
 export function resolveProviderSecretsPath(
@@ -128,16 +122,10 @@ export function resolveProviderSecretsPath(
 ): string {
   if (!validateProviderName(providerName))
     throw new Error(`Invalid provider name: ${providerName}`);
-  const dir = resolveProvidersDir(orgSlug);
-  const resolved = path.resolve(dir, `${providerName}.secrets.json`);
-  const expectedPrefix = path.resolve(dir);
-  if (
-    !resolved.startsWith(expectedPrefix + path.sep) &&
-    resolved !== expectedPrefix
-  ) {
-    throw new Error(`Path traversal detected: ${providerName}`);
-  }
-  return resolved;
+  return safeJoinWithinDir(
+    resolveProvidersDir(orgSlug),
+    `${providerName}.secrets.json`,
+  );
 }
 
 export { MAX_FILE_SIZE_BYTES };
diff --git a/services/platform/convex/skills/file_utils.ts b/services/platform/convex/skills/file_utils.ts
index 3af44e3b14..65e6bf803e 100644
--- a/services/platform/convex/skills/file_utils.ts
+++ b/services/platform/convex/skills/file_utils.ts
@@ -30,7 +30,13 @@ import {
   SKILL_NAME_REGEX,
   type SkillFrontmatter,
 } from '../../lib/shared/schemas/skills';
-import { sha256, validateOrgSlug, verifyPathWithinBase } from '../lib/file_io';
+import {
+  getConfigRoot,
+  safeJoinWithinDir,
+  sha256,
+  validateOrgSlug,
+  verifyPathWithinBase,
+} from '../lib/file_io';
 
 /**
  * Names reserved by the SKILL.md frontmatter schema. Duplicated here so
@@ -94,16 +100,6 @@ export function validateSkillSlug(slug: string): boolean {
   return true;
 }
 
-function getConfigRoot(): string {
-  const configDir = process.env.TALE_CONFIG_DIR;
-  if (configDir) return configDir;
-  throw new Error(
-    'TALE_CONFIG_DIR environment variable is not set. ' +
-      'Set it to the root config directory ' +
-      '(e.g., TALE_CONFIG_DIR=/path/to/tale/examples).',
-  );
-}
-
 /**
  * Resolve the skills directory for an organization. Org-first:
  * `${TALE_CONFIG_DIR}/<orgSlug>/skills/`.
@@ -112,23 +108,14 @@ export function resolveSkillsDir(orgSlug: string): string {
   if (!validateOrgSlug(orgSlug)) {
     throw new Error(`Invalid org slug: ${orgSlug}`);
   }
-  return path.join(getConfigRoot(), orgSlug, 'skills');
+  return path.join(getConfigRoot('skills'), orgSlug, 'skills');
 }
 
 export function resolveSkillDir(orgSlug: string, slug: string): string {
   if (!validateSkillSlug(slug)) {
     throw new Error(`Invalid skill slug: ${slug}`);
   }
-  const dir = resolveSkillsDir(orgSlug);
-  const resolved = path.resolve(dir, slug);
-  const expectedPrefix = path.resolve(dir);
-  if (
-    !resolved.startsWith(expectedPrefix + path.sep) &&
-    resolved !== expectedPrefix
-  ) {
-    throw new Error(`Path traversal detected: ${slug}`);
-  }
-  return resolved;
+  return safeJoinWithinDir(resolveSkillsDir(orgSlug), slug);
 }
 
 export function resolveSkillMdPath(orgSlug: string, slug: string): string {
@@ -149,17 +136,11 @@ export function resolveSkillAssetPath(
 ): string {
   validateAssetRelPath(relPath);
   const skillDir = resolveSkillDir(orgSlug, slug);
-  const resolved = path.resolve(skillDir, relPath);
-  const expectedPrefix = path.resolve(skillDir);
-  if (
-    !resolved.startsWith(expectedPrefix + path.sep) &&
-    resolved !== expectedPrefix
-  ) {
-    throw new Error(`Path traversal detected: ${relPath}`);
-  }
+  const resolved = safeJoinWithinDir(skillDir, relPath);
   // Case-fold the SKILL.md lockout — on case-insensitive filesystems (macOS
   // default, Windows) `skill.md` resolves to the same inode as `SKILL.md`
   // but a literal `===` compare would miss it.
+  const expectedPrefix = path.resolve(skillDir);
   const finalSegment = path.basename(resolved);
   if (
     path.dirname(resolved) === expectedPrefix &&
diff --git a/services/platform/convex/workflow_engine/action_defs/crawler/crawler_action.ts b/services/platform/convex/workflow_engine/action_defs/crawler/crawler_action.ts
index 78d6bc7db3..63cb99375e 100644
--- a/services/platform/convex/workflow_engine/action_defs/crawler/crawler_action.ts
+++ b/services/platform/convex/workflow_engine/action_defs/crawler/crawler_action.ts
@@ -1,6 +1,7 @@
 import { v } from 'convex/values';
 
 import { createDebugLog } from '../../../lib/debug_log';
+import { UpstreamHttpError } from '../../../lib/errors/upstream_http_error';
 import { orgSlugFromId } from '../../../lib/helpers/org_slug';
 import type { ActionDefinition } from '../../helpers/nodes/action/types';
 import type {
@@ -141,8 +142,13 @@ async function discoverUrls(
   clearTimeout(timeoutId);
 
   if (!response.ok) {
-    const errorText = await response.text();
-    throw new Error(`Crawler service error (${response.status}): ${errorText}`);
+    const errorText = await response.text().catch(() => '');
+    throw UpstreamHttpError.fromResponse(
+      'crawler',
+      response,
+      errorText,
+      '/api/v1/urls/discover',
+    );
   }
 
   const result: DiscoverUrlsRawData = await response.json();
@@ -199,8 +205,13 @@ async function fetchUrls(
   clearTimeout(timeoutId);
 
   if (!response.ok) {
-    const errorText = await response.text();
-    throw new Error(`Crawler service error (${response.status}): ${errorText}`);
+    const errorText = await response.text().catch(() => '');
+    throw UpstreamHttpError.fromResponse(
+      'crawler',
+      response,
+      errorText,
+      '/api/v1/urls/fetch',
+    );
   }
 
   const result: FetchUrlsData = await response.json();
@@ -250,8 +261,13 @@ async function queryUrls(
   clearTimeout(timeoutId);
 
   if (!response.ok) {
-    const errorText = await response.text();
-    throw new Error(`Crawler service error (${response.status}): ${errorText}`);
+    const errorText = await response.text().catch(() => '');
+    throw UpstreamHttpError.fromResponse(
+      'crawler',
+      response,
+      errorText,
+      `/api/v1/websites/${params.domain}/urls`,
+    );
   }
 
   const result: QueryUrlsRawData = await response.json();
diff --git a/services/platform/convex/workflow_engine/action_defs/document/document_action.ts b/services/platform/convex/workflow_engine/action_defs/document/document_action.ts
index 42c567c370..5bdcf38c3b 100644
--- a/services/platform/convex/workflow_engine/action_defs/document/document_action.ts
+++ b/services/platform/convex/workflow_engine/action_defs/document/document_action.ts
@@ -582,6 +582,8 @@ export const documentAction: ActionDefinition<DocumentActionParams> = {
       }
 
       case 'extract_docx_structured': {
+        // Cross-tenant gate: caller-supplied fileId could reference any
+        // org's storage; verify ownership before reading.
         const organizationId =
           typeof _variables.organizationId === 'string'
             ? _variables.organizationId
@@ -591,10 +593,20 @@ export const documentAction: ActionDefinition<DocumentActionParams> = {
             'extract_docx_structured requires organizationId in workflow _variables.',
           );
         }
+        const ownsStorage = await ctx.runQuery(
+          internal.documents.internal_queries.verifyStorageIdsBelongToOrg,
+          { organizationId, storageIds: [params.fileId] },
+        );
+        if (!ownsStorage) {
+          throw new Error('fileId does not belong to this organization');
+        }
         return await extractDocxStructured(ctx, params.fileId, organizationId);
       }
 
       case 'apply_docx_structured': {
+        // Cross-tenant gate: templateFileId could reference any org's
+        // storage; verify ownership before reading + writing derived
+        // output back into the caller's library.
         const organizationId =
           typeof _variables.organizationId === 'string'
             ? _variables.organizationId
@@ -604,6 +616,15 @@ export const documentAction: ActionDefinition<DocumentActionParams> = {
             'apply_docx_structured requires organizationId in workflow _variables.',
           );
         }
+        const ownsTemplate = await ctx.runQuery(
+          internal.documents.internal_queries.verifyStorageIdsBelongToOrg,
+          { organizationId, storageIds: [params.templateFileId] },
+        );
+        if (!ownsTemplate) {
+          throw new Error(
+            'templateFileId does not belong to this organization',
+          );
+        }
 
         return await applyDocxStructured(ctx, {
           templateFileId: params.templateFileId,
diff --git a/services/platform/convex/workflow_engine/action_defs/document/helpers/apply_docx_structured.ts b/services/platform/convex/workflow_engine/action_defs/document/helpers/apply_docx_structured.ts
index 3ea45ce39f..4142fa69dd 100644
--- a/services/platform/convex/workflow_engine/action_defs/document/helpers/apply_docx_structured.ts
+++ b/services/platform/convex/workflow_engine/action_defs/document/helpers/apply_docx_structured.ts
@@ -19,6 +19,7 @@ import {
   getCrawlerUrl,
 } from '../../../../documents/generate_document_helpers';
 import { createDebugLog } from '../../../../lib/debug_log';
+import { UpstreamHttpError } from '../../../../lib/errors/upstream_http_error';
 import { orgSlugFromId } from '../../../../lib/helpers/org_slug';
 import { toId } from '../../../../lib/type_cast_helpers';
 
@@ -130,8 +131,11 @@ export async function applyDocxStructured(
 
   if (!response.ok) {
     const errorText = await response.text().catch(() => '');
-    throw new Error(
-      `Crawler apply-structured failed: ${response.status} ${errorText}`,
+    throw UpstreamHttpError.fromResponse(
+      'crawler',
+      response,
+      errorText,
+      '/api/v1/apply-structured',
     );
   }
 
diff --git a/services/platform/convex/workflow_engine/action_defs/document/helpers/extract_docx_structured.ts b/services/platform/convex/workflow_engine/action_defs/document/helpers/extract_docx_structured.ts
index 4a7d1fc5b1..63a13dd5b5 100644
--- a/services/platform/convex/workflow_engine/action_defs/document/helpers/extract_docx_structured.ts
+++ b/services/platform/convex/workflow_engine/action_defs/document/helpers/extract_docx_structured.ts
@@ -11,6 +11,7 @@ import { fetchJson } from '../../../../../lib/utils/type-cast-helpers';
 import type { ActionCtx } from '../../../../_generated/server';
 import { getCrawlerUrl } from '../../../../documents/generate_document_helpers';
 import { createDebugLog } from '../../../../lib/debug_log';
+import { UpstreamHttpError } from '../../../../lib/errors/upstream_http_error';
 import { orgSlugFromId } from '../../../../lib/helpers/org_slug';
 import { toId } from '../../../../lib/type_cast_helpers';
 
@@ -71,8 +72,11 @@ export async function extractDocxStructured(
 
   if (!response.ok) {
     const errorText = await response.text().catch(() => '');
-    throw new Error(
-      `Crawler extract-structured failed: ${response.status} ${errorText}`,
+    throw UpstreamHttpError.fromResponse(
+      'crawler',
+      response,
+      errorText,
+      '/api/v1/docx/extract-structured',
     );
   }
 
diff --git a/services/platform/convex/workflow_engine/action_defs/rag/helpers/delete_document.test.ts b/services/platform/convex/workflow_engine/action_defs/rag/helpers/delete_document.test.ts
index 6fa937f035..e588411693 100644
--- a/services/platform/convex/workflow_engine/action_defs/rag/helpers/delete_document.test.ts
+++ b/services/platform/convex/workflow_engine/action_defs/rag/helpers/delete_document.test.ts
@@ -79,15 +79,27 @@ describe('deleteDocumentById', () => {
     expect(result.message).toBe('Deleted 2 docs');
   });
 
-  it('returns error result on HTTP failure', async () => {
-    mockFetch({ detail: 'service error' }, 500);
+  it('returns structured failure on non-retryable HTTP error', async () => {
+    // 400 is non-retryable per `isRetryableStatus`; the helper folds
+    // it into `{ success: false }` rather than re-throwing.
+    mockFetch({ detail: 'bad request' }, 400);
 
     const result = await deleteDocumentById({
       fileId: 'doc-fail',
     });
 
     expect(result.success).toBe(false);
-    expect(result.error).toContain('500');
+    expect(result.error).toMatch(/HTTP 400|400/);
+  });
+
+  it('re-throws retryable upstream failures so callers can retry', async () => {
+    // 5xx is retryable; folding it into `{ success: false }` would
+    // mask transient RAG outages as permanent retention failures.
+    mockFetch({ detail: 'service error' }, 500);
+
+    await expect(
+      deleteDocumentById({ fileId: 'doc-fail-5xx' }),
+    ).rejects.toThrow(/HTTP 500|unavailable/);
   });
 
   // Round-2 review HIGH (E.4.2): retention re-runs and cascade RAG
diff --git a/services/platform/convex/workflow_engine/action_defs/rag/helpers/delete_document.ts b/services/platform/convex/workflow_engine/action_defs/rag/helpers/delete_document.ts
index 29d033c86b..b0b00fc6b9 100644
--- a/services/platform/convex/workflow_engine/action_defs/rag/helpers/delete_document.ts
+++ b/services/platform/convex/workflow_engine/action_defs/rag/helpers/delete_document.ts
@@ -8,7 +8,10 @@ import {
   isRecord,
 } from '../../../../../lib/utils/type-guards';
 import { internalAction } from '../../../../_generated/server';
-import { UpstreamHttpError } from '../../../../lib/errors/upstream_http_error';
+import {
+  isUpstreamHttpError,
+  UpstreamHttpError,
+} from '../../../../lib/errors/upstream_http_error';
 import { ragFetch } from '../../../../lib/helpers/rag_config';
 import type { RagDeleteResult } from './types';
 
@@ -81,6 +84,17 @@ export async function deleteDocumentById({
       timestamp: Date.now(),
     };
   } catch (error) {
+    // Retryable upstream failures (5xx / 429 / 408) must propagate so
+    // the action-level retry harness (workflow scheduler or callsite
+    // re-try loop) gets a chance to recover. Folding them into
+    // `{ success: false }` silently — as the earlier code did —
+    // converted transient unavailability into permanent failures.
+    // Non-retryable failures (4xx) and non-UpstreamHttpError throws
+    // still return the structured result so batch callers don't abort
+    // the whole loop on a single bad fileId.
+    if (isUpstreamHttpError(error) && error.retryable) {
+      throw error;
+    }
     const errorMessage = error instanceof Error ? error.message : String(error);
     return {
       success: false,
@@ -110,11 +124,22 @@ export const deleteFromRagBatch = internalAction({
   returns: v.null(),
   handler: async (_ctx, args) => {
     for (const fileId of args.fileIds) {
-      const result = await deleteDocumentById({ fileId });
-      if (!result.success) {
+      // Best-effort per file. `deleteDocumentById` re-throws retryable
+      // UpstreamHttpError, but in a batch context one transient 5xx
+      // should not abort cleanup of the other ids — log + move on so
+      // the next retention sweep gets to retry.
+      try {
+        const result = await deleteDocumentById({ fileId });
+        if (!result.success) {
+          console.warn(
+            `[deleteFromRagBatch] delete failed for ${fileId}:`,
+            result.error ?? result.message,
+          );
+        }
+      } catch (err) {
         console.warn(
-          `[deleteFromRagBatch] delete failed for ${fileId}:`,
-          result.error ?? result.message,
+          `[deleteFromRagBatch] retryable upstream error on ${fileId}; skipping:`,
+          err instanceof Error ? err.message : String(err),
         );
       }
     }
diff --git a/services/platform/convex/workflow_engine/action_defs/rag/helpers/upload_file_direct.test.ts b/services/platform/convex/workflow_engine/action_defs/rag/helpers/upload_file_direct.test.ts
index 25473ca191..6b006d0240 100644
--- a/services/platform/convex/workflow_engine/action_defs/rag/helpers/upload_file_direct.test.ts
+++ b/services/platform/convex/workflow_engine/action_defs/rag/helpers/upload_file_direct.test.ts
@@ -150,9 +150,13 @@ describe('uploadFile', () => {
     );
     expect(err).toBeInstanceOf(Error);
     expect((err as Error).name).toBe('UpstreamHttpError');
-    // Engineer-facing .message embeds the safe-summary + sanitized body.
+    // `.message` carries the safe summary only (the sanitized body
+    // lives on `.bodySnippet` so it does not cross the Convex client
+    // boundary as a default error toast).
     expect((err as Error).message).toMatch(/HTTP 500/);
-    expect((err as Error).message).toMatch(/something broke/);
+    expect((err as { bodySnippet?: string }).bodySnippet).toMatch(
+      /something broke/,
+    );
     // Retryable for 5xx — caller can decide whether to bounce.
     expect((err as { retryable?: boolean }).retryable).toBe(true);
   });
diff --git a/services/platform/convex/workflow_engine/action_defs/rag/rag_action.ts b/services/platform/convex/workflow_engine/action_defs/rag/rag_action.ts
index d9a036d942..cdf0621d81 100644
--- a/services/platform/convex/workflow_engine/action_defs/rag/rag_action.ts
+++ b/services/platform/convex/workflow_engine/action_defs/rag/rag_action.ts
@@ -64,6 +64,9 @@ export const ragAction: ActionDefinition<RagActionParams> = {
         return { ...result, executionTimeMs: Date.now() - startTime };
       }
       case 'delete_document': {
+        // Cross-tenant gate: file_id is global in RAG; verify the workflow's
+        // org owns the storage row before forwarding the delete.
+        await assertStorageIdsInOrg(ctx, _variables, [migratedParams.fileId]);
         const result = await deleteDocumentById({
           fileId: migratedParams.fileId,
         });
diff --git a/services/platform/convex/workflows/file_utils.ts b/services/platform/convex/workflows/file_utils.ts
index 9bfcf3debd..62d7d687b1 100644
--- a/services/platform/convex/workflows/file_utils.ts
+++ b/services/platform/convex/workflows/file_utils.ts
@@ -13,7 +13,13 @@ import {
   workflowJsonSchema,
   type WorkflowJsonConfig,
 } from '../../lib/shared/schemas/workflows';
-import { serializeJson, sha256, validateOrgSlug } from '../lib/file_io';
+import {
+  getConfigRoot,
+  safeJoinWithinDir,
+  serializeJson,
+  sha256,
+  validateOrgSlug,
+} from '../lib/file_io';
 
 export { sha256 };
 
@@ -75,16 +81,6 @@ export function urlParamToSlug(param: string): string {
   return param.replace(new RegExp(SLUG_SEPARATOR, 'g'), '/');
 }
 
-function getConfigRoot(): string {
-  const configDir = process.env.TALE_CONFIG_DIR;
-  if (configDir) return configDir;
-  throw new Error(
-    'TALE_CONFIG_DIR environment variable is not set. ' +
-      'Set it to the root config directory ' +
-      '(e.g., TALE_CONFIG_DIR=/path/to/tale/examples).',
-  );
-}
-
 /**
  * Resolve the workflows directory for an organization. Org-first:
  * `${TALE_CONFIG_DIR}/<orgSlug>/workflows/`. No `@`-prefix collision concern
@@ -94,7 +90,7 @@ export function resolveWorkflowsDir(orgSlug: string): string {
   if (!validateOrgSlug(orgSlug)) {
     throw new Error(`Invalid org slug: ${orgSlug}`);
   }
-  return path.join(getConfigRoot(), orgSlug, 'workflows');
+  return path.join(getConfigRoot('workflows'), orgSlug, 'workflows');
 }
 
 /**
@@ -108,16 +104,10 @@ export function resolveWorkflowFilePath(
   if (!validateWorkflowSlug(workflowSlug)) {
     throw new Error(`Invalid workflow slug: ${workflowSlug}`);
   }
-  const dir = resolveWorkflowsDir(orgSlug);
-  const resolved = path.resolve(dir, `${workflowSlug}.json`);
-  const expectedPrefix = path.resolve(dir);
-  if (
-    !resolved.startsWith(expectedPrefix + path.sep) &&
-    resolved !== expectedPrefix
-  ) {
-    throw new Error(`Path traversal detected: ${workflowSlug}`);
-  }
-  return resolved;
+  return safeJoinWithinDir(
+    resolveWorkflowsDir(orgSlug),
+    `${workflowSlug}.json`,
+  );
 }
 
 /**
diff --git a/services/platform/lib/config-watcher.ts b/services/platform/lib/config-watcher.ts
index b9ac31862e..337cfea0aa 100644
--- a/services/platform/lib/config-watcher.ts
+++ b/services/platform/lib/config-watcher.ts
@@ -2,6 +2,8 @@ import { relative } from 'node:path';
 
 import chokidar from 'chokidar';
 
+import { ORG_SLUG_REGEX } from './shared/constants/org-slug';
+
 interface ConfigChangeEvent {
   type:
     | 'agents'
@@ -15,8 +17,6 @@ interface ConfigChangeEvent {
 }
 
 const ATOMIC_WRITE_TMP_RE = /\.\d+\.[a-f0-9]{8}\.tmp$/;
-// Must match validateOrgSlug at services/platform/convex/lib/file_io.ts.
-const ORG_SLUG_REGEX = /^[a-z0-9][a-z0-9_-]*$/;
 
 /**
  * Tail-debounce window for SSE invalidations: events arriving within this
diff --git a/services/platform/lib/shared/constants/org-slug.ts b/services/platform/lib/shared/constants/org-slug.ts
new file mode 100644
index 0000000000..3f94ce140a
--- /dev/null
+++ b/services/platform/lib/shared/constants/org-slug.ts
@@ -0,0 +1,32 @@
+/**
+ * Canonical org slug validator.
+ *
+ * Single source of truth: importable from Convex Node actions
+ * (`convex/lib/file_io.ts`), Convex regular query/mutation/action
+ * modules (`convex/organizations/reseed_all_orgs.ts`), the platform
+ * file-watcher (`lib/config-watcher.ts`), and the React side. Kept in
+ * `lib/shared/constants/` so it stays Node-runtime-neutral (no
+ * `'use node'`).
+ *
+ * Rules:
+ *   - Must start with a lowercase letter or digit
+ *   - Body may include lowercase letters, digits, `_`, `-`
+ *   - `'default'` is allowed as the reserved platform-seed org slug
+ *     even though every other check would still pass it; the explicit
+ *     short-circuit documents the invariant.
+ */
+export const ORG_SLUG_REGEX = /^[a-z0-9][a-z0-9_-]*$/;
+
+/** Soft check — does NOT throw. Returns true for valid slugs. */
+export function isValidOrgSlug(slug: string): boolean {
+  return slug === 'default' || ORG_SLUG_REGEX.test(slug);
+}
+
+/** Hard check — throws `Error` with a uniform message on invalid input. */
+export function assertValidOrgSlug(slug: string): void {
+  if (!isValidOrgSlug(slug)) {
+    throw new Error(
+      `Invalid org slug "${slug}". Must match ${ORG_SLUG_REGEX.source}.`,
+    );
+  }
+}
diff --git a/services/platform/messages/de.json b/services/platform/messages/de.json
index 9b213c5676..b3a14062a3 100644
--- a/services/platform/messages/de.json
+++ b/services/platform/messages/de.json
@@ -4090,6 +4090,9 @@
       "createOrganization": "Organisation erstellen",
       "organizationName": "Organisationsname",
       "companyNameRequired": "Firmenname ist erforderlich",
+      "companyNameCharacterError": "Verwende nur Buchstaben, Ziffern, Leerzeichen, Bindestriche und Unterstriche und beginne mit einem Buchstaben oder einer Ziffer.",
+      "nameReserved": "Dieser Name ist von der Plattform reserviert.",
+      "identifierPreview": "Kennung: {slug}",
       "enterCompanyName": "Gib deinen Firmennamen ein",
       "organizationCreated": "Organisation erstellt!",
       "creating": "Erstelle...",
diff --git a/services/platform/messages/en.json b/services/platform/messages/en.json
index 24410f882d..3f1b920bbe 100644
--- a/services/platform/messages/en.json
+++ b/services/platform/messages/en.json
@@ -4347,6 +4347,9 @@
       "createOrganization": "Create organization",
       "organizationName": "Organization name",
       "companyNameRequired": "Company name is required",
+      "companyNameCharacterError": "Use letters, digits, spaces, hyphens, and underscores only, starting with a letter or digit.",
+      "nameReserved": "This name is reserved by the platform.",
+      "identifierPreview": "Identifier: {slug}",
       "enterCompanyName": "Enter your company name",
       "organizationCreated": "Organization created successfully!",
       "creating": "Creating...",
diff --git a/services/platform/messages/fr.json b/services/platform/messages/fr.json
index 34ea37f585..a6ea6db977 100644
--- a/services/platform/messages/fr.json
+++ b/services/platform/messages/fr.json
@@ -4091,6 +4091,9 @@
       "createOrganization": "Créer une organisation",
       "organizationName": "Nom de l'organisation",
       "companyNameRequired": "Le nom de l'entreprise est requis",
+      "companyNameCharacterError": "Utilise uniquement des lettres, chiffres, espaces, tirets et tirets bas, en commençant par une lettre ou un chiffre.",
+      "nameReserved": "Ce nom est réservé par la plateforme.",
+      "identifierPreview": "Identifiant : {slug}",
       "enterCompanyName": "Entre le nom de ton entreprise",
       "organizationCreated": "Organisation créée avec succès !",
       "creating": "Création...",
diff --git a/services/platform/server.ts b/services/platform/server.ts
index a02df47df1..56bda4d68e 100644
--- a/services/platform/server.ts
+++ b/services/platform/server.ts
@@ -1,5 +1,5 @@
 import { existsSync } from 'node:fs';
-import { dirname, join, resolve } from 'node:path';
+import { dirname, join, resolve, sep } from 'node:path';
 import { fileURLToPath } from 'node:url';
 
 import { createPrecompiledServer, type ArtifactsServer } from '@tale/ui/seo';
@@ -364,6 +364,20 @@ export function createApp(env: EnvConfig = getEnvConfig()): Hono {
     convexMetricsResponse(c.req.query('format') ?? null),
   );
 
+  // Branding images. Defense-in-depth: filename is already locked
+  // down (no `/`, no `..`), but the prefix check uses `path.sep` so a
+  // future sibling dir like `imagesXYZ/` cannot prefix-match via
+  // string compare. We also pin Content-Type from an allowlist
+  // instead of letting Bun.file infer it from the extension so a
+  // mis-renamed file cannot be served with a script-y content type.
+  const BRANDING_MIME: Record<string, string> = {
+    png: 'image/png',
+    svg: 'image/svg+xml',
+    jpg: 'image/jpeg',
+    jpeg: 'image/jpeg',
+    webp: 'image/webp',
+    ico: 'image/x-icon',
+  };
   app.get('/branding/images/:filename', async (c) => {
     if (!brandingImagesDir) return c.notFound();
     const filename = c.req.param('filename');
@@ -371,11 +385,22 @@ export function createApp(env: EnvConfig = getEnvConfig()): Hono {
       return c.notFound();
     }
     const filePath = resolve(brandingImagesDir, filename);
-    if (!filePath.startsWith(brandingImagesDir)) return c.notFound();
+    if (
+      !filePath.startsWith(brandingImagesDir + sep) &&
+      filePath !== brandingImagesDir
+    ) {
+      return c.notFound();
+    }
     const file = Bun.file(filePath);
     if (!(await file.exists())) return c.notFound();
+    const ext = filename.split('.').pop()?.toLowerCase() ?? '';
+    const contentType = BRANDING_MIME[ext];
+    if (!contentType) return c.notFound();
     return new Response(file, {
-      headers: { 'Cache-Control': 'no-cache, must-revalidate' },
+      headers: {
+        'Cache-Control': 'no-cache, must-revalidate',
+        'Content-Type': contentType,
+      },
     });
   });
 
diff --git a/services/rag/app/routers/search.py b/services/rag/app/routers/search.py
index 4a99031154..08a5a55043 100644
--- a/services/rag/app/routers/search.py
+++ b/services/rag/app/routers/search.py
@@ -28,7 +28,7 @@ async def search(
     try:
         start_time = time.time()
 
-        results = await rag_service.search(
+        results, search_usage = await rag_service.search(
             org_slug,
             query=request.query,
             top_k=request.top_k,
@@ -51,8 +51,9 @@ async def search(
             for r in results
         ]
 
+        # `search_usage` is the per-call value returned alongside results
+        # — no shared singleton, no cross-request mis-attribution.
         usage = None
-        search_usage = getattr(rag_service, "last_search_usage", None)
         if search_usage:
             usage = UsageInfo(
                 input_tokens=search_usage.prompt_tokens,
diff --git a/services/rag/app/services/rag_service.py b/services/rag/app/services/rag_service.py
index a804e468f8..57cb60e9e2 100644
--- a/services/rag/app/services/rag_service.py
+++ b/services/rag/app/services/rag_service.py
@@ -60,6 +60,12 @@
 
 _CONFIG_CHECK_INTERVAL = 15  # seconds
 
+# Bound the per-org-lock dict so a misbehaving caller cannot grow the
+# table without limit by spraying random slugs. Real deployments have
+# tens, not thousands, of orgs; 256 is comfortably above any realistic
+# concurrent-init fan-out while still capping memory.
+_ORG_LOCKS_MAX = 256
+
 
 _background_tasks: set[asyncio.Task[None]] = set()
 
@@ -97,14 +103,15 @@ def __init__(self) -> None:
         self._init_lock = asyncio.Lock()
         self._pool: asyncpg.Pool | None = None
         # Embedding dimensions are pinned globally; see module docstring.
+        # `_pin_dim_lock` serializes the first-write race between two orgs
+        # initializing concurrently (which previously each held their own
+        # per-org lock and both raced past `if _pinned_dims is None`).
         self._pinned_dims: int | None = None
+        self._pin_dim_lock = asyncio.Lock()
         # Per-org client cache and per-org locks (so concurrent first-calls
         # for the same org don't both build clients).
         self._org_clients: dict[str, _OrgClients] = {}
         self._org_locks: dict[str, asyncio.Lock] = {}
-        # Per-search-call usage propagation — set by search(), read by
-        # generate(). Single-threaded asyncio so no need for per-org isolation.
-        self.last_search_usage: Any = None
 
     async def initialize(self) -> None:
         """Initialize the shared database pool.
@@ -134,6 +141,17 @@ def embedding_service(self) -> EmbeddingService | None:
     def _get_org_lock(self, org_slug: str) -> asyncio.Lock:
         lock = self._org_locks.get(org_slug)
         if lock is None:
+            # Bounded LRU eviction: never grow past `_ORG_LOCKS_MAX`. Evict
+            # the oldest entry (Python dicts preserve insertion order); the
+            # evicted lock is safe to drop because either no caller holds
+            # it (it was idle), or the caller still has a reference and
+            # will continue using it — we just lose the "shared lock"
+            # property for that org until the next call recreates it.
+            if len(self._org_locks) >= _ORG_LOCKS_MAX:
+                # `next(iter(...))` returns the oldest key without
+                # building a list.
+                oldest = next(iter(self._org_locks))
+                self._org_locks.pop(oldest, None)
             lock = asyncio.Lock()
             self._org_locks[org_slug] = lock
         return lock
@@ -191,21 +209,27 @@ async def _build_or_refresh_org_clients(
 
         _b, _a, _m, dims = settings.get_embedding_config(org_slug)
 
-        if self._pinned_dims is None:
-            self._pinned_dims = dims
-            await pin_embedding_dimensions(self._pool, dims)
-            logger.info(
-                "Pinned RAG embedding dimensions to {} (set by org '{}')",
-                dims,
-                org_slug,
-            )
-        elif dims != self._pinned_dims:
-            raise ValueError(
-                f"Org '{org_slug}' embedding dimensions ({dims}) do not match the "
-                f"pinned RAG schema dimensions ({self._pinned_dims}). All orgs "
-                f"sharing this RAG instance must use the same embedding model "
-                f"dimensions. Reconcile provider configs or run RAG per-org."
-            )
+        # Serialize the first-write so two concurrent org inits don't
+        # race past `_pinned_dims is None` with different dims and both
+        # call `pin_embedding_dimensions`. Subsequent calls take the
+        # lock too but find `_pinned_dims` already set and fall through
+        # to the mismatch check.
+        async with self._pin_dim_lock:
+            if self._pinned_dims is None:
+                self._pinned_dims = dims
+                await pin_embedding_dimensions(self._pool, dims)
+                logger.info(
+                    "Pinned RAG embedding dimensions to {} (set by org '{}')",
+                    dims,
+                    org_slug,
+                )
+            elif dims != self._pinned_dims:
+                raise ValueError(
+                    f"Org '{org_slug}' embedding dimensions ({dims}) do not match the "
+                    f"pinned RAG schema dimensions ({self._pinned_dims}). All orgs "
+                    f"sharing this RAG instance must use the same embedding model "
+                    f"dimensions. Reconcile provider configs or run RAG per-org."
+                )
 
         embedding_service = EmbeddingService(
             api_key=llm_config["embedding_api_key"],
@@ -322,10 +346,14 @@ async def search(
         top_k: int | None = None,
         similarity_threshold: float | None = None,
         file_ids: list[str] | None = None,
-    ) -> list[dict[str, Any]]:
+    ) -> tuple[list[dict[str, Any]], Any]:
         """Search the knowledge base using hybrid BM25 + vector search.
 
-        Embedding token usage available via `self.last_search_usage` after call.
+        Returns a `(results, embedding_usage)` tuple so the per-call
+        embedding usage is propagated alongside the results — earlier
+        this hung on a mutable `self.last_search_usage` attribute that
+        concurrent calls overwrote, mis-attributing tokens across
+        callers under any real QPS.
         """
         clients = await self._ensure_org_clients(org_slug)
 
@@ -338,8 +366,7 @@ async def search(
             top_k=effective_top_k,
             similarity_threshold=threshold,
         )
-
-        self.last_search_usage = getattr(clients.search_service, "last_search_usage", None)
+        usage = getattr(clients.search_service, "last_search_usage", None)
 
         # If no results and some files are still indexing, wait and retry once
         if not results and file_ids:
@@ -354,9 +381,9 @@ async def search(
                     top_k=effective_top_k,
                     similarity_threshold=threshold,
                 )
-                self.last_search_usage = getattr(clients.search_service, "last_search_usage", None)
+                usage = getattr(clients.search_service, "last_search_usage", None)
 
-        return results
+        return results, usage
 
     async def generate(
         self,
@@ -370,7 +397,7 @@ async def generate(
         try:
             start_time = time.time()
 
-            search_results = await self.search(org_slug, query, top_k=RAG_TOP_K, file_ids=file_ids)
+            search_results, embedding_usage = await self.search(org_slug, query, top_k=RAG_TOP_K, file_ids=file_ids)
 
             if not search_results:
                 return {
@@ -423,8 +450,10 @@ async def generate(
             processing_time = (time.time() - start_time) * 1000
             logger.info("Generation completed in {:.2f}ms", processing_time)
 
-            # Combine embedding usage (from search step) + LLM usage
-            embedding_usage = getattr(self, "last_search_usage", None)
+            # Combine embedding usage (from search step) + LLM usage.
+            # `embedding_usage` is the local var bound from `await
+            # self.search(...)` above, so this is correct under
+            # concurrent calls.
             embedding_tokens = embedding_usage.prompt_tokens if embedding_usage else 0
             llm_input = completion.usage.prompt_tokens if completion.usage else 0
             llm_output = completion.usage.completion_tokens if completion.usage else 0
@@ -769,6 +798,13 @@ async def shutdown(self) -> None:
                         exc_info=True,
                     )
         self._org_clients.clear()
+
+        # Drain pending `_safe_close` tasks so they don't keep running
+        # after the pool is closed. `return_exceptions=True` ensures one
+        # failing close doesn't prevent the others from being awaited.
+        if _background_tasks:
+            await asyncio.gather(*_background_tasks, return_exceptions=True)
+
         await close_pool()
         self.initialized = False
 
diff --git a/services/rag/tests/test_rag_service.py b/services/rag/tests/test_rag_service.py
index e5b26bccb7..568d33b8dd 100644
--- a/services/rag/tests/test_rag_service.py
+++ b/services/rag/tests/test_rag_service.py
@@ -209,9 +209,13 @@ async def test_delegates_to_search_service(self):
         with patch("app.services.rag_service.settings") as mock_settings:
             mock_settings.top_k = 10
             mock_settings.similarity_threshold = 0.0
-            results = await service.search(TEST_ORG, "test query", file_ids=["doc-1"])
+            results, usage = await service.search(TEST_ORG, "test query", file_ids=["doc-1"])
 
         assert len(results) == 2
+        # `search` returns a (results, usage) tuple now — the usage
+        # value is the per-call embedding usage attached to the search
+        # service, not a shared singleton.
+        assert usage is service._search_service.last_search_usage
         service._search_service.search.assert_awaited_once_with(
             "test query",
             file_ids=["doc-1"],
@@ -263,7 +267,7 @@ async def test_custom_threshold_overrides_settings(self):
         with patch("app.services.rag_service.settings") as mock_settings:
             mock_settings.top_k = 10
             mock_settings.similarity_threshold = 0.9
-            results = await service.search(TEST_ORG, "query", similarity_threshold=0.3)
+            results, _usage = await service.search(TEST_ORG, "query", similarity_threshold=0.3)
 
         assert len(results) == 1
 
@@ -278,7 +282,7 @@ async def test_zero_threshold_returns_all(self):
         with patch("app.services.rag_service.settings") as mock_settings:
             mock_settings.top_k = 10
             mock_settings.similarity_threshold = 0.0
-            results = await service.search(TEST_ORG, "query")
+            results, _usage = await service.search(TEST_ORG, "query")
 
         assert len(results) == 1
 
@@ -317,10 +321,15 @@ async def test_generates_response_with_search_results(self):
                 service,
                 "search",
                 new_callable=AsyncMock,
-                return_value=[
-                    {"content": "Context chunk 1", "score": 0.9, "file_id": "d1"},
-                    {"content": "Context chunk 2", "score": 0.8, "file_id": "d2"},
-                ],
+                # `search` returns `(results, usage)` — usage is None
+                # here since we only care about the LLM completion side.
+                return_value=(
+                    [
+                        {"content": "Context chunk 1", "score": 0.9, "file_id": "d1"},
+                        {"content": "Context chunk 2", "score": 0.8, "file_id": "d2"},
+                    ],
+                    None,
+                ),
             ),
             patch("app.services.rag_service.settings") as mock_settings,
         ):
@@ -339,7 +348,7 @@ async def test_empty_search_results_returns_no_info_message(self):
             service,
             "search",
             new_callable=AsyncMock,
-            return_value=[],
+            return_value=([], None),
         ):
             result = await service.generate(TEST_ORG, "Unknown topic?")
 
@@ -364,7 +373,10 @@ async def test_llm_receives_system_prompt_and_context(self):
                 service,
                 "search",
                 new_callable=AsyncMock,
-                return_value=[{"content": "relevant info", "score": 0.9, "file_id": "d1"}],
+                return_value=(
+                    [{"content": "relevant info", "score": 0.9, "file_id": "d1"}],
+                    None,
+                ),
             ),
             patch("app.services.rag_service.settings") as mock_settings,
         ):
@@ -391,7 +403,10 @@ async def test_empty_llm_choices_raises(self):
                 service,
                 "search",
                 new_callable=AsyncMock,
-                return_value=[{"content": "info", "score": 0.9, "file_id": "d1"}],
+                return_value=(
+                    [{"content": "info", "score": 0.9, "file_id": "d1"}],
+                    None,
+                ),
             ),
             patch("app.services.rag_service.settings") as mock_settings,
         ):
@@ -413,7 +428,12 @@ async def test_context_truncated_at_max_chars(self):
         large_chunks = [{"content": "x" * 100_000, "score": 0.9 - i * 0.01, "file_id": f"d{i}"} for i in range(5)]
 
         with (
-            patch.object(service, "search", new_callable=AsyncMock, return_value=large_chunks),
+            patch.object(
+                service,
+                "search",
+                new_callable=AsyncMock,
+                return_value=(large_chunks, None),
+            ),
             patch("app.services.rag_service.settings") as mock_settings,
         ):
             mock_settings.get_llm_config.return_value = {"model": "m"}
@@ -426,7 +446,12 @@ async def test_context_truncated_at_max_chars(self):
     async def test_passes_file_ids_to_search(self):
         service = _make_service()
 
-        with patch.object(service, "search", new_callable=AsyncMock, return_value=[]) as mock_search:
+        with patch.object(
+            service,
+            "search",
+            new_callable=AsyncMock,
+            return_value=([], None),
+        ) as mock_search:
             await service.generate(TEST_ORG, "q", file_ids=["doc-1"])
 
         mock_search.assert_awaited_once()
@@ -447,7 +472,10 @@ async def test_none_content_from_llm_returns_empty_string(self):
                 service,
                 "search",
                 new_callable=AsyncMock,
-                return_value=[{"content": "info", "score": 0.9, "file_id": "d1"}],
+                return_value=(
+                    [{"content": "info", "score": 0.9, "file_id": "d1"}],
+                    None,
+                ),
             ),
             patch("app.services.rag_service.settings") as mock_settings,
         ):
diff --git a/tools/cli/src/commands/migrate.ts b/tools/cli/src/commands/migrate.ts
index d2e3b8cbde..6b1cb4175f 100644
--- a/tools/cli/src/commands/migrate.ts
+++ b/tools/cli/src/commands/migrate.ts
@@ -20,8 +20,8 @@ export function createMigrateCommand(): Command {
     .option('--dry-run', 'Preview moves without changing files', false)
     .option(
       '--cleanup-old',
-      'After verifying new == old (sha256), remove the old-path secrets. ' +
-        'Run only after the new deployment is healthy.',
+      'After verifying new == old (byte-for-byte), remove the old-path ' +
+        'secrets. Run only after the new deployment is healthy.',
       false,
     )
     .action(async (opts: { dryRun?: boolean; cleanupOld?: boolean }) => {
diff --git a/tools/cli/src/lib/actions/deploy.ts b/tools/cli/src/lib/actions/deploy.ts
index 82ee5fcec0..9cfc2995d5 100644
--- a/tools/cli/src/lib/actions/deploy.ts
+++ b/tools/cli/src/lib/actions/deploy.ts
@@ -622,11 +622,10 @@ export async function deploy(options: DeployOptions): Promise<void> {
   }
 }
 
-// Org slug shape — must match validateOrgSlug at services/platform/convex/lib/file_io.ts.
+// Org slug shape — must match validateOrgSlug at services/platform/lib/shared/constants/org-slug.ts.
 // Duplicated here because the CLI ships in a single compiled binary that does
 // not import convex sources at runtime.
 const ORG_SLUG_REGEX = /^[a-z0-9][a-z0-9_-]*$/;
-const MAX_ORG_SLUG_LENGTH = 64;
 
 // Top-level names under the project root that are legitimate per-domain
 // dirs from the OLD flat layout (`agents/`, `workflows/`, …). Under
@@ -634,7 +633,7 @@ const MAX_ORG_SLUG_LENGTH = 64;
 // it's a legacy project that hasn't been re-init'd. Refuse to push (would
 // silently land in `/app/data/agents/` etc., which the new resolvers don't
 // read) and point the operator at `tale init --force`.
-const LEGACY_DOMAIN_DIR_NAMES = new Set([
+export const LEGACY_DOMAIN_DIR_NAMES = new Set([
   'agents',
   'workflows',
   'integrations',
@@ -645,11 +644,10 @@ const LEGACY_DOMAIN_DIR_NAMES = new Set([
 ]);
 
 function isValidOrgSlug(name: string): boolean {
-  return (
-    name.length > 0 &&
-    name.length <= MAX_ORG_SLUG_LENGTH &&
-    ORG_SLUG_REGEX.test(name)
-  );
+  // Mirrors `validateOrgSlug` in shared/constants/org-slug.ts — no length
+  // cap (the canonical validator imposes none, and adding one here would
+  // silently drop legitimate long slugs from compose mounts).
+  return name === 'default' || ORG_SLUG_REGEX.test(name);
 }
 
 async function findOrgDirs(
diff --git a/tools/cli/src/lib/actions/migrate-config-layout.ts b/tools/cli/src/lib/actions/migrate-config-layout.ts
index 67e209e78e..6771dcdec0 100644
--- a/tools/cli/src/lib/actions/migrate-config-layout.ts
+++ b/tools/cli/src/lib/actions/migrate-config-layout.ts
@@ -55,9 +55,17 @@ export async function migrateConfigLayout(
 
   const containerName = `${getProjectId()}-convex`;
   if (!(await isContainerRunning(containerName))) {
+    // Earlier the message said "e.g. `tale deploy`", but `tale deploy`
+    // now hard-fails on legacy layout — creating a deadlock for fresh
+    // upgrades where the operator stopped the convex container before
+    // running migrate. Point at `tale start` (which only fails when
+    // legacy layout is present at the host) and the docs runbook.
     throw new Error(
       `Convex container "${containerName}" is not running. ` +
-        'Start the platform first (e.g. `tale deploy`) before running this migration.',
+        'Start the OLD platform first (`tale start` or `docker compose start convex`) ' +
+        'so the migrate script can run against the still-mounted volume, then re-run ' +
+        '`tale migrate config-layout`. See docs/<locale>/self-hosted/operate/upgrades.md ' +
+        'for the full migrate → deploy → cleanup runbook.',
     );
   }
 
@@ -72,7 +80,7 @@ export async function migrateConfigLayout(
     logger.step(
       dryRun
         ? '[DRY-RUN] Cleanup-old: would verify and remove old-path secrets'
-        : 'Verifying + removing old-path secrets (sha-matched against new paths)...',
+        : 'Verifying + removing old-path secrets (byte-for-byte matched against new paths)...',
     );
   } else {
     logger.step(
@@ -98,7 +106,12 @@ export async function migrateConfigLayout(
     );
   }
   if (result.stderr) {
-    // Warnings printed to stderr (e.g. SKIP messages) are not fatal but worth surfacing.
+    // The script now sends only true `ERROR:` lines to stderr (SKIP
+    // notices go to stdout). On a clean run we still see nothing here;
+    // any non-empty stderr on success means the script encountered a
+    // recoverable conflict (dst collision, invalid slug shape) that
+    // didn't bump errors past zero — surface it loudly so the operator
+    // notices.
     logger.warn(result.stderr.trim());
   }
 }
diff --git a/tools/cli/src/lib/actions/reseed-all-orgs.ts b/tools/cli/src/lib/actions/reseed-all-orgs.ts
index a3e9699001..35cdf5fa62 100644
--- a/tools/cli/src/lib/actions/reseed-all-orgs.ts
+++ b/tools/cli/src/lib/actions/reseed-all-orgs.ts
@@ -48,6 +48,13 @@ export interface ReseedAllOrgsOptions {
  * `/app/`). No `cd /app/services/platform` — that path does not exist
  * at runtime.
  */
+const RESEED_TIMEOUT_S = 1800;
+const RESEED_TIMEOUT_EXIT = 124;
+
+// The shell pipeline appends `|| true` to the grep so a zero-match
+// outcome (grep exits 1) does not poison `set -o pipefail`. The real
+// signal is `bunx convex run`'s exit code, captured before the grep
+// strips banner lines.
 const RESEED_SCRIPT = `set -eo pipefail
 source /app/env.sh
 env_normalize_common
@@ -55,12 +62,12 @@ source /app/generate-admin-key.sh
 ensure_instance_secret
 ADMIN_KEY=$(generate_key "$INSTANCE_NAME" "$INSTANCE_SECRET")
 cd /app
-HOME=/home/app timeout 1800 bunx convex run \\
+HOME=/home/app timeout ${RESEED_TIMEOUT_S} bunx convex run \\
   organizations/reseed_all_orgs:reseedAllOrgsFromBuiltin \\
   --url "\${CONVEX_URL:-http://convex:3210}" \\
   --admin-key "$ADMIN_KEY" \\
   --no-push 2>&1 \\
-  | grep -v "^Admin key\\|^📋\\|^✅ Admin\\|^━\\|^🌐\\|^$\\|Steps:\\|Open\\|Enter\\|Paste"
+  | { grep -v "^Admin key\\|^📋\\|^✅ Admin\\|^━\\|^🌐\\|^$\\|Steps:\\|Open\\|Enter\\|Paste" || true; }
 `;
 
 const CONFIRM_MESSAGE =
@@ -80,36 +87,46 @@ type ReseedResult = {
 };
 
 /**
- * Extract the last JSON object from a stream of mixed-output stdout.
+ * Extract the trailing JSON object from a stream of mixed-output stdout.
  * `bunx convex run` prints `null` for void-returning actions or the
- * action's return value for value-returning ones. Either way, the JSON
- * payload is on its own line(s) at the very end.
+ * action's return value for value-returning ones. We want the LAST
+ * line(s) that form a parseable JSON object whose shape matches
+ * `ReseedResult` — not just "anything after the last `{`", which would
+ * mis-parse when per-org error strings include `{` (e.g. a JS object
+ * literal in an error message).
+ *
+ * Strategy:
+ *   1. Split into lines.
+ *   2. Walk backwards; for each starting line that begins with `{`,
+ *      try `JSON.parse(joinedSlice)`.
+ *   3. First parse that produces a shape-validated ReseedResult wins.
  */
 function parseTrailingJson(stdout: string): ReseedResult | null {
   const trimmed = stdout.trim();
   if (!trimmed) return null;
 
-  // Walk backwards from the end looking for the start of a JSON value.
-  // The action returns an object, so look for the matching `{`.
-  const lastBrace = trimmed.lastIndexOf('{');
-  if (lastBrace < 0) return null;
-
-  try {
-    const parsed = JSON.parse(trimmed.slice(lastBrace));
-    if (
-      parsed &&
-      typeof parsed === 'object' &&
-      typeof parsed.total === 'number' &&
-      typeof parsed.succeeded === 'number' &&
-      typeof parsed.failed === 'number' &&
-      Array.isArray(parsed.results)
-    ) {
-      return parsed as ReseedResult;
+  const lines = trimmed.split('\n');
+  for (let i = lines.length - 1; i >= 0; i--) {
+    const candidate = lines[i].trimStart();
+    if (!candidate.startsWith('{')) continue;
+    const slice = lines.slice(i).join('\n');
+    try {
+      const parsed: unknown = JSON.parse(slice);
+      if (
+        parsed &&
+        typeof parsed === 'object' &&
+        typeof (parsed as Record<string, unknown>).total === 'number' &&
+        typeof (parsed as Record<string, unknown>).succeeded === 'number' &&
+        typeof (parsed as Record<string, unknown>).failed === 'number' &&
+        Array.isArray((parsed as Record<string, unknown>).results)
+      ) {
+        return parsed as ReseedResult;
+      }
+    } catch {
+      // Not a complete JSON value starting at this line; try earlier.
     }
-    return null;
-  } catch {
-    return null;
   }
+  return null;
 }
 
 export async function reseedAllOrgsFromBuiltin(
@@ -164,6 +181,36 @@ export async function reseedAllOrgsFromBuiltin(
     if (result.stderr) {
       logger.error(result.stderr.trim());
     }
+
+    // Special-case `timeout(1)`'s SIGTERM exit so the operator sees
+    // "timed out" rather than a generic "raised". The action is
+    // idempotent so re-running is always safe.
+    if (result.exitCode === RESEED_TIMEOUT_EXIT) {
+      throw new Error(
+        `--override-all timed out after ${RESEED_TIMEOUT_S}s in ${container}. ` +
+          `The reseed action may still be running on the convex side; ` +
+          `wait a minute, then re-run (idempotent).`,
+      );
+    }
+
+    // Parse the trailing JSON payload on the failure branch too — the
+    // action emits it before throwing so per-org slug detail survives
+    // the non-zero exit and reaches CI logs as structured data.
+    const failed = parseTrailingJson(result.stdout);
+    if (failed) {
+      const failedSlugs = failed.results
+        .filter(
+          (r): r is { slug: string; status: 'error'; error: string } =>
+            r.status === 'error',
+        )
+        .map((r) => `${r.slug}: ${r.error.split('\n')[0]}`)
+        .join('; ');
+      throw new Error(
+        `--override-all failed: ${failed.failed}/${failed.total} orgs raised — ${failedSlugs}. ` +
+          `Re-run after addressing the listed orgs (the action is idempotent).`,
+      );
+    }
+
     throw new Error(
       `--override-all failed: reseed action raised in ${container}. ` +
         `Per-org detail above; partial state on disk — re-run --override-all ` +
diff --git a/tools/cli/src/lib/actions/start.ts b/tools/cli/src/lib/actions/start.ts
index 51e2e56572..97b35cb38e 100644
--- a/tools/cli/src/lib/actions/start.ts
+++ b/tools/cli/src/lib/actions/start.ts
@@ -16,6 +16,7 @@ import { exec } from '../docker/exec';
 import { findProject } from '../project/find-project';
 import { resolveOrAssignProjectContext } from '../project/project-context';
 import { withLock } from '../state/with-lock';
+import { LEGACY_DOMAIN_DIR_NAMES } from './deploy';
 import { init } from './init';
 
 async function assertDockerAvailable(): Promise<void> {
@@ -134,49 +135,38 @@ export async function start(options: StartOptions): Promise<void> {
     }
   }
 
+  // Environment setup runs unconditionally so `tale start` after a CLI
+  // upgrade that introduces a new auto-secret (e.g. SANDBOX_TOKEN) picks
+  // it up before compose starts — matches `tale deploy` semantics so
+  // both commands give the same surface behavior.
   const envPath = join(projectDir, '.env');
-  if (!existsSync(envPath)) {
-    logger.warn('No .env file found. Running environment setup...');
-    logger.blank();
-    const { ensureEnv } = await import('../config/ensure-env');
-    const { success } = await ensureEnv({ deployDir: projectDir });
-    if (!success) {
-      throw new Error(
-        'Environment setup failed. Cannot start without .env file.',
-      );
-    }
+  const { ensureEnv } = await import('../config/ensure-env');
+  const { success: envOk } = await ensureEnv({ deployDir: projectDir });
+  if (!envOk) {
+    throw new Error(
+      `Environment setup failed. Cannot start without ${envPath}.`,
+    );
   }
 
   // Detect legacy flat-layout dirs at the project root (`agents/`,
-  // `workflows/`, …). Under the org-first layout these belong under
-  // `default/<domain>/` instead — the platform's resolvers won't read
-  // anything at the old paths. Surface the runbook so the operator
-  // doesn't boot into a "nothing's working" state.
-  const LEGACY_FLAT_DOMAINS = [
-    'agents',
-    'workflows',
-    'integrations',
-    'branding',
-    'providers',
-    'skills',
-  ];
-  const legacyDirsFound = LEGACY_FLAT_DOMAINS.filter((d) =>
+  // `workflows/`, …, `retention/`). Under the org-first layout these
+  // belong under `default/<domain>/` — the platform's resolvers won't
+  // read anything at the old paths. Same constant + same hard-fail as
+  // `tale deploy`: both commands either accept or refuse the layout
+  // identically. (Earlier this file warn-and-proceeded, which let a
+  // project pass `tale start` but fail `tale deploy`.)
+  const legacyDirsFound = [...LEGACY_DOMAIN_DIR_NAMES].filter((d) =>
     existsSync(join(projectDir, d)),
   );
   if (legacyDirsFound.length > 0) {
-    logger.warn(
-      `Legacy flat layout detected at project root: ${legacyDirsFound.map((d) => `${d}/`).join(', ')}`,
-    );
-    logger.info(
-      '  The org-first layout expects these under `default/<domain>/` (or another org subtree).',
-    );
-    logger.info(
-      '  Migrate with: `tale migrate config-layout` then `tale deploy --override-all -y`.',
+    throw new Error(
+      `Legacy flat layout detected at project root: ${legacyDirsFound
+        .map((d) => `${d}/`)
+        .join(', ')}\n` +
+        '  The org-first layout expects these under `default/<domain>/` (or another org subtree).\n' +
+        '  Migrate with: `tale migrate config-layout` then `tale deploy --override-all -y`.\n' +
+        '  See docs/<locale>/self-hosted/operate/upgrades.md for the full runbook.',
     );
-    logger.info(
-      '  See docs/<locale>/self-hosted/operate/upgrades.md for the full runbook.',
-    );
-    logger.blank();
   }
 
   await assertDockerAvailable();
diff --git a/tools/cli/src/lib/migrate-config-layout/script.sh b/tools/cli/src/lib/migrate-config-layout/script.sh
index e69cf4c1e2..da030cf639 100644
--- a/tools/cli/src/lib/migrate-config-layout/script.sh
+++ b/tools/cli/src/lib/migrate-config-layout/script.sh
@@ -57,10 +57,13 @@ copy_secret() {
   local dst_dir; dst_dir="$(dirname "$dst")"
   if [ -e "$dst" ]; then
     if cmp -s "$src" "$dst" 2>/dev/null; then
+      # SKIP belongs to stdout (informational, expected on re-run);
+      # only true ERROR lines go to stderr so the CLI wrapper can
+      # distinguish noise from real failures.
       skipped=$((skipped+1)); echo "SKIP (already migrated): $src"
       return 0
     else
-      conflicts+=("$src ≠ $dst")
+      conflicts+=("$src != $dst")
       errors=$((errors+1))
       echo "ERROR: $dst exists but differs from $src; refusing to overwrite" >&2
       return 0
@@ -90,7 +93,7 @@ remove_old_secret() {
     return 0
   fi
   if ! cmp -s "$old" "$new" 2>/dev/null; then
-    conflicts+=("$old ≠ $new")
+    conflicts+=("$old != $new")
     errors=$((errors+1))
     echo "ERROR: $old and $new differ; refusing to remove $old" >&2
     return 0
@@ -108,6 +111,27 @@ remove_old_secret() {
 # ---------------------------------------------------------------------------
 # Enumeration
 # ---------------------------------------------------------------------------
+
+# Pre-scan: flag when both the flat path (providers/foo.secrets.json)
+# and the nested path (providers/default/foo.secrets.json) would map to
+# the same destination. Without this, copy_secret's per-pair cmp -s
+# would surface only one of the two as an error, leaving the operator
+# guessing which source was the "real" one.
+detect_default_dst_collisions() {
+  [ -d "$DATA/providers/default" ] || return 0
+  for f in "$DATA"/providers/*.secrets.json; do
+    [ -f "$f" ] || continue
+    local base nested
+    base="$(basename "$f")"
+    nested="$DATA/providers/default/$base"
+    if [ -f "$nested" ]; then
+      conflicts+=("dst collision: $f and $nested both map to $DATA/default/providers/$base")
+      errors=$((errors+1))
+      echo "ERROR: $f and $nested both target $DATA/default/providers/$base; manual reconcile required" >&2
+    fi
+  done
+}
+
 process_secret() {
   local src="$1" dst="$2"
   if [ "$CLEANUP_OLD" = 1 ]; then
@@ -117,6 +141,8 @@ process_secret() {
   fi
 }
 
+detect_default_dst_collisions
+
 # Default org: top-level $DATA/providers/*.secrets.json → $DATA/default/providers/
 if [ -d "$DATA/providers" ]; then
   for f in "$DATA"/providers/*.secrets.json; do
@@ -133,13 +159,18 @@ if [ -d "$DATA/providers" ]; then
     case "$org" in
       .*) continue ;;
     esac
-    # Validate against ORG_SLUG_REGEX (keep in sync with validateOrgSlug
-    # at services/platform/convex/lib/file_io.ts). Anything that doesn't
-    # match is skipped with a warning — defends against `.history` or
-    # future hidden markers leaking into the iteration.
-    if ! [[ "$org" =~ ^[a-z0-9][a-z0-9_-]{0,63}$ ]]; then
-      echo "SKIP (not a valid org slug): $org" >&2
-      skipped=$((skipped+1))
+    # Validate against ORG_SLUG_REGEX (keep in sync with
+    # services/platform/lib/shared/constants/org-slug.ts). No length
+    # cap here — the canonical validator imposes none, and silently
+    # dropping long-but-valid slugs would lose their secrets on
+    # --cleanup-old. Anything that fails the shape is recorded as an
+    # error + conflict so the summary surfaces it (legacy slugs from a
+    # prior, more-permissive regime get an actionable diagnostic
+    # rather than disappearing).
+    if ! [[ "$org" =~ ^[a-z0-9][a-z0-9_-]*$ ]]; then
+      conflicts+=("invalid org slug under providers/: $org")
+      errors=$((errors+1))
+      echo "ERROR: providers/$org/ has invalid slug shape; manual reconcile required" >&2
       continue
     fi
     for f in "$d"*.secrets.json; do

From 68bffa6b80b6a4ab68cdfce3bd7ba61c4ef819e9 Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Thu, 28 May 2026 15:06:54 +0800
Subject: [PATCH 05/41] docs(docs): remove version-specific org-first migration
 runbook

The upgrades.md page is the evergreen operate doc (two-step flow,
blue-green, rollback, semver). A one-time migration between specific
versions belongs in that release's Migration notes, not here.
---
 docs/de/self-hosted/operate/upgrades.md | 42 -------------------------
 docs/en/self-hosted/operate/upgrades.md | 41 ------------------------
 docs/fr/self-hosted/operate/upgrades.md | 42 -------------------------
 3 files changed, 125 deletions(-)

diff --git a/docs/de/self-hosted/operate/upgrades.md b/docs/de/self-hosted/operate/upgrades.md
index 887b37d01e..b2a3a38b3a 100644
--- a/docs/de/self-hosted/operate/upgrades.md
+++ b/docs/de/self-hosted/operate/upgrades.md
@@ -80,45 +80,3 @@ Minor-Versionen zu überspringen (von 0.9 auf 0.11 zu gehen) ist unterstützt, s
 ## Wo das hingehört
 
 Der Upgrade-Flow knüpft jede andere Operate-Seite an — Backups sind das, was ein gescheitertes Upgrade wiederherstellbar macht, Observability ist das, was dir sagt, dass die neue Farbe healthy ist, Hardening ist das, was du nach einer Major-Version neu durchgehst. Setzt du das CLI zum ersten Mal auf, deckt [Tale-CLI installieren](/de/self-hosted/install/cli-install) das workstationseitige Setup ab; nimmst du den Pager mitten im Rollout auf, nennt [Troubleshooting](/de/self-hosted/operate/observability/troubleshooting) die Symptome.
-
-## Migration auf das Org-first-Config-Layout
-
-Ältere Tale-Releases haben Config in einem flachen Baum im Workspace-Root abgelegt (`agents/`, `workflows/`, `integrations/`, `branding/`, `providers/`, `skills/`). Aktuelles Tale nutzt ein **Org-first**-Layout, in dem jede Org — auch die kanonische `default` — ihren eigenen Unterbaum besitzt: `<root>/<org>/<domain>/...`. Die Migration ist opt-in und läuft einmal pro Workspace. Die neue Plattform liest die alten Pfade nicht mehr; bis du migrierst, liegen Provider-Secrets und Anpassungen in Verzeichnissen, die das Runtime nicht mehr anschaut.
-
-Die Migration sind drei Kommandos. Für Schritt 1 muss der Convex-Container vom **alten** Image noch laufen — halt die Plattform auf der alten Version online und führe Schritt 1 gegen diesen laufenden Container aus, bevor du upgradest.
-
-```bash
-# 1. Provider-Secrets aus dem flachen Layout nach
-#    `default/providers/...` kopieren. cp statt mv, damit die alten
-#    Pfade für einen möglichen Rollback intakt bleiben. Scope sind
-#    ausschließlich Provider-Secrets; alle anderen Domains (agents,
-#    workflows, integrations, skills, branding, retention) werden in
-#    Schritt 2 server-seitig aus dem Builtin-Katalog re-seedet.
-tale migrate config-layout
-
-# 2. Convex-Container gegen das Org-first-Volume-Layout neu erstellen
-#    und den server-seitigen Reseed über jede registrierte Org laufen
-#    lassen. Impliziert `--all`; `-y` überspringt den destruktiven
-#    Bestätigungs-Prompt für CI / Skript-Läufe.
-tale deploy --override-all -y
-
-# 3. Wenn du das neue Layout verifiziert hast, alte Pfade entfernen.
-#    Verifiziert byte-für-byte, dass die neue Datei der alten
-#    entspricht, bevor unlink; bei Mismatch wird das Löschen
-#    verweigert.
-tale migrate config-layout --cleanup-old
-```
-
-Schritt 1 ist safe und reversibel — ein Re-Run ist no-op, sobald Pfade existieren. Schritt 2 ist destruktiv: jede Org-Config mit Katalog-Name (`*.json` unter `agents/`, `workflows/`, `integrations/`, `skills/`, `branding/branding.json`, `retention.json`) wird mit dem Builtin-Katalog überschrieben. `*.secrets.json`-Dateien, `.history/`-Trails und hochgeladene `branding/images/*` bleiben server-seitig erhalten. Nach Schritt 2 liest die Plattform ausschließlich aus dem Org-first-Layout.
-
-Schritt 3 ist der Point-of-no-Return für Downgrades — siehe unten.
-
-### Org-first-Migration zurückrollen
-
-Zwischen Schritt 1 und 3 kannst du sauber downgraden. Der Convex-Entrypoint markiert jeden Seed-Lauf mit einem Token, das die Layout-Version enthält (`.seeded-<version>-orgfirst`); ein älteres Binary, das diesen Token nicht erkennt, re-seedet idempotent in seine eigenen (flachen) Pfade, und Schritt 1's `cp` hat die alten Pfade intakt gelassen. Downgrade ist ein normales `tale rollback`.
-
-Nach Schritt 3 (`--cleanup-old`) sind die alten Pfade weg. Downgrade re-seedet das Layout zwar weiterhin korrekt via Marker-Token-Mechanismus, aber die App startet mit leeren Provider-Secrets — stelle sie aus dem Backup wieder her (siehe [Backups und Restore](/de/self-hosted/operate/backups-and-restore)), bevor du Traffic wieder aufnimmst.
-
-### Was, wenn ich Schritt 1 überspringe?
-
-`tale deploy` und `tale start` verweigern beide den Start, wenn sie übrig gebliebene flache Layout-Dirs (`agents/`, `workflows/`, `integrations/`, `branding/`, `providers/`, `skills/`, `retention/`) im Workspace-Root finden. Der Fehler nennt die betroffenen Verzeichnisse und verweist auf dieses Runbook. Die Korrektur sind Schritt 1 + 2 in dieser Reihenfolge; es gibt keinen "trotzdem deployen und Legacy-Pfade ignorieren"-Modus — die Runtime-Resolver lesen diese Pfade nicht, ein Boot ohne Migration würde die Plattform also mit leerer Config zurücklassen.
diff --git a/docs/en/self-hosted/operate/upgrades.md b/docs/en/self-hosted/operate/upgrades.md
index b2c8864ecd..09e9d0c993 100644
--- a/docs/en/self-hosted/operate/upgrades.md
+++ b/docs/en/self-hosted/operate/upgrades.md
@@ -80,44 +80,3 @@ Skipping minor versions (going from 0.9 to 0.11) is supported as long as the int
 ## Where this fits
 
 The upgrade flow ties together every other operate page — backups are what makes a failed upgrade recoverable, observability is what tells you the new colour is healthy, hardening is what you re-walk after a major version. If you are setting up the CLI for the first time, [Install the tale CLI](/self-hosted/install/cli-install) covers the workstation-side setup; if you are picking up the pager mid-rollout, [Troubleshooting](/self-hosted/operate/observability/troubleshooting) names the symptoms.
-
-## Migrating to the org-first config layout
-
-Older Tale releases stored config in a flat tree at the workspace root (`agents/`, `workflows/`, `integrations/`, `branding/`, `providers/`, `skills/`). Current Tale uses an **org-first** layout where every org — including the canonical `default` — owns its own subtree: `<root>/<org>/<domain>/...`. The migration is opt-in and runs once per workspace. The new platform refuses to read the legacy paths; until you migrate, your provider secrets and customizations live in directories the runtime no longer looks at.
-
-The migration is three commands. The convex container from the **old** image must still be running for step 1 — keep the platform up on the old version, then run step 1 against that running container before upgrading.
-
-```bash
-# 1. Copy provider secrets from the flat layout into
-#    `default/providers/...`. cp not mv, so the old paths stay intact
-#    in case you need to roll back. Scope is provider secrets only;
-#    every other domain (agents, workflows, integrations, skills,
-#    branding, retention) is re-seeded server-side by step 2 from the
-#    builtin catalog.
-tale migrate config-layout
-
-# 2. Recreate the Convex container against the org-first volume layout
-#    and run the server-side reseed across every registered org. Implies
-#    `--all`; `-y` skips the destructive-write confirmation prompt for
-#    CI / scripted runs.
-tale deploy --override-all -y
-
-# 3. Once you have verified the new layout is intact, remove the legacy
-#    paths. Verifies that the new file matches the old byte-for-byte
-#    before unlinking; refuses to delete on any mismatch.
-tale migrate config-layout --cleanup-old
-```
-
-Step 1 alone is safe and reversible — re-running it is a no-op once paths exist. Step 2 is destructive: every org's catalog-named config (`*.json` under `agents/`, `workflows/`, `integrations/`, `skills/`, `branding/branding.json`, `retention.json`) is overwritten with the builtin catalog. `*.secrets.json` files, `.history/` trails, and uploaded `branding/images/*` are preserved server-side. After step 2, the platform reads exclusively from the org-first layout.
-
-Step 3 is the point of no return for downgrades — see below.
-
-### Rolling back the org-first migration
-
-Between steps 1 and 3 you can downgrade cleanly. The Convex entrypoint marks each seed run with a token that includes the layout version (`.seeded-<version>-orgfirst`); an older binary that does not recognize the token re-seeds idempotently into its own (flat) paths, and step 1's `cp` left the legacy paths intact. Downgrade is a normal `tale rollback`.
-
-After step 3 (`--cleanup-old`), the legacy paths are gone. Downgrade still re-seeds layout correctly via the marker token mechanism, but the app boots with empty provider secrets — restore them from backup (see [Backups and restore](/self-hosted/operate/backups-and-restore)) before resuming traffic.
-
-### What if I skip step 1?
-
-`tale deploy` and `tale start` both refuse to run when they detect leftover flat-layout dirs (`agents/`, `workflows/`, `integrations/`, `branding/`, `providers/`, `skills/`, `retention/`) at the workspace root. The error names the offending directories and points at this runbook. The fix is steps 1 + 2 in order; there is no "deploy anyway and ignore the legacy paths" mode — the runtime resolvers do not read those paths, so booting without migrating would leave the platform with empty config.
diff --git a/docs/fr/self-hosted/operate/upgrades.md b/docs/fr/self-hosted/operate/upgrades.md
index 9ce64839f3..621d16424f 100644
--- a/docs/fr/self-hosted/operate/upgrades.md
+++ b/docs/fr/self-hosted/operate/upgrades.md
@@ -80,45 +80,3 @@ Sauter des versions mineures (passer de 0.9 à 0.11) est supporté tant que les
 ## Où cela s'inscrit
 
 Le flow de montée de version noue chaque autre page d'exploitation — les backups sont ce qui rend une montée de version échouée récupérable, l'observabilité est ce qui te dit que la nouvelle couleur est saine, le durcissement est ce que tu reparcours après une version majeure. Si tu mets en place la CLI pour la première fois, [Installer la CLI tale](/fr/self-hosted/install/cli-install) couvre le setup côté workstation ; si tu prends le pager en plein rollout, [Dépannage](/fr/self-hosted/operate/observability/troubleshooting) nomme les symptômes.
-
-## Migration vers la disposition de config org-first
-
-Les anciennes versions de Tale stockaient la config dans une arborescence plate à la racine du workspace (`agents/`, `workflows/`, `integrations/`, `branding/`, `providers/`, `skills/`). La version actuelle utilise une disposition **org-first** où chaque org — y compris la canonique `default` — possède son propre sous-arbre : `<root>/<org>/<domain>/...`. La migration est opt-in et tourne une seule fois par workspace. La nouvelle plateforme refuse de lire les anciens chemins ; tant que tu n'as pas migré, tes secrets de provider et personnalisations vivent dans des répertoires que le runtime ne regarde plus.
-
-La migration tient en trois commandes. Pour l'étape 1, le conteneur Convex de l'**ancienne** image doit encore tourner — garde la plateforme en ligne sur l'ancienne version et lance l'étape 1 contre ce conteneur en cours avant de monter de version.
-
-```bash
-# 1. Copier les secrets de provider depuis la disposition plate vers
-#    `default/providers/...`. cp et non mv, donc les anciens chemins
-#    restent intacts au cas où un rollback serait nécessaire. Le scope
-#    couvre uniquement les secrets de provider ; tous les autres
-#    domaines (agents, workflows, integrations, skills, branding,
-#    retention) sont re-seedés côté serveur à l'étape 2 depuis le
-#    catalogue builtin.
-tale migrate config-layout
-
-# 2. Recréer le conteneur Convex contre la disposition de volume org-first
-#    et lancer le reseed côté serveur sur chaque org enregistrée. Implique
-#    `--all` ; `-y` saute le prompt destructif pour les runs CI / scripts.
-tale deploy --override-all -y
-
-# 3. Une fois la nouvelle disposition vérifiée intacte, supprimer les
-#    anciens chemins. Vérifie byte-à-byte que le nouveau fichier
-#    correspond à l'ancien avant unlink ; refuse de supprimer en cas
-#    de mismatch.
-tale migrate config-layout --cleanup-old
-```
-
-L'étape 1 est sûre et réversible — la rejouer est un no-op une fois les chemins existants. L'étape 2 est destructive : chaque config d'org au nom canonique (`*.json` sous `agents/`, `workflows/`, `integrations/`, `skills/`, `branding/branding.json`, `retention.json`) est écrasée par le catalogue builtin. Les fichiers `*.secrets.json`, les traces `.history/` et les `branding/images/*` uploadés sont préservés côté serveur. Après l'étape 2, la plateforme lit exclusivement depuis la disposition org-first.
-
-L'étape 3 est le point de non-retour pour les downgrades — voir ci-dessous.
-
-### Annuler la migration org-first
-
-Entre les étapes 1 et 3, tu peux downgrader proprement. L'entrypoint Convex marque chaque run de seed avec un token qui inclut la version de disposition (`.seeded-<version>-orgfirst`) ; un binaire plus ancien qui ne reconnaît pas ce token re-seede idempotemment dans ses propres chemins (plats), et le `cp` de l'étape 1 a laissé les anciens chemins intacts. Le downgrade est un `tale rollback` normal.
-
-Après l'étape 3 (`--cleanup-old`), les anciens chemins sont partis. Le downgrade continue à re-seeder la disposition correctement via le mécanisme du token-marker, mais l'app démarre avec des secrets de provider vides — restaure-les depuis le backup (voir [Backups et restauration](/fr/self-hosted/operate/backups-and-restore)) avant de reprendre le trafic.
-
-### Et si je saute l'étape 1 ?
-
-`tale deploy` et `tale start` refusent tous les deux de démarrer s'ils détectent des répertoires restants de la disposition plate (`agents/`, `workflows/`, `integrations/`, `branding/`, `providers/`, `skills/`, `retention/`) à la racine du workspace. L'erreur nomme les répertoires concernés et pointe vers ce runbook. La correction reste les étapes 1 + 2 dans cet ordre ; il n'existe pas de mode « déploie quand même et ignore les chemins legacy » — les résolveurs runtime ne lisent pas ces chemins, donc démarrer sans migrer laisserait la plateforme avec une config vide.

From 14f80011992954211d95ed15acc9756c2c9c4c0c Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Thu, 28 May 2026 15:14:55 +0800
Subject: [PATCH 06/41] chore(platform,cli): drop unused exports flagged by
 knip
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Knip flagged three exports that are only consumed within their own
file: RESERVED_ORG_SLUGS (used by isReservedOrgSlug in the same
module), and the MigrateConfigLayoutOptions / ReseedAllOrgsOptions
interfaces (only the function parameter type — never imported).
---
 services/platform/lib/shared/constants/reserved-org-slugs.ts | 2 +-
 tools/cli/src/lib/actions/migrate-config-layout.ts           | 2 +-
 tools/cli/src/lib/actions/reseed-all-orgs.ts                 | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/services/platform/lib/shared/constants/reserved-org-slugs.ts b/services/platform/lib/shared/constants/reserved-org-slugs.ts
index e2a84342a6..17a5554bce 100644
--- a/services/platform/lib/shared/constants/reserved-org-slugs.ts
+++ b/services/platform/lib/shared/constants/reserved-org-slugs.ts
@@ -12,7 +12,7 @@
  * organization form — kept in `lib/shared/constants/` so it stays
  * Node-runtime-neutral.
  */
-export const RESERVED_ORG_SLUGS: ReadonlySet<string> = new Set(['default']);
+const RESERVED_ORG_SLUGS: ReadonlySet<string> = new Set(['default']);
 
 export function isReservedOrgSlug(slug: string): boolean {
   return RESERVED_ORG_SLUGS.has(slug.toLowerCase());
diff --git a/tools/cli/src/lib/actions/migrate-config-layout.ts b/tools/cli/src/lib/actions/migrate-config-layout.ts
index 6771dcdec0..37def9547a 100644
--- a/tools/cli/src/lib/actions/migrate-config-layout.ts
+++ b/tools/cli/src/lib/actions/migrate-config-layout.ts
@@ -26,7 +26,7 @@ import * as logger from '../../utils/logger';
 import { exec } from '../docker/exec';
 import { isContainerRunning } from '../docker/is-container-running';
 
-export interface MigrateConfigLayoutOptions {
+interface MigrateConfigLayoutOptions {
   dryRun: boolean;
   cleanupOld: boolean;
 }
diff --git a/tools/cli/src/lib/actions/reseed-all-orgs.ts b/tools/cli/src/lib/actions/reseed-all-orgs.ts
index 35cdf5fa62..b11dfc1fa4 100644
--- a/tools/cli/src/lib/actions/reseed-all-orgs.ts
+++ b/tools/cli/src/lib/actions/reseed-all-orgs.ts
@@ -26,7 +26,7 @@ import * as logger from '../../utils/logger';
 import { exec } from '../docker/exec';
 import { findPlatformContainer } from '../docker/find-platform-container';
 
-export interface ReseedAllOrgsOptions {
+interface ReseedAllOrgsOptions {
   dryRun: boolean;
   assumeYes: boolean;
 }

From 7cb91dbbf7c41032d1712ce5ea70031218593735 Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Thu, 28 May 2026 20:48:22 +0800
Subject: [PATCH 07/41] feat(rag,platform): enforce per-tenant org_slug at the
 RAG data layer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes the P0-1 cross-tenant search leak surfaced by the second-round
review (workflow `search` with `fileIds: []` could return chunks across
orgs because RAG's `_build_scope_clause` dropped the WHERE filter when
file_ids was empty), plus the SemanticCache same-similarity leak that
bypassed the SQL filter entirely. Per-tenant isolation no longer
depends on callers passing the right `file_ids` — the data layer
physically partitions every row by `org_slug`.

Schema migration (services/rag/migrations/20260528000001_enforce_org_slug.sql)

- Idempotent forward-only DDL: every step guards on
  information_schema / pg_constraint so re-running is a no-op
  (protects manual `psql -f`, partial-failure recovery, and
  backup-restore drift).
- Rename `documents.team_id` and `chunks.team_id` → `org_slug`;
  backfill any NULL rows to `'default'`; SET NOT NULL DEFAULT
  'default'. Existing single-org deployments transparently land in
  the `default` bucket.
- Replace stale partial indexes with org-scoped covering indexes:
  UNIQUE `(org_slug, file_id)`, `(org_slug, document_id)`, and
  `semantic_cache(org_slug, expires_at)`.
- Composite FK `chunks.(document_id, org_slug) → documents.(id,
  org_slug) ON DELETE CASCADE` enforces at the DB that
  chunks.org_slug can never drift from documents.org_slug.
- Add `semantic_cache.org_slug` column + companion index.

RAG service — SQL scoped end-to-end by org_slug

- indexing_service.py: thread org_slug through `index_document`,
  `find_existing_by_hash`, `clone_from_existing`, `_do_store`,
  `_do_clone`, `_update_progress`. INSERT now writes org_slug;
  UPSERT conflict target is `(org_slug, file_id)`. Cross-org
  content-hash dedup is deliberately disabled — org B's secret
  upload can no longer be probed by hash from org A.
- search_service.py: `_build_scope_clause` ALWAYS emits
  `AND c.org_slug = $N`. Empty/None file_ids no longer drops the
  WHERE clause (closes P0-1). The documents subquery is independently
  org-scoped — defense in depth. `RagSearchService.search` returns
  `(results, usage)` directly; the `last_search_usage` singleton
  race is gone.
- semantic_cache.py: lookup/store/invalidate/cleanup all take
  org_slug. Two orgs asking semantically identical questions get
  independent cache entries. `cleanup(None)` retained for operator-
  side global GC but callers must pass None explicitly.
- rag_service.py: every public method takes org_slug as first arg;
  `delete_document` / `get_document_content` / `get_document_statuses`
  / `compare_documents` are now per-tenant (foreign-org file_id
  returns 0 deletes / 404 rather than touching the foreign row).
- routers/documents.py: `/documents/{id}` DELETE, `/{id}/content`,
  `/statuses`, `/compare` now require `Depends(require_org_slug)`.
  The pre-existing org-agnostic carve-out is gone — the data layer
  needs the slug, so the routes do too.

Platform-side adaptation — orgSlug threaded to ~12 call sites

- `deleteDocumentById({orgSlug, fileId})` + `deleteFromRagBatch(
  {orgSlug, fileIds})` signatures broken; callers in rag_action,
  agents/internal_actions, threads/cascade_helpers, governance
  (erasure ×3, retention_cleanup ×2), documents/internal_actions ×2
  all updated.
- `fetchDocumentContent(orgSlug, ...)`, `fetchDocumentComparison(
  orgSlug, ...)`, `fetchDocumentChunks(orgSlug, ...)` — broken
  signatures + caller updates in workflow document_action,
  retrieve_document tool, etc.
- `filterStorageIdsByCallerOrg` now returns `{storageId,
  organizationId}` pairs so `checkFileRagStatuses` groups by org and
  fans out one RAG call per tenant instead of one global call.
- `deleteKnowledgeFileFromRag` takes organizationId; the scheduler
  callsites in agents/mutations.ts pass it.

Sanitize util renamed: `sanitize_team_id` → `sanitize_org_slug`. No
back-compat shim per the no-backwards-compat-hacks rule.

Tests

- All 30+ touched existing RAG tests updated for the new signatures
  + new SQL parameter positions ($1 = org_slug now).
- New `test_org_isolation.py` (10 cases) + `test_semantic_cache_
  isolation.py` (5 cases) pin the invariant at the application layer:
  empty file_ids never drops the org filter, foreign-org delete
  returns 0, foreign-org content returns None, same-hash cross-org
  probe returns None, etc.
- Platform `document_retrieve_tool.test.ts` mock adds
  `components.betterAuth.adapter` so the new `orgSlugFromId` resolves
  inside the test sandbox.

Verification

- RAG: 313 tests pass (was 298; +15 isolation tests).
- Platform: 274 files / 70941 tests pass; lint clean (2751 files).
- `bun run check`: 36/36 tasks green.

Out of scope (deliberate)

- `RAG_AUTH_TOKEN` enforcement on the RAG side — platform-only auth
  boundary per the project briefing.
- Embedding-dimensions per-org pin — orthogonal; cross-org dim
  mismatch is still a fail-loud availability issue, not a leak.
- Removing the `team_id` column historical references in
  `services/rag/app/utils/sanitize.py` callers — the function has
  no callers, rename is clean.
---
 .../documents/document_retrieve_tool.test.ts  |  13 ++
 .../fetch_document_comparison.test.ts         |  67 ++++--
 .../documents/fetch_document_content.test.ts  |  41 ++--
 .../helpers/fetch_document_comparison.ts      |   5 +
 .../helpers/fetch_document_content.ts         |  12 +-
 .../documents/helpers/retrieve_document.ts    |   4 +-
 .../rag/helpers/fetch_document_chunks.ts      |   3 +-
 .../convex/agents/internal_actions.ts         |   8 +-
 services/platform/convex/agents/mutations.ts  |   4 +-
 .../convex/documents/internal_actions.ts      |  29 ++-
 .../platform/convex/file_metadata/actions.ts  |  79 ++++---
 .../convex/file_metadata/internal_queries.ts  |  17 +-
 .../platform/convex/governance/erasure.ts     |  10 +-
 .../convex/governance/retention_cleanup.ts    |  17 +-
 .../convex/threads/cascade_helpers.ts         |   7 +-
 .../action_defs/document/document_action.ts   |  15 +-
 .../rag/helpers/delete_document.test.ts       |   7 +-
 .../rag/helpers/delete_document.ts            |  21 +-
 .../action_defs/rag/rag_action.ts             |  29 ++-
 services/rag/app/routers/documents.py         |  70 +++---
 services/rag/app/services/indexing_service.py | 117 ++++++----
 services/rag/app/services/rag_service.py      | 123 ++++++-----
 services/rag/app/services/search_service.py   |  92 +++++---
 services/rag/app/services/semantic_cache.py   | 107 ++++++---
 services/rag/app/utils/sanitize.py            |  16 +-
 .../20260528000001_enforce_org_slug.sql       | 138 ++++++++++++
 services/rag/tests/test_background_ingest.py  |  13 +-
 services/rag/tests/test_document_content.py   |  25 ++-
 services/rag/tests/test_file_dates.py         |  27 ++-
 services/rag/tests/test_indexing_service.py   |  28 ++-
 services/rag/tests/test_org_isolation.py      | 208 ++++++++++++++++++
 services/rag/tests/test_rag_service.py        |  55 +++--
 services/rag/tests/test_search_service.py     |  69 +++---
 .../tests/test_semantic_cache_isolation.py    | 145 ++++++++++++
 34 files changed, 1241 insertions(+), 380 deletions(-)
 create mode 100644 services/rag/migrations/20260528000001_enforce_org_slug.sql
 create mode 100644 services/rag/tests/test_org_isolation.py
 create mode 100644 services/rag/tests/test_semantic_cache_isolation.py

diff --git a/services/platform/convex/agent_tools/documents/document_retrieve_tool.test.ts b/services/platform/convex/agent_tools/documents/document_retrieve_tool.test.ts
index 00c6284b27..b74d5adfd5 100644
--- a/services/platform/convex/agent_tools/documents/document_retrieve_tool.test.ts
+++ b/services/platform/convex/agent_tools/documents/document_retrieve_tool.test.ts
@@ -18,6 +18,15 @@ vi.mock('../../_generated/api', () => ({
       },
     },
   },
+  // `orgSlugFromId` (called inside retrieveDocument before forwarding
+  // the request to RAG) hits `components.betterAuth.adapter.findOne`.
+  components: {
+    betterAuth: {
+      adapter: {
+        findOne: 'mock-better-auth-find-one',
+      },
+    },
+  },
 }));
 
 vi.mock('../../lib/helpers/rag_config', () => ({
@@ -41,6 +50,7 @@ function createMockCtx(overrides?: Record<string, unknown>) {
       title: 'Test',
     }) // findDocumentByFileId
     .mockResolvedValueOnce(['doc123', 'doc456']) // getAccessibleDocumentIds
+    .mockResolvedValueOnce({ slug: 'org-1' }) // orgSlugFromId → betterAuth findOne
     .mockResolvedValueOnce([]); // lookupVideoLinkSources — no video-link metadata
   return {
     organizationId: 'org1',
@@ -214,6 +224,7 @@ describe('retrieveDocument helper', () => {
           storageId: 'chat-upload-1',
           ragStatus: 'completed',
         }) // getByStorageId — chat attachment, indexed
+        .mockResolvedValueOnce({ slug: 'org-1' }) // orgSlugFromId → betterAuth findOne
         .mockResolvedValueOnce([]), // lookupVideoLinkSources
     });
 
@@ -297,6 +308,7 @@ describe('retrieveDocument helper', () => {
         storageId: 'chat-upload-1',
         ragStatus: 'completed',
       }) // getByStorageId
+      .mockResolvedValueOnce({ slug: 'org-1' }) // orgSlugFromId → betterAuth findOne
       .mockResolvedValueOnce([]); // lookupVideoLinkSources
     const ctx = createMockCtx({ runQuery });
 
@@ -388,6 +400,7 @@ describe('retrieveDocument helper', () => {
         title: 'Test',
       }) // findDocumentByFileId
       .mockResolvedValueOnce(['doc-slashes']) // getAccessibleDocumentIds
+      .mockResolvedValueOnce({ slug: 'org-1' }) // orgSlugFromId → betterAuth findOne
       .mockResolvedValueOnce([]); // lookupVideoLinkSources
     const ctx = createMockCtx({ runQuery });
 
diff --git a/services/platform/convex/agent_tools/documents/fetch_document_comparison.test.ts b/services/platform/convex/agent_tools/documents/fetch_document_comparison.test.ts
index 632c1513a3..1ba8364b94 100644
--- a/services/platform/convex/agent_tools/documents/fetch_document_comparison.test.ts
+++ b/services/platform/convex/agent_tools/documents/fetch_document_comparison.test.ts
@@ -20,6 +20,7 @@ beforeAll(() => {
 });
 const BASE_FILE_ID = 'file-base-123';
 const COMP_FILE_ID = 'file-comp-456';
+const ORG_SLUG = 'test-org';
 
 const originalFetch = globalThis.fetch;
 
@@ -82,7 +83,11 @@ afterEach(() => {
 
 describe('fetchDocumentComparison', () => {
   it('returns correctly mapped result on happy path', async () => {
-    const result = await fetchDocumentComparison(BASE_FILE_ID, COMP_FILE_ID);
+    const result = await fetchDocumentComparison(
+      ORG_SLUG,
+      BASE_FILE_ID,
+      COMP_FILE_ID,
+    );
 
     expect(result.baseDocument).toEqual({
       fileId: BASE_FILE_ID,
@@ -105,7 +110,11 @@ describe('fetchDocumentComparison', () => {
   });
 
   it('maps change blocks with all diff item fields', async () => {
-    const result = await fetchDocumentComparison(BASE_FILE_ID, COMP_FILE_ID);
+    const result = await fetchDocumentComparison(
+      ORG_SLUG,
+      BASE_FILE_ID,
+      COMP_FILE_ID,
+    );
 
     expect(result.changeBlocks).toHaveLength(1);
     const block = result.changeBlocks[0];
@@ -144,7 +153,11 @@ describe('fetchDocumentComparison', () => {
       }),
     );
 
-    const result = await fetchDocumentComparison(BASE_FILE_ID, COMP_FILE_ID);
+    const result = await fetchDocumentComparison(
+      ORG_SLUG,
+      BASE_FILE_ID,
+      COMP_FILE_ID,
+    );
 
     const item = result.changeBlocks[0].items[0];
     expect(item.inlineDiff).toBeNull();
@@ -154,7 +167,7 @@ describe('fetchDocumentComparison', () => {
   });
 
   it('sends POST request with correct body', async () => {
-    await fetchDocumentComparison(BASE_FILE_ID, COMP_FILE_ID);
+    await fetchDocumentComparison(ORG_SLUG, BASE_FILE_ID, COMP_FILE_ID);
 
     // ragFetch wraps init.headers in a `new Headers(...)` and adds the
     // bearer token + redirect:'manual' + AbortSignal — so we assert on
@@ -177,7 +190,7 @@ describe('fetchDocumentComparison', () => {
   });
 
   it('includes max_changes in body when provided', async () => {
-    await fetchDocumentComparison(BASE_FILE_ID, COMP_FILE_ID, 50);
+    await fetchDocumentComparison(ORG_SLUG, BASE_FILE_ID, COMP_FILE_ID, 50);
 
     expect(globalThis.fetch).toHaveBeenCalledWith(
       expect.anything(),
@@ -192,7 +205,7 @@ describe('fetchDocumentComparison', () => {
   });
 
   it('omits max_changes from body when not provided', async () => {
-    await fetchDocumentComparison(BASE_FILE_ID, COMP_FILE_ID);
+    await fetchDocumentComparison(ORG_SLUG, BASE_FILE_ID, COMP_FILE_ID);
 
     expect(globalThis.fetch).toHaveBeenCalledWith(
       expect.anything(),
@@ -216,7 +229,7 @@ describe('fetchDocumentComparison', () => {
     // safeMessageFor maps 404 to a "returned not found" summary; the
     // upstream body lives only on `.bodySnippet`.
     await expect(
-      fetchDocumentComparison(BASE_FILE_ID, COMP_FILE_ID),
+      fetchDocumentComparison(ORG_SLUG, BASE_FILE_ID, COMP_FILE_ID),
     ).rejects.toThrow(/not found/);
   });
 
@@ -231,7 +244,7 @@ describe('fetchDocumentComparison', () => {
     );
 
     await expect(
-      fetchDocumentComparison(BASE_FILE_ID, COMP_FILE_ID),
+      fetchDocumentComparison(ORG_SLUG, BASE_FILE_ID, COMP_FILE_ID),
     ).rejects.toThrow(/HTTP 400/);
   });
 
@@ -246,7 +259,7 @@ describe('fetchDocumentComparison', () => {
     );
 
     await expect(
-      fetchDocumentComparison(BASE_FILE_ID, COMP_FILE_ID),
+      fetchDocumentComparison(ORG_SLUG, BASE_FILE_ID, COMP_FILE_ID),
     ).rejects.toThrow(/HTTP 500/);
   });
 
@@ -261,7 +274,7 @@ describe('fetchDocumentComparison', () => {
     );
 
     await expect(
-      fetchDocumentComparison(BASE_FILE_ID, COMP_FILE_ID),
+      fetchDocumentComparison(ORG_SLUG, BASE_FILE_ID, COMP_FILE_ID),
     ).rejects.toThrow('timed out after 120s');
   });
 
@@ -272,7 +285,7 @@ describe('fetchDocumentComparison', () => {
     );
 
     await expect(
-      fetchDocumentComparison(BASE_FILE_ID, COMP_FILE_ID),
+      fetchDocumentComparison(ORG_SLUG, BASE_FILE_ID, COMP_FILE_ID),
     ).rejects.toThrow('Failed to fetch');
   });
 
@@ -292,7 +305,11 @@ describe('fetchDocumentComparison', () => {
       }),
     );
 
-    const result = await fetchDocumentComparison(BASE_FILE_ID, COMP_FILE_ID);
+    const result = await fetchDocumentComparison(
+      ORG_SLUG,
+      BASE_FILE_ID,
+      COMP_FILE_ID,
+    );
 
     expect(result.changeBlocks).toEqual([]);
     expect(result.stats.unchanged).toBe(5);
@@ -301,7 +318,11 @@ describe('fetchDocumentComparison', () => {
   it('handles truncated response', async () => {
     mockFetchSuccess(createRagCompareResponse({ truncated: true }));
 
-    const result = await fetchDocumentComparison(BASE_FILE_ID, COMP_FILE_ID);
+    const result = await fetchDocumentComparison(
+      ORG_SLUG,
+      BASE_FILE_ID,
+      COMP_FILE_ID,
+    );
 
     expect(result.truncated).toBe(true);
   });
@@ -321,7 +342,11 @@ describe('fetchDocumentComparison', () => {
       }),
     );
 
-    const result = await fetchDocumentComparison(BASE_FILE_ID, COMP_FILE_ID);
+    const result = await fetchDocumentComparison(
+      ORG_SLUG,
+      BASE_FILE_ID,
+      COMP_FILE_ID,
+    );
 
     expect(result.stats.highDivergence).toBe(true);
   });
@@ -334,7 +359,11 @@ describe('fetchDocumentComparison', () => {
       }),
     );
 
-    const result = await fetchDocumentComparison(BASE_FILE_ID, COMP_FILE_ID);
+    const result = await fetchDocumentComparison(
+      ORG_SLUG,
+      BASE_FILE_ID,
+      COMP_FILE_ID,
+    );
 
     expect(result.baseDocument.title).toBeNull();
     expect(result.comparisonDocument.title).toBeNull();
@@ -372,7 +401,11 @@ describe('fetchDocumentComparison', () => {
       }),
     );
 
-    const result = await fetchDocumentComparison(BASE_FILE_ID, COMP_FILE_ID);
+    const result = await fetchDocumentComparison(
+      ORG_SLUG,
+      BASE_FILE_ID,
+      COMP_FILE_ID,
+    );
 
     expect(result.changeBlocks).toHaveLength(2);
     expect(result.changeBlocks[0].items[0].type).toBe('deleted');
@@ -380,7 +413,7 @@ describe('fetchDocumentComparison', () => {
   });
 
   it('passes AbortSignal to fetch', async () => {
-    await fetchDocumentComparison(BASE_FILE_ID, COMP_FILE_ID);
+    await fetchDocumentComparison(ORG_SLUG, BASE_FILE_ID, COMP_FILE_ID);
 
     const fetchCall = vi.mocked(globalThis.fetch).mock.calls[0];
     const options = fetchCall?.[1];
diff --git a/services/platform/convex/agent_tools/documents/fetch_document_content.test.ts b/services/platform/convex/agent_tools/documents/fetch_document_content.test.ts
index 5e9475c278..127ca48bb2 100644
--- a/services/platform/convex/agent_tools/documents/fetch_document_content.test.ts
+++ b/services/platform/convex/agent_tools/documents/fetch_document_content.test.ts
@@ -19,6 +19,7 @@ beforeAll(() => {
   _resetRagConfigForTests();
 });
 const FILE_ID = 'file-storage-123';
+const ORG_SLUG = 'test-org';
 
 const originalFetch = globalThis.fetch;
 
@@ -56,7 +57,7 @@ afterEach(() => {
 
 describe('fetchDocumentContent', () => {
   it('returns correct result shape on happy path', async () => {
-    const result = await fetchDocumentContent(FILE_ID);
+    const result = await fetchDocumentContent(ORG_SLUG, FILE_ID);
 
     expect(result).toEqual({
       fileId: FILE_ID,
@@ -70,7 +71,7 @@ describe('fetchDocumentContent', () => {
   });
 
   it('builds URL without query params when no options provided', async () => {
-    await fetchDocumentContent(FILE_ID);
+    await fetchDocumentContent(ORG_SLUG, FILE_ID);
 
     const fetchCall = vi.mocked(globalThis.fetch).mock.calls[0];
     const url = fetchCall?.[0] ?? '';
@@ -78,7 +79,7 @@ describe('fetchDocumentContent', () => {
   });
 
   it('appends chunk_start and chunk_end query params', async () => {
-    await fetchDocumentContent(FILE_ID, {
+    await fetchDocumentContent(ORG_SLUG, FILE_ID, {
       chunkStart: 3,
       chunkEnd: 8,
     });
@@ -99,7 +100,7 @@ describe('fetchDocumentContent', () => {
       }),
     );
 
-    const result = await fetchDocumentContent(FILE_ID, {
+    const result = await fetchDocumentContent(ORG_SLUG, FILE_ID, {
       returnChunks: true,
     });
 
@@ -111,7 +112,7 @@ describe('fetchDocumentContent', () => {
   });
 
   it('omits return_chunks param when not set', async () => {
-    await fetchDocumentContent(FILE_ID);
+    await fetchDocumentContent(ORG_SLUG, FILE_ID);
 
     const fetchCall = vi.mocked(globalThis.fetch).mock.calls[0];
     const url = fetchCall?.[0] ?? '';
@@ -119,7 +120,7 @@ describe('fetchDocumentContent', () => {
   });
 
   it('encodes fileId in URL', async () => {
-    await fetchDocumentContent('file/with spaces');
+    await fetchDocumentContent(ORG_SLUG, 'file/with spaces');
 
     const fetchCall = vi.mocked(globalThis.fetch).mock.calls[0];
     const url = fetchCall?.[0] ?? '';
@@ -132,7 +133,7 @@ describe('fetchDocumentContent', () => {
       createRagResponse({ content: longContent, total_chars: 60_000 }),
     );
 
-    const result = await fetchDocumentContent(FILE_ID);
+    const result = await fetchDocumentContent(ORG_SLUG, FILE_ID);
 
     expect(result.truncated).toBe(true);
     expect(result.content).toHaveLength(50_000);
@@ -145,7 +146,7 @@ describe('fetchDocumentContent', () => {
       createRagResponse({ content: exactContent, total_chars: 50_000 }),
     );
 
-    const result = await fetchDocumentContent(FILE_ID);
+    const result = await fetchDocumentContent(ORG_SLUG, FILE_ID);
 
     expect(result.truncated).toBe(false);
     expect(result.content).toHaveLength(50_000);
@@ -154,7 +155,7 @@ describe('fetchDocumentContent', () => {
   it('handles empty content', async () => {
     mockFetchSuccess(createRagResponse({ content: '', total_chars: 0 }));
 
-    const result = await fetchDocumentContent(FILE_ID);
+    const result = await fetchDocumentContent(ORG_SLUG, FILE_ID);
 
     expect(result.content).toBe('');
     expect(result.truncated).toBe(false);
@@ -164,7 +165,7 @@ describe('fetchDocumentContent', () => {
   it('handles null content as empty string', async () => {
     mockFetchSuccess(createRagResponse({ content: null, total_chars: 0 }));
 
-    const result = await fetchDocumentContent(FILE_ID);
+    const result = await fetchDocumentContent(ORG_SLUG, FILE_ID);
 
     expect(result.content).toBe('');
     expect(result.truncated).toBe(false);
@@ -173,7 +174,7 @@ describe('fetchDocumentContent', () => {
   it('returns "Untitled" when RAG title is null', async () => {
     mockFetchSuccess(createRagResponse({ title: null }));
 
-    const result = await fetchDocumentContent(FILE_ID);
+    const result = await fetchDocumentContent(ORG_SLUG, FILE_ID);
 
     expect(result.name).toBe('Untitled');
   });
@@ -184,7 +185,7 @@ describe('fetchDocumentContent', () => {
       { preconnect: vi.fn() },
     );
 
-    await expect(fetchDocumentContent(FILE_ID)).rejects.toThrow(
+    await expect(fetchDocumentContent(ORG_SLUG, FILE_ID)).rejects.toThrow(
       'was not found in the knowledge base',
     );
   });
@@ -199,7 +200,9 @@ describe('fetchDocumentContent', () => {
       { preconnect: vi.fn() },
     );
 
-    await expect(fetchDocumentContent(FILE_ID)).rejects.toThrow(/HTTP 500/);
+    await expect(fetchDocumentContent(ORG_SLUG, FILE_ID)).rejects.toThrow(
+      /HTTP 500/,
+    );
   });
 
   it('throws an UpstreamHttpError shaped from the response on non-ok', async () => {
@@ -210,7 +213,7 @@ describe('fetchDocumentContent', () => {
 
     // `.message` carries the safe summary (status + endpoint); the
     // raw "Rate limited" body now lives only on `.bodySnippet`.
-    const err = await fetchDocumentContent(FILE_ID).then(
+    const err = await fetchDocumentContent(ORG_SLUG, FILE_ID).then(
       () => null,
       (e: unknown) => e,
     );
@@ -232,7 +235,7 @@ describe('fetchDocumentContent', () => {
       { preconnect: vi.fn() },
     );
 
-    await expect(fetchDocumentContent(FILE_ID)).rejects.toThrow(
+    await expect(fetchDocumentContent(ORG_SLUG, FILE_ID)).rejects.toThrow(
       'Failed to parse RAG response',
     );
   });
@@ -247,7 +250,7 @@ describe('fetchDocumentContent', () => {
       { preconnect: vi.fn() },
     );
 
-    await expect(fetchDocumentContent(FILE_ID)).rejects.toThrow(
+    await expect(fetchDocumentContent(ORG_SLUG, FILE_ID)).rejects.toThrow(
       'timed out after 60s',
     );
   });
@@ -258,13 +261,13 @@ describe('fetchDocumentContent', () => {
       { preconnect: vi.fn() },
     );
 
-    await expect(fetchDocumentContent(FILE_ID)).rejects.toThrow(
+    await expect(fetchDocumentContent(ORG_SLUG, FILE_ID)).rejects.toThrow(
       'Failed to fetch',
     );
   });
 
   it('passes AbortSignal to fetch', async () => {
-    await fetchDocumentContent(FILE_ID);
+    await fetchDocumentContent(ORG_SLUG, FILE_ID);
 
     const fetchCall = vi.mocked(globalThis.fetch).mock.calls[0];
     const options = fetchCall?.[1];
@@ -279,7 +282,7 @@ describe('fetchDocumentContent', () => {
       }),
     );
 
-    await fetchDocumentContent(FILE_ID, {
+    await fetchDocumentContent(ORG_SLUG, FILE_ID, {
       chunkStart: 5,
       returnChunks: true,
     });
diff --git a/services/platform/convex/agent_tools/documents/helpers/fetch_document_comparison.ts b/services/platform/convex/agent_tools/documents/helpers/fetch_document_comparison.ts
index e62e18fdd1..a7ef3303e9 100644
--- a/services/platform/convex/agent_tools/documents/helpers/fetch_document_comparison.ts
+++ b/services/platform/convex/agent_tools/documents/helpers/fetch_document_comparison.ts
@@ -108,8 +108,12 @@ function mapChangeBlock(block: RagChangeBlock): ChangeBlock {
 
 /**
  * Compare two documents by ID via the RAG service's deterministic diff endpoint.
+ *
+ * Both file_ids must belong to `orgSlug`. RAG now scopes documents by
+ * org_slug — a foreign-org file_id returns 404 (not the foreign content).
  */
 export async function fetchDocumentComparison(
+  orgSlug: string,
   baseFileId: string,
   comparisonFileId: string,
   maxChanges?: number,
@@ -128,6 +132,7 @@ export async function fetchDocumentComparison(
       headers: { 'Content-Type': 'application/json' },
       body: JSON.stringify(body),
       timeoutMs: FETCH_TIMEOUT_MS,
+      orgSlug,
     });
 
     // All non-2xx paths now route through UpstreamHttpError so the
diff --git a/services/platform/convex/agent_tools/documents/helpers/fetch_document_content.ts b/services/platform/convex/agent_tools/documents/helpers/fetch_document_content.ts
index 5e2e50f47d..6cb19c230b 100644
--- a/services/platform/convex/agent_tools/documents/helpers/fetch_document_content.ts
+++ b/services/platform/convex/agent_tools/documents/helpers/fetch_document_content.ts
@@ -33,10 +33,15 @@ export interface FetchDocumentContentOptions {
 }
 
 /**
- * Fetch document content from the RAG service.
+ * Fetch document content from the RAG service, scoped to `orgSlug`.
  * Shared between agent tool (retrieve_document) and workflow action (document action).
+ *
+ * RAG now scopes documents by `org_slug`; a foreign-org `fileId` returns
+ * 404 (not the foreign content) which surfaces here as the documented
+ * "not found in the knowledge base" error.
  */
 export async function fetchDocumentContent(
+  orgSlug: string,
   fileId: string,
   options?: FetchDocumentContentOptions,
 ): Promise<DocumentContentResult> {
@@ -54,7 +59,10 @@ export async function fetchDocumentContent(
   const path = `/api/v1/documents/${encodeURIComponent(fileId)}/content${query ? `?${query}` : ''}`;
 
   try {
-    const response = await ragFetch(path, { timeoutMs: FETCH_TIMEOUT_MS });
+    const response = await ragFetch(path, {
+      timeoutMs: FETCH_TIMEOUT_MS,
+      orgSlug,
+    });
 
     if (response.status === 404) {
       throw new Error(
diff --git a/services/platform/convex/agent_tools/documents/helpers/retrieve_document.ts b/services/platform/convex/agent_tools/documents/helpers/retrieve_document.ts
index f1b37226ce..5fc34980f1 100644
--- a/services/platform/convex/agent_tools/documents/helpers/retrieve_document.ts
+++ b/services/platform/convex/agent_tools/documents/helpers/retrieve_document.ts
@@ -3,6 +3,7 @@ import type { z } from 'zod/v4';
 
 import { internal } from '../../../_generated/api';
 import { createDebugLog } from '../../../lib/debug_log';
+import { orgSlugFromId } from '../../../lib/helpers/org_slug';
 import { toId } from '../../../lib/type_cast_helpers';
 import { wrapUntrusted } from '../../../lib/untrusted_content';
 import type { documentRetrieveArgs } from '../document_retrieve_tool';
@@ -89,7 +90,8 @@ export async function retrieveDocument(
     }
   }
 
-  const result = await fetchDocumentContent(args.fileId, {
+  const orgSlug = await orgSlugFromId(ctx, organizationId);
+  const result = await fetchDocumentContent(orgSlug, args.fileId, {
     chunkStart: args.chunkStart,
     chunkEnd: args.chunkEnd,
   });
diff --git a/services/platform/convex/agent_tools/rag/helpers/fetch_document_chunks.ts b/services/platform/convex/agent_tools/rag/helpers/fetch_document_chunks.ts
index e1e4c1e58d..e0c5e45a4b 100644
--- a/services/platform/convex/agent_tools/rag/helpers/fetch_document_chunks.ts
+++ b/services/platform/convex/agent_tools/rag/helpers/fetch_document_chunks.ts
@@ -23,6 +23,7 @@ export interface DocumentChunksResult {
 }
 
 export async function fetchDocumentChunks(
+  orgSlug: string,
   fileId: string,
 ): Promise<DocumentChunksResult> {
   const allChunks: Array<{ index: number; content: string }> = [];
@@ -42,7 +43,7 @@ export async function fetchDocumentChunks(
       // Default ragFetch timeout is 10s; sibling RAG ops in
       // workflow_engine use 30–120s. Matching that here so chunk
       // pagination doesn't fail mid-scan on a slow embedding tail.
-      { timeoutMs: 60_000 },
+      { timeoutMs: 60_000, orgSlug },
     );
 
     if (!response.ok) {
diff --git a/services/platform/convex/agents/internal_actions.ts b/services/platform/convex/agents/internal_actions.ts
index de5c9f5d06..a3ce9473af 100644
--- a/services/platform/convex/agents/internal_actions.ts
+++ b/services/platform/convex/agents/internal_actions.ts
@@ -9,6 +9,7 @@ import { internal } from '../_generated/api';
 import { internalAction } from '../_generated/server';
 import { getPollingInterval } from '../documents/internal_actions';
 import { readJsonFile } from '../lib/file_io';
+import { orgSlugFromId } from '../lib/helpers/org_slug';
 import { ragFetch } from '../lib/helpers/rag_config';
 import { deleteDocumentById } from '../workflow_engine/action_defs/rag/helpers/delete_document';
 import { uploadDocument } from '../workflow_engine/action_defs/rag/helpers/upload_document';
@@ -94,11 +95,13 @@ export const checkKnowledgeFileStatus = internalAction({
     }
 
     try {
+      const orgSlug = await orgSlugFromId(ctx, args.organizationId);
       const response = await ragFetch('/api/v1/documents/statuses', {
         method: 'POST',
         headers: { 'Content-Type': 'application/json' },
         body: JSON.stringify({ file_ids: [String(args.fileId)] }),
         timeoutMs: 10_000,
+        orgSlug,
       });
 
       if (response.status === 429 || !response.ok) {
@@ -230,12 +233,15 @@ export const checkKnowledgeFileStatus = internalAction({
 
 export const deleteKnowledgeFileFromRag = internalAction({
   args: {
+    organizationId: v.string(),
     fileId: v.id('_storage'),
   },
   returns: v.null(),
-  handler: async (_ctx, args): Promise<null> => {
+  handler: async (ctx, args): Promise<null> => {
     try {
+      const orgSlug = await orgSlugFromId(ctx, args.organizationId);
       await deleteDocumentById({
+        orgSlug,
         fileId: String(args.fileId),
       });
     } catch (error) {
diff --git a/services/platform/convex/agents/mutations.ts b/services/platform/convex/agents/mutations.ts
index d7cba9b65a..4fc514e8a7 100644
--- a/services/platform/convex/agents/mutations.ts
+++ b/services/platform/convex/agents/mutations.ts
@@ -262,7 +262,7 @@ export const removeKnowledgeFile = mutation({
     await ctx.scheduler.runAfter(
       0,
       internal.agents.internal_actions.deleteKnowledgeFileFromRag,
-      { fileId: args.fileId },
+      { organizationId: args.organizationId, fileId: args.fileId },
     );
     await ctx.storage.delete(args.fileId);
 
@@ -298,7 +298,7 @@ export const cleanupAgentBinding = internalMutation({
       await ctx.scheduler.runAfter(
         0,
         internal.agents.internal_actions.deleteKnowledgeFileFromRag,
-        { fileId: file.fileId },
+        { organizationId: args.organizationId, fileId: file.fileId },
       );
       await ctx.storage.delete(file.fileId);
 
diff --git a/services/platform/convex/documents/internal_actions.ts b/services/platform/convex/documents/internal_actions.ts
index 0fd3883670..8d03ea6895 100644
--- a/services/platform/convex/documents/internal_actions.ts
+++ b/services/platform/convex/documents/internal_actions.ts
@@ -7,6 +7,7 @@ import { isRecord, getBoolean, getString } from '../../lib/utils/type-guards';
 import { internal } from '../_generated/api';
 import type { Id } from '../_generated/dataModel';
 import { internalAction } from '../_generated/server';
+import { orgSlugFromId } from '../lib/helpers/org_slug';
 import { buildDownloadUrl } from '../lib/helpers/public_storage_url';
 import { ragFetch } from '../lib/helpers/rag_config';
 import { ragAction } from '../workflow_engine/action_defs/rag/rag_action';
@@ -194,6 +195,7 @@ export const checkRagDocumentStatus = internalAction({
     }
 
     try {
+      const orgSlug = await orgSlugFromId(ctx, document.organizationId);
       const response = await ragFetch('/api/v1/documents/statuses', {
         method: 'POST',
         headers: { 'Content-Type': 'application/json' },
@@ -201,6 +203,7 @@ export const checkRagDocumentStatus = internalAction({
           file_ids: [document.fileId],
         }),
         timeoutMs: 10_000,
+        orgSlug,
       });
 
       if (response.status === 429) {
@@ -401,9 +404,10 @@ export const deleteDocumentFromRag = internalAction({
 
     let ragSuccess = false;
     try {
+      const orgSlug = await orgSlugFromId(ctx, document.organizationId);
       const response = await ragFetch(
         `/api/v1/documents/${encodeURIComponent(ragKey)}`,
-        { method: 'DELETE', timeoutMs: 60_000 },
+        { method: 'DELETE', timeoutMs: 60_000, orgSlug },
       );
 
       if (response.ok) {
@@ -536,11 +540,22 @@ export const reindexDocumentInRag = internalAction({
   },
   returns: v.null(),
   handler: async (ctx, args): Promise<null> => {
+    // Look up current document first so we can scope the delete by org.
+    const document = await ctx.runQuery(
+      internal.documents.internal_queries.getDocumentByIdRaw,
+      { documentId: args.documentId },
+    );
+
+    if (!document || !document.fileId) {
+      return null;
+    }
+
     // Delete old RAG entry (ignore 404 — may not have been indexed)
     try {
+      const orgSlug = await orgSlugFromId(ctx, document.organizationId);
       const response = await ragFetch(
         `/api/v1/documents/${encodeURIComponent(args.oldFileId)}`,
-        { method: 'DELETE', timeoutMs: 60_000 },
+        { method: 'DELETE', timeoutMs: 60_000, orgSlug },
       );
       if (!response.ok && response.status !== 404) {
         console.warn(
@@ -554,16 +569,6 @@ export const reindexDocumentInRag = internalAction({
       );
     }
 
-    // Look up current document
-    const document = await ctx.runQuery(
-      internal.documents.internal_queries.getDocumentByIdRaw,
-      { documentId: args.documentId },
-    );
-
-    if (!document || !document.fileId) {
-      return null;
-    }
-
     // Upload new file to RAG
     try {
       const rawResult = await ragAction.execute(
diff --git a/services/platform/convex/file_metadata/actions.ts b/services/platform/convex/file_metadata/actions.ts
index 9f16240bfb..5116a55a0e 100644
--- a/services/platform/convex/file_metadata/actions.ts
+++ b/services/platform/convex/file_metadata/actions.ts
@@ -6,6 +6,7 @@ import { isRecord, getBoolean, getString } from '../../lib/utils/type-guards';
 import { internal } from '../_generated/api';
 import { action } from '../_generated/server';
 import { authComponent } from '../auth';
+import { orgSlugFromId } from '../lib/helpers/org_slug';
 import { ragFetch } from '../lib/helpers/rag_config';
 
 /**
@@ -37,46 +38,33 @@ export const checkFileRagStatuses = action({
     }
     const callerId = String(authUser._id);
 
-    // Filter storageIds down to ones the caller is authorized to see.
-    // Per-row org membership check (stored on fileMetadata).
-    const allowedStorageIds = await ctx.runQuery(
+    // Filter storageIds down to ones the caller is authorized to see, and
+    // get the org for each so we can call RAG (which is now per-org) with
+    // the correct X-Tale-Org header per group.
+    const allowed = await ctx.runQuery(
       internal.file_metadata.internal_queries.filterStorageIdsByCallerOrg,
       { storageIds: args.storageIds, userId: callerId },
     );
-    if (allowedStorageIds.length === 0) {
+    if (allowed.length === 0) {
       console.warn(
         '[checkFileRagStatuses] no authorized storage ids for caller — refused',
       );
       return null;
     }
-    args = { ...args, storageIds: allowedStorageIds };
 
-    let body: unknown;
-    try {
-      const response = await ragFetch('/api/v1/documents/statuses', {
-        method: 'POST',
-        headers: { 'Content-Type': 'application/json' },
-        body: JSON.stringify({ file_ids: args.storageIds }),
-        timeoutMs: 10_000,
-      });
-
-      if (!response.ok) {
-        console.warn(`[checkFileRagStatuses] RAG returned ${response.status}`);
-        return null;
-      }
-
-      body = await response.json();
-    } catch (error) {
-      console.warn('[checkFileRagStatuses] Failed to fetch statuses:', error);
-      return null;
-    }
-
-    if (!isRecord(body) || !isRecord(body.statuses)) {
-      return null;
+    // Group authorized storage ids by org so we can issue one RAG call
+    // per distinct org. The cache means each org slug is resolved once
+    // even when many files belong to the same org.
+    const orgIdsToFiles = new Map<
+      string,
+      Array<(typeof args.storageIds)[number]>
+    >();
+    for (const { storageId, organizationId } of allowed) {
+      const bucket = orgIdsToFiles.get(organizationId);
+      if (bucket) bucket.push(storageId);
+      else orgIdsToFiles.set(organizationId, [storageId]);
     }
 
-    const statuses = body.statuses;
-
     // Give RAG 90s to have ingested a newly-queued upload. If we're still
     // getting null after that window, the upload never reached RAG (likely
     // the scheduled action was dropped before it ran) — mark failed so the
@@ -84,7 +72,38 @@ export const checkFileRagStatuses = action({
     // the fileMetadata row, so re-queues reset the clock.
     const STALE_QUEUE_MS = 90_000;
 
-    for (const storageId of args.storageIds) {
+    const mergedStatuses: Record<string, unknown> = {};
+    for (const [organizationId, storageIds] of orgIdsToFiles) {
+      const orgSlug = await orgSlugFromId(ctx, organizationId);
+      try {
+        const response = await ragFetch('/api/v1/documents/statuses', {
+          method: 'POST',
+          headers: { 'Content-Type': 'application/json' },
+          body: JSON.stringify({ file_ids: storageIds }),
+          timeoutMs: 10_000,
+          orgSlug,
+        });
+        if (!response.ok) {
+          console.warn(
+            `[checkFileRagStatuses] RAG returned ${response.status} for org ${orgSlug}`,
+          );
+          continue;
+        }
+        const body: unknown = await response.json();
+        if (!isRecord(body) || !isRecord(body.statuses)) continue;
+        Object.assign(mergedStatuses, body.statuses);
+      } catch (error) {
+        console.warn(
+          `[checkFileRagStatuses] Failed to fetch statuses for org ${orgSlug}:`,
+          error,
+        );
+      }
+    }
+
+    const statuses = mergedStatuses;
+    const allAuthorizedStorageIds = allowed.map((a) => a.storageId);
+
+    for (const storageId of allAuthorizedStorageIds) {
       const docStatus = statuses[storageId];
       if (!isRecord(docStatus)) {
         await ctx.runMutation(
diff --git a/services/platform/convex/file_metadata/internal_queries.ts b/services/platform/convex/file_metadata/internal_queries.ts
index b93c9a6ec6..26aaba2bf3 100644
--- a/services/platform/convex/file_metadata/internal_queries.ts
+++ b/services/platform/convex/file_metadata/internal_queries.ts
@@ -77,9 +77,20 @@ export const filterStorageIdsByCallerOrg = internalQuery({
     storageIds: v.array(v.id('_storage')),
     userId: v.string(),
   },
-  returns: v.array(v.id('_storage')),
+  // Returns one entry per authorized storage id with its organizationId so
+  // callers can group by org (e.g., RAG endpoints are now org-scoped and
+  // accept one org_slug per request).
+  returns: v.array(
+    v.object({
+      storageId: v.id('_storage'),
+      organizationId: v.string(),
+    }),
+  ),
   async handler(ctx, args) {
-    const allowed: Array<(typeof args.storageIds)[number]> = [];
+    const allowed: Array<{
+      storageId: (typeof args.storageIds)[number];
+      organizationId: string;
+    }> = [];
     const orgMembershipCache = new Map<string, boolean>();
     for (const storageId of args.storageIds) {
       const meta = await ctx.db
@@ -104,7 +115,7 @@ export const filterStorageIdsByCallerOrg = internalQuery({
         isMember = (result?.page?.length ?? 0) > 0;
         orgMembershipCache.set(orgId, isMember);
       }
-      if (isMember) allowed.push(storageId);
+      if (isMember) allowed.push({ storageId, organizationId: orgId });
     }
     return allowed;
   },
diff --git a/services/platform/convex/governance/erasure.ts b/services/platform/convex/governance/erasure.ts
index 5cb15e4c05..7a8dff8637 100644
--- a/services/platform/convex/governance/erasure.ts
+++ b/services/platform/convex/governance/erasure.ts
@@ -61,6 +61,7 @@ import {
 import * as ApprovalsHelpers from '../approvals/helpers';
 import { createAuditLog } from '../audit_logs/helpers';
 import { authComponent } from '../auth';
+import { orgSlugFromId } from '../lib/helpers/org_slug';
 import { hashEmailForAudit } from '../lib/helpers/pii_hash';
 import { ragFetch } from '../lib/helpers/rag_config';
 import { rateLimiter } from '../lib/rate_limiter';
@@ -1727,11 +1728,14 @@ export const processErasureRequest = internalAction({
       );
       documentsErased = docResult.rows;
       documentsSkippedByHold = docResult.skippedByHold;
+      // RAG is per-org; resolve once and reuse for all per-file DELETEs in
+      // this erasure pass (subject is bound to a single organizationId).
+      const ragOrgSlug = await orgSlugFromId(ctx, state.organizationId);
       for (const fileId of docResult.fileIds) {
         try {
           const res = await ragFetch(
             `/api/v1/documents/${encodeURIComponent(fileId)}`,
-            { method: 'DELETE', timeoutMs: 10_000 },
+            { method: 'DELETE', timeoutMs: 10_000, orgSlug: ragOrgSlug },
           );
           if (res.ok || res.status === 404) {
             ragDocumentsRemoved += 1;
@@ -1792,7 +1796,7 @@ export const processErasureRequest = internalAction({
         try {
           const res = await ragFetch(
             `/api/v1/documents/${encodeURIComponent(storageId)}`,
-            { method: 'DELETE', timeoutMs: 10_000 },
+            { method: 'DELETE', timeoutMs: 10_000, orgSlug: ragOrgSlug },
           );
           if (res.ok || res.status === 404) {
             ragDocumentsRemoved += 1;
@@ -1826,7 +1830,7 @@ export const processErasureRequest = internalAction({
         try {
           const res = await ragFetch(
             `/api/v1/documents/${encodeURIComponent(storageId)}`,
-            { method: 'DELETE', timeoutMs: 10_000 },
+            { method: 'DELETE', timeoutMs: 10_000, orgSlug: ragOrgSlug },
           );
           if (res.ok || res.status === 404) {
             ragDocumentsRemoved += 1;
diff --git a/services/platform/convex/governance/retention_cleanup.ts b/services/platform/convex/governance/retention_cleanup.ts
index 268a00f88b..cc17caaa3a 100644
--- a/services/platform/convex/governance/retention_cleanup.ts
+++ b/services/platform/convex/governance/retention_cleanup.ts
@@ -9,6 +9,7 @@ import { internal } from '../_generated/api';
 import type { Id } from '../_generated/dataModel';
 import type { ActionCtx } from '../_generated/server';
 import { internalAction } from '../_generated/server';
+import { orgSlugFromId } from '../lib/helpers/org_slug';
 import { ragFetch } from '../lib/helpers/rag_config';
 import type { ActiveHolds } from './legal_hold';
 import {
@@ -105,11 +106,15 @@ interface OrgPolicy {
   config: RetentionPolicyConfig;
 }
 
-async function deleteRagEntry(fileId: string, label: string): Promise<void> {
+async function deleteRagEntry(
+  orgSlug: string,
+  fileId: string,
+  label: string,
+): Promise<void> {
   try {
     const res = await ragFetch(
       `/api/v1/documents/${encodeURIComponent(fileId)}`,
-      { method: 'DELETE', timeoutMs: 10_000 },
+      { method: 'DELETE', timeoutMs: 10_000, orgSlug },
     );
     // 404 is success on DELETE — already gone.
     if (!res.ok && res.status !== 404) {
@@ -194,6 +199,8 @@ async function cleanupDocuments(
           { organizationId: org.organizationId, cutoffMs, batchSize },
         );
 
+  const orgSlug = await orgSlugFromId(ctx, org.organizationId);
+
   for (const doc of passB) {
     if (doc.createdBy && holds.userMembershipIds.has(doc.createdBy)) {
       console.info(
@@ -203,7 +210,7 @@ async function cleanupDocuments(
     }
 
     if (doc.fileId) {
-      await deleteRagEntry(doc.fileId, `document ${doc._id}`);
+      await deleteRagEntry(orgSlug, doc.fileId, `document ${doc._id}`);
     }
 
     await ctx.runMutation(
@@ -302,6 +309,8 @@ async function cleanupTempFiles(
           { organizationId: org.organizationId, source, cutoffMs, batchSize },
         );
 
+  const tempOrgSlug = await orgSlugFromId(ctx, org.organizationId);
+
   for (const file of passB) {
     if (file.uploadedBy && holds.userMembershipIds.has(file.uploadedBy)) {
       console.info(
@@ -310,7 +319,7 @@ async function cleanupTempFiles(
       continue;
     }
 
-    await deleteRagEntry(file.storageId, `temp file ${file._id}`);
+    await deleteRagEntry(tempOrgSlug, file.storageId, `temp file ${file._id}`);
 
     await ctx.runMutation(
       internal.governance.internal_mutations_retention.deleteExpiredTempFile,
diff --git a/services/platform/convex/threads/cascade_helpers.ts b/services/platform/convex/threads/cascade_helpers.ts
index 3c06f52262..101c4ae731 100644
--- a/services/platform/convex/threads/cascade_helpers.ts
+++ b/services/platform/convex/threads/cascade_helpers.ts
@@ -33,6 +33,7 @@ import type { MutationCtx } from '../_generated/server';
 import { createAuditLog } from '../audit_logs/helpers';
 import type { ActiveHolds } from '../governance/legal_hold';
 import { loadActiveHolds } from '../governance/legal_hold';
+import { orgSlugFromId } from '../lib/helpers/org_slug';
 import { parseSubThreadIds } from './delete_chat_thread';
 
 // Audit actions emitted by this file. Keep grep-able:
@@ -331,11 +332,15 @@ export async function cascadeDeleteThreadChildren(
       await ctx.db.delete(fileMeta._id);
     }
     if (ragPurgeStorageIds.length > 0) {
+      // `organizationId` is guaranteed truthy at this point (outer
+      // `if (organizationId)` branch). Resolve to slug so RAG's per-org
+      // delete scope targets the correct tenant's chunks.
+      const orgSlug = await orgSlugFromId(ctx, organizationId);
       await ctx.scheduler.runAfter(
         0,
         internal.workflow_engine.action_defs.rag.helpers.delete_document
           .deleteFromRagBatch,
-        { fileIds: ragPurgeStorageIds },
+        { orgSlug, fileIds: ragPurgeStorageIds },
       );
     }
     if (filesPage.length === PAGE_SIZE) {
diff --git a/services/platform/convex/workflow_engine/action_defs/document/document_action.ts b/services/platform/convex/workflow_engine/action_defs/document/document_action.ts
index 5bdcf38c3b..1b03bcc2ef 100644
--- a/services/platform/convex/workflow_engine/action_defs/document/document_action.ts
+++ b/services/platform/convex/workflow_engine/action_defs/document/document_action.ts
@@ -295,11 +295,16 @@ export const documentAction: ActionDefinition<DocumentActionParams> = {
             `Document with file ID "${params.fileId}" not found in this organization`,
           );
         }
-        const result = await fetchDocumentContent(params.fileId, {
-          chunkStart: params.chunkStart,
-          chunkEnd: params.chunkEnd,
-          returnChunks: params.returnChunks,
-        });
+        const retrieveOrgSlug = await orgSlugFromId(ctx, organizationId);
+        const result = await fetchDocumentContent(
+          retrieveOrgSlug,
+          params.fileId,
+          {
+            chunkStart: params.chunkStart,
+            chunkEnd: params.chunkEnd,
+            returnChunks: params.returnChunks,
+          },
+        );
         // Prompt-injection defense for the workflow path. The
         // agent-tool sibling `retrieveDocument` already wraps video-
         // link-sourced content in `<untrusted_source>`; the workflow
diff --git a/services/platform/convex/workflow_engine/action_defs/rag/helpers/delete_document.test.ts b/services/platform/convex/workflow_engine/action_defs/rag/helpers/delete_document.test.ts
index e588411693..fdcf6c81e5 100644
--- a/services/platform/convex/workflow_engine/action_defs/rag/helpers/delete_document.test.ts
+++ b/services/platform/convex/workflow_engine/action_defs/rag/helpers/delete_document.test.ts
@@ -53,6 +53,7 @@ describe('deleteDocumentById', () => {
     });
 
     await deleteDocumentById({
+      orgSlug: 'test-org',
       fileId: 'doc-123',
     });
 
@@ -70,6 +71,7 @@ describe('deleteDocumentById', () => {
     });
 
     const result = await deleteDocumentById({
+      orgSlug: 'test-org',
       fileId: 'doc-abc',
     });
 
@@ -85,6 +87,7 @@ describe('deleteDocumentById', () => {
     mockFetch({ detail: 'bad request' }, 400);
 
     const result = await deleteDocumentById({
+      orgSlug: 'test-org',
       fileId: 'doc-fail',
     });
 
@@ -98,7 +101,7 @@ describe('deleteDocumentById', () => {
     mockFetch({ detail: 'service error' }, 500);
 
     await expect(
-      deleteDocumentById({ fileId: 'doc-fail-5xx' }),
+      deleteDocumentById({ orgSlug: 'test-org', fileId: 'doc-fail-5xx' }),
     ).rejects.toThrow(/HTTP 500|unavailable/);
   });
 
@@ -112,6 +115,7 @@ describe('deleteDocumentById', () => {
     mockFetch({ detail: 'not found' }, 404);
 
     const result = await deleteDocumentById({
+      orgSlug: 'test-org',
       fileId: 'doc-already-gone',
     });
 
@@ -125,6 +129,7 @@ describe('deleteDocumentById', () => {
     mockFetch({ success: true, deleted_count: 0, deleted_data_ids: [] });
 
     await deleteDocumentById({
+      orgSlug: 'test-org',
       fileId: 'doc/with spaces',
     });
 
diff --git a/services/platform/convex/workflow_engine/action_defs/rag/helpers/delete_document.ts b/services/platform/convex/workflow_engine/action_defs/rag/helpers/delete_document.ts
index b0b00fc6b9..7941dd627c 100644
--- a/services/platform/convex/workflow_engine/action_defs/rag/helpers/delete_document.ts
+++ b/services/platform/convex/workflow_engine/action_defs/rag/helpers/delete_document.ts
@@ -16,18 +16,20 @@ import { ragFetch } from '../../../../lib/helpers/rag_config';
 import type { RagDeleteResult } from './types';
 
 export interface DeleteDocumentByIdArgs {
+  orgSlug: string;
   fileId: string;
   timeoutMs?: number;
 }
 
 /**
- * Delete document from RAG service by document ID.
+ * Delete document from RAG service by document ID, scoped to `orgSlug`.
  *
- * This calls the RAG service's DELETE endpoint with the document ID.
- * The document ID should match the ID that was used when uploading
- * the document (recordId from the platform).
+ * RAG now scopes documents by `org_slug`, so the caller's org must be
+ * passed through. A foreign-org `fileId` returns 0 deletions rather than
+ * touching another tenant's data.
  */
 export async function deleteDocumentById({
+  orgSlug,
   fileId,
   timeoutMs = 60000,
 }: DeleteDocumentByIdArgs): Promise<RagDeleteResult> {
@@ -36,7 +38,7 @@ export async function deleteDocumentById({
   try {
     const response = await ragFetch(
       `/api/v1/documents/${encodeURIComponent(fileId)}`,
-      { method: 'DELETE', timeoutMs },
+      { method: 'DELETE', timeoutMs, orgSlug },
     );
 
     // Round-2 review HIGH: 404 means the document was already deleted
@@ -116,9 +118,13 @@ export async function deleteDocumentById({
  * action with the storageIds of the chat-upload files they removed.
  * Best-effort: failures per file log but do not abort the batch.
  * Round-2 review CRITICAL #17.
+ *
+ * Now per-tenant: `orgSlug` is required so the per-org RAG namespace is
+ * targeted. All `fileIds` in a single call MUST belong to that org.
  */
 export const deleteFromRagBatch = internalAction({
   args: {
+    orgSlug: v.string(),
     fileIds: v.array(v.string()),
   },
   returns: v.null(),
@@ -129,7 +135,10 @@ export const deleteFromRagBatch = internalAction({
       // should not abort cleanup of the other ids — log + move on so
       // the next retention sweep gets to retry.
       try {
-        const result = await deleteDocumentById({ fileId });
+        const result = await deleteDocumentById({
+          orgSlug: args.orgSlug,
+          fileId,
+        });
         if (!result.success) {
           console.warn(
             `[deleteFromRagBatch] delete failed for ${fileId}:`,
diff --git a/services/platform/convex/workflow_engine/action_defs/rag/rag_action.ts b/services/platform/convex/workflow_engine/action_defs/rag/rag_action.ts
index cdf0621d81..781c85f580 100644
--- a/services/platform/convex/workflow_engine/action_defs/rag/rag_action.ts
+++ b/services/platform/convex/workflow_engine/action_defs/rag/rag_action.ts
@@ -64,22 +64,31 @@ export const ragAction: ActionDefinition<RagActionParams> = {
         return { ...result, executionTimeMs: Date.now() - startTime };
       }
       case 'delete_document': {
-        // Cross-tenant gate: file_id is global in RAG; verify the workflow's
-        // org owns the storage row before forwarding the delete.
-        await assertStorageIdsInOrg(ctx, _variables, [migratedParams.fileId]);
+        // Cross-tenant gate: even though RAG now scopes DELETE by org_slug,
+        // verify the workflow's org owns the storage row first so a foreign
+        // file_id surfaces as the documented error (not silently 0 deletes).
+        const orgId = await assertStorageIdsInOrg(ctx, _variables, [
+          migratedParams.fileId,
+        ]);
+        const orgSlug = await orgSlugFromId(ctx, orgId);
         const result = await deleteDocumentById({
+          orgSlug,
           fileId: migratedParams.fileId,
         });
         return { ...result, executionTimeMs: Date.now() - startTime };
       }
       case 'get_chunks': {
-        // Cross-tenant gate: workflow params can carry caller-controlled
-        // file ids from upstream steps, and the RAG service has no per-org
-        // namespace — file_id is global. Mirror the `compare` branch in
-        // document_action.ts:333-354 by verifying the storage id belongs
-        // to the workflow's org before forwarding to RAG.
-        await assertStorageIdsInOrg(ctx, _variables, [migratedParams.fileId]);
-        const result = await fetchDocumentChunks(migratedParams.fileId);
+        // Cross-tenant gate: even with RAG's data-layer org_slug filter,
+        // verify the storage id belongs to the workflow's org so a foreign
+        // file_id surfaces as the documented error (not a confusing 404).
+        const orgId = await assertStorageIdsInOrg(ctx, _variables, [
+          migratedParams.fileId,
+        ]);
+        const orgSlug = await orgSlugFromId(ctx, orgId);
+        const result = await fetchDocumentChunks(
+          orgSlug,
+          migratedParams.fileId,
+        );
         // Prompt-injection defense: video-link-sourced chunks contain
         // attacker-controlled transcript text. Mirror the wrap that
         // `rag_search_tool.ts` applies on the agent-tool side.
diff --git a/services/rag/app/routers/documents.py b/services/rag/app/routers/documents.py
index a59b791ef1..d4a9f04b9f 100644
--- a/services/rag/app/routers/documents.py
+++ b/services/rag/app/routers/documents.py
@@ -132,43 +132,47 @@
 
 
 async def _insert_processing_row(
+    org_slug: str,
     file_id: str,
     filename: str,
 ) -> None:
-    """Insert a processing status row at ingestion start."""
+    """Insert a processing status row at ingestion start (scoped to org)."""
     pool = await get_pool()
     async with acquire_with_retry(pool) as conn:
         await conn.execute(
             f"""
-            INSERT INTO {SCHEMA}.documents (file_id, filename, status)
-            VALUES ($1, $2, 'processing')
-            ON CONFLICT (file_id, COALESCE(team_id, ''))
+            INSERT INTO {SCHEMA}.documents (org_slug, file_id, filename, status)
+            VALUES ($1, $2, $3, 'processing')
+            ON CONFLICT (org_slug, file_id)
             DO UPDATE SET status = 'processing', error = NULL, chunks_count = 0,
                          progress_phase = NULL, progress_detail = NULL,
                          updated_at = NOW()
             """,
+            org_slug,
             file_id,
             filename,
         )
 
 
 async def _record_failure(
+    org_slug: str,
     file_id: str,
     filename: str,
     error: str,
 ) -> None:
-    """Record failure status in documents table."""
+    """Record failure status in documents table (scoped to org)."""
     pool = await get_pool()
     async with acquire_with_retry(pool) as conn:
         await conn.execute(
             f"""
-            INSERT INTO {SCHEMA}.documents (file_id, filename, status, error)
-            VALUES ($1, $2, 'failed', $3)
-            ON CONFLICT (file_id, COALESCE(team_id, ''))
+            INSERT INTO {SCHEMA}.documents (org_slug, file_id, filename, status, error)
+            VALUES ($1, $2, $3, 'failed', $4)
+            ON CONFLICT (org_slug, file_id)
             DO UPDATE SET status = 'failed', error = EXCLUDED.error, chunks_count = 0,
                          progress_phase = NULL, progress_detail = NULL,
                          updated_at = NOW()
             """,
+            org_slug,
             file_id,
             filename,
             error,
@@ -176,6 +180,7 @@ async def _record_failure(
 
 
 async def _mark_completed(
+    org_slug: str,
     file_id: str,
 ) -> None:
     """Mark document status as completed and restore chunks_count from actual chunk rows."""
@@ -187,11 +192,13 @@ async def _mark_completed(
             SET status = 'completed',
                 error = NULL,
                 chunks_count = (
-                    SELECT COUNT(*) FROM {SCHEMA}.chunks c WHERE c.document_id = d.id
+                    SELECT COUNT(*) FROM {SCHEMA}.chunks c
+                    WHERE c.document_id = d.id AND c.org_slug = $1
                 ),
                 updated_at = NOW()
-            WHERE d.file_id = $1
+            WHERE d.org_slug = $1 AND d.file_id = $2
             """,
+            org_slug,
             file_id,
         )
 
@@ -223,7 +230,7 @@ async def _background_ingest(
             source_modified_at=source_modified_at,
         )
         if result.get("skipped"):
-            await _mark_completed(file_id)
+            await _mark_completed(org_slug, file_id)
         logger.info(
             "Background ingestion completed",
             extra={
@@ -239,7 +246,7 @@ async def _background_ingest(
             file_id,
         )
         try:
-            await _record_failure(file_id, filename, _sanitize_error(exc))
+            await _record_failure(org_slug, file_id, filename, _sanitize_error(exc))
         except Exception as record_exc:
             logger.critical("Could not record failure for {}: {}", file_id, record_exc)
     finally:
@@ -378,7 +385,7 @@ async def upload_document(
 
         doc_id = file_id or f"file-{uuid4().hex}"
 
-        await _insert_processing_row(doc_id, file.filename)
+        await _insert_processing_row(org_slug, doc_id, file.filename)
 
         if sync:
             try:
@@ -391,11 +398,11 @@ async def upload_document(
                     source_modified_at=source_modified_at,
                 )
             except Exception as sync_exc:
-                await _record_failure(doc_id, file.filename, _sanitize_error(sync_exc))
+                await _record_failure(org_slug, doc_id, file.filename, _sanitize_error(sync_exc))
                 raise
 
             if result.get("skipped"):
-                await _mark_completed(doc_id)
+                await _mark_completed(org_slug, doc_id)
 
             skipped = result.get("skipped", False)
             skip_reason = result.get("skip_reason")
@@ -438,10 +445,13 @@ async def upload_document(
 
 
 @router.delete("/documents/{file_id}", response_model=DocumentDeleteResponse)
-async def delete_document(file_id: str):
-    """Delete a document from the knowledge base by ID."""
+async def delete_document(
+    file_id: str,
+    org_slug: str = Depends(require_org_slug),
+):
+    """Delete a document from the knowledge base, scoped to caller's org."""
     try:
-        result = await rag_service.delete_document(file_id)
+        result = await rag_service.delete_document(org_slug, file_id)
 
         return DocumentDeleteResponse(
             success=result["success"],
@@ -462,11 +472,12 @@ async def delete_document(file_id: str):
 @router.get("/documents/{file_id}/content", response_model=DocumentContentResponse)
 async def get_document_content(
     file_id: str,
+    org_slug: str = Depends(require_org_slug),
     chunk_start: int = Query(default=1, ge=1, description="Start chunk (1-indexed)"),
     chunk_end: int | None = Query(default=None, ge=1, description="End chunk (1-indexed, inclusive)"),
     return_chunks: bool = Query(default=False, description="If true, include individual chunks as a list"),
 ):
-    """Retrieve full document text by reassembling stored chunks.
+    """Retrieve full document text by reassembling stored chunks, scoped to caller's org.
 
     Use chunk_start/chunk_end to paginate through large documents.
     Set return_chunks=true to get individual chunks as an array.
@@ -479,6 +490,7 @@ async def get_document_content(
 
     try:
         result = await rag_service.get_document_content(
+            org_slug,
             file_id,
             chunk_start=chunk_start,
             chunk_end=chunk_end,
@@ -501,8 +513,11 @@ async def get_document_content(
 
 
 @router.post("/documents/compare", response_model=DocumentCompareResponse)
-async def compare_documents(request: DocumentCompareRequest):
-    """Compare two documents using deterministic paragraph-level diffing.
+async def compare_documents(
+    request: DocumentCompareRequest,
+    org_slug: str = Depends(require_org_slug),
+):
+    """Compare two stored documents (both must belong to caller's org).
 
     Returns structured change blocks with context, statistics, and
     divergence detection.
@@ -515,6 +530,7 @@ async def compare_documents(request: DocumentCompareRequest):
 
     try:
         result = await rag_service.compare_documents(
+            org_slug,
             request.base_file_id,
             request.comparison_file_id,
             max_changes=request.max_changes,
@@ -590,13 +606,17 @@ async def compare_files(
 
 
 @router.post("/documents/statuses", response_model=DocumentStatusResponse)
-async def get_document_statuses(request: DocumentStatusRequest):
-    """Get statuses for multiple documents by ID.
+async def get_document_statuses(
+    request: DocumentStatusRequest,
+    org_slug: str = Depends(require_org_slug),
+):
+    """Get statuses for multiple documents by ID, scoped to caller's org.
 
-    Returns status info for each file_id, or null if not found.
+    Returns status info for each file_id, or null if the file doesn't
+    exist in `org_slug` (including IDs that exist for a different org).
     """
     try:
-        statuses_raw = await rag_service.get_document_statuses(request.file_ids)
+        statuses_raw = await rag_service.get_document_statuses(org_slug, request.file_ids)
         statuses = {
             did: DocumentStatusInfo(
                 status=info["status"],
diff --git a/services/rag/app/services/indexing_service.py b/services/rag/app/services/indexing_service.py
index 93a0625c1e..c7793d9b32 100644
--- a/services/rag/app/services/indexing_service.py
+++ b/services/rag/app/services/indexing_service.py
@@ -155,6 +155,7 @@ def _extract_file_dates(
 
 async def _update_progress(
     pool: asyncpg.Pool,
+    org_slug: str,
     file_id: str,
     phase: str,
     detail: str,
@@ -164,18 +165,20 @@ async def _update_progress(
         async with acquire_with_retry(pool) as conn:
             await conn.execute(
                 f"""UPDATE {SCHEMA}.documents
-                    SET progress_phase = $2, progress_detail = $3, updated_at = NOW()
-                    WHERE file_id = $1 AND status = 'processing'""",
+                    SET progress_phase = $3, progress_detail = $4, updated_at = NOW()
+                    WHERE org_slug = $1 AND file_id = $2 AND status = 'processing'""",
+                org_slug,
                 file_id,
                 phase,
                 detail,
             )
     except Exception:
-        logger.debug("Failed to update progress for {}", file_id)
+        logger.debug("Failed to update progress for {}/{}", org_slug, file_id)
 
 
 def _make_extraction_progress_callback(
     pool: asyncpg.Pool,
+    org_slug: str,
     file_id: str,
     loop: Any,
     *,
@@ -195,7 +198,7 @@ def on_progress(pages_done: int, total_pages: int) -> None:
             return
         last_flush = now
         detail = f"{pages_done}/{total_pages}"
-        loop.create_task(_update_progress(pool, file_id, "extracting", detail))
+        loop.create_task(_update_progress(pool, org_slug, file_id, "extracting", detail))
 
     return on_progress
 
@@ -259,15 +262,24 @@ async def prepare_document(
 
 async def find_existing_by_hash(
     pool: asyncpg.Pool,
+    org_slug: str,
     content_hash: str,
 ) -> uuid.UUID | None:
-    """Find a completed document with the given content hash (any scope).
+    """Find a completed document with the given content hash within `org_slug`.
 
-    Returns the internal UUID (documents.id) if found, else None.
+    Cross-org content-hash dedup is intentionally disabled — if org B
+    secretly uploaded a file with the same content as one in org A, org
+    A could probe for org B's documents by hash. Returns None for any
+    match outside the caller's org.
+
+    Returns the internal UUID (documents.id) if a same-org match exists.
     """
     async with acquire_with_retry(pool) as conn:
         row = await conn.fetchrow(
-            f"SELECT id FROM {SCHEMA}.documents WHERE content_hash = $1 AND status = 'completed' LIMIT 1",
+            f"""SELECT id FROM {SCHEMA}.documents
+                WHERE org_slug = $1 AND content_hash = $2 AND status = 'completed'
+                LIMIT 1""",
+            org_slug,
             content_hash,
         )
     return row["id"] if row else None
@@ -275,6 +287,7 @@ async def find_existing_by_hash(
 
 async def clone_from_existing(
     pool: asyncpg.Pool,
+    org_slug: str,
     source_doc_id: uuid.UUID,
     file_id: str,
     filename: str,
@@ -290,7 +303,9 @@ async def clone_from_existing(
     """
     async with acquire_with_retry(pool) as conn:
         existing = await conn.fetchrow(
-            f"SELECT id, content_hash FROM {SCHEMA}.documents WHERE file_id = $1",
+            f"""SELECT id, content_hash FROM {SCHEMA}.documents
+                WHERE org_slug = $1 AND file_id = $2""",
+            org_slug,
             file_id,
         )
 
@@ -308,6 +323,7 @@ async def clone_from_existing(
         try:
             result = await _do_clone(
                 pool,
+                org_slug,
                 source_doc_id,
                 file_id,
                 filename,
@@ -333,6 +349,7 @@ async def clone_from_existing(
 
 async def _do_clone(
     pool: asyncpg.Pool,
+    org_slug: str,
     source_doc_id: uuid.UUID,
     file_id: str,
     filename: str,
@@ -344,14 +361,19 @@ async def _do_clone(
     """Clone chunks from source document in a single transaction.
 
     Uses ON CONFLICT to atomically handle concurrent writes for the same
-    file_id.  Returns None if the source document has no chunks (e.g.
-    deleted concurrently).
+    (org_slug, file_id).  Returns None if the source document has no chunks
+    (e.g. deleted concurrently). Source and target must share `org_slug` —
+    `find_existing_by_hash` is org-scoped, so a foreign-org `source_doc_id`
+    shouldn't reach here; the chunks INSERT additionally filters by
+    `org_slug` as defense-in-depth.
     """
     async with acquire_with_retry(pool) as conn, conn.transaction():
         source = await conn.fetchrow(
             f"""SELECT chunks_count, source_created_at, source_modified_at
-                FROM {SCHEMA}.documents WHERE id = $1 AND status = 'completed'""",
+                FROM {SCHEMA}.documents
+                WHERE id = $1 AND org_slug = $2 AND status = 'completed'""",
             source_doc_id,
+            org_slug,
         )
         if not source:
             return None
@@ -359,10 +381,10 @@ async def _do_clone(
         doc_row = await conn.fetchrow(
             f"""
             INSERT INTO {SCHEMA}.documents
-                (file_id, filename, content_hash, status, chunks_count,
+                (org_slug, file_id, filename, content_hash, status, chunks_count,
                  source_created_at, source_modified_at)
-            VALUES ($1, $2, $3, 'completed', $4, $5, $6)
-            ON CONFLICT (file_id, COALESCE(team_id, ''))
+            VALUES ($1, $2, $3, $4, 'completed', $5, $6, $7)
+            ON CONFLICT (org_slug, file_id)
             DO UPDATE SET
                 filename = EXCLUDED.filename,
                 content_hash = EXCLUDED.content_hash,
@@ -376,6 +398,7 @@ async def _do_clone(
                 updated_at = NOW()
             RETURNING id, (xmax = 0) AS is_insert
             """,
+            org_slug,
             file_id,
             filename,
             content_hash,
@@ -388,25 +411,28 @@ async def _do_clone(
         # On UPDATE (not a fresh insert), remove old chunks first
         if not doc_row["is_insert"]:
             await conn.execute(
-                f"DELETE FROM {SCHEMA}.chunks WHERE document_id = $1",
+                f"DELETE FROM {SCHEMA}.chunks WHERE document_id = $1 AND org_slug = $2",
                 doc_uuid,
+                org_slug,
             )
 
         chunks_created = await conn.fetchval(
             f"""
             WITH inserted AS (
                 INSERT INTO {SCHEMA}.chunks
-                    (document_id, chunk_index, chunk_content, content_hash, embedding,
+                    (document_id, org_slug, chunk_index, chunk_content,
+                     content_hash, embedding,
                      core_content, prefix_overlap, suffix_overlap)
-                SELECT $1, chunk_index, chunk_content, content_hash, embedding,
+                SELECT $1, $2, chunk_index, chunk_content, content_hash, embedding,
                        core_content, prefix_overlap, suffix_overlap
                 FROM {SCHEMA}.chunks
-                WHERE document_id = $2
+                WHERE document_id = $3 AND org_slug = $2
                 RETURNING 1
             )
             SELECT count(*) FROM inserted
             """,
             doc_uuid,
+            org_slug,
             source_doc_id,
         )
 
@@ -429,6 +455,7 @@ async def _reindex_chunks_hnsw(pool: asyncpg.Pool) -> None:
 
 async def _do_store(
     pool: asyncpg.Pool,
+    org_slug: str,
     file_id: str,
     filename: str,
     prepared: PreparedDocument,
@@ -436,9 +463,9 @@ async def _do_store(
     """Upsert document and replace chunks in a single transaction.
 
     Uses ON CONFLICT to atomically handle concurrent writes for the same
-    file_id.  A WHERE clause on content_hash skips the update (and chunk
-    replacement) when the content hasn't changed — this is the atomic
-    equivalent of the old pre-transaction dedup SELECT.
+    (org_slug, file_id). A WHERE clause on content_hash skips the update
+    (and chunk replacement) when the content hasn't changed — this is the
+    atomic equivalent of the old pre-transaction dedup SELECT.
 
     When the WHERE filters out the update, RETURNING yields no rows —
     we treat that as "content unchanged, skip".
@@ -447,10 +474,10 @@ async def _do_store(
         doc_row = await conn.fetchrow(
             f"""
                 INSERT INTO {SCHEMA}.documents
-                    (file_id, filename, content_hash, status, chunks_count,
+                    (org_slug, file_id, filename, content_hash, status, chunks_count,
                      source_created_at, source_modified_at, ocr_applied)
-                VALUES ($1, $2, $3, 'completed', $4, $5, $6, $7)
-                ON CONFLICT (file_id, COALESCE(team_id, ''))
+                VALUES ($1, $2, $3, $4, 'completed', $5, $6, $7, $8)
+                ON CONFLICT (org_slug, file_id)
                 DO UPDATE SET
                     filename = EXCLUDED.filename,
                     content_hash = EXCLUDED.content_hash,
@@ -466,6 +493,7 @@ async def _do_store(
                 WHERE {SCHEMA}.documents.content_hash IS DISTINCT FROM EXCLUDED.content_hash
                 RETURNING id, (xmax = 0) AS is_insert
                 """,
+            org_slug,
             file_id,
             filename,
             prepared.content_hash,
@@ -490,13 +518,15 @@ async def _do_store(
         # On UPDATE (not a fresh insert), remove old chunks first
         if not doc_row["is_insert"]:
             await conn.execute(
-                f"DELETE FROM {SCHEMA}.chunks WHERE document_id = $1",
+                f"DELETE FROM {SCHEMA}.chunks WHERE document_id = $1 AND org_slug = $2",
                 doc_uuid,
+                org_slug,
             )
 
         chunk_rows = [
             (
                 doc_uuid,
+                org_slug,
                 chunk.index,
                 chunk.content,
                 compute_content_hash(chunk.content.encode("utf-8")),
@@ -510,10 +540,10 @@ async def _do_store(
         await conn.executemany(
             f"""
                 INSERT INTO {SCHEMA}.chunks
-                    (document_id, chunk_index, chunk_content,
+                    (document_id, org_slug, chunk_index, chunk_content,
                      content_hash, embedding,
                      core_content, prefix_overlap, suffix_overlap)
-                VALUES ($1, $2, $3, $4, $5::vector, $6, $7, $8)
+                VALUES ($1, $2, $3, $4, $5, $6::vector, $7, $8, $9)
                 """,
             chunk_rows,
         )
@@ -529,6 +559,7 @@ async def _do_store(
 
 async def store_prepared_document(
     pool: asyncpg.Pool,
+    org_slug: str,
     file_id: str,
     filename: str,
     prepared: PreparedDocument,
@@ -541,7 +572,7 @@ async def store_prepared_document(
     """
     for attempt in range(2):
         try:
-            result = await _do_store(pool, file_id, filename, prepared)
+            result = await _do_store(pool, org_slug, file_id, filename, prepared)
             if result["skipped"]:
                 logger.info("Document {} content unchanged, skipping", file_id)
             else:
@@ -560,6 +591,7 @@ async def store_prepared_document(
 
 async def index_document(
     pool: asyncpg.Pool,
+    org_slug: str,
     file_id: str,
     content_bytes: bytes,
     filename: str,
@@ -573,21 +605,24 @@ async def index_document(
 ) -> dict[str, Any]:
     """Index a document: extract, chunk, embed, and store.
 
-    Attempts content-hash dedup first: if another document already has the same
-    content, clone its chunks instead of re-extracting/embedding.
+    Attempts content-hash dedup first: if another document IN THE SAME ORG
+    already has the same content, clone its chunks instead of re-extracting/
+    embedding. Cross-org dedup is intentionally not attempted (see
+    `find_existing_by_hash` docstring).
     """
     content_hash = compute_content_hash(content_bytes)
 
-    # Fast path: same file_id with unchanged content AND chunks already stored —
-    # skip immediately instead of re-extracting/embedding.
+    # Fast path: same file_id within this org with unchanged content AND
+    # chunks already stored — skip immediately instead of re-extracting.
     async with acquire_with_retry(pool) as conn:
         own_row = await conn.fetchrow(
             f"""SELECT d.content_hash,
                        (SELECT COUNT(*)
                         FROM {SCHEMA}.chunks c
-                        WHERE c.document_id = d.id) AS chunk_count
+                        WHERE c.document_id = d.id AND c.org_slug = $1) AS chunk_count
                 FROM {SCHEMA}.documents d
-                WHERE d.file_id = $1""",
+                WHERE d.org_slug = $1 AND d.file_id = $2""",
+            org_slug,
             file_id,
         )
     if own_row and own_row["content_hash"] == content_hash and own_row["chunk_count"] > 0:
@@ -603,7 +638,8 @@ async def index_document(
                         progress_phase = NULL,
                         progress_detail = NULL,
                         updated_at = NOW()
-                    WHERE file_id = $1""",
+                    WHERE org_slug = $1 AND file_id = $2""",
+                org_slug,
                 file_id,
             )
         return {
@@ -614,11 +650,12 @@ async def index_document(
             "skip_reason": "content_unchanged",
         }
 
-    source_id = await find_existing_by_hash(pool, content_hash)
+    source_id = await find_existing_by_hash(pool, org_slug, content_hash)
 
     if source_id is not None:
         result = await clone_from_existing(
             pool,
+            org_slug,
             source_id,
             file_id,
             filename,
@@ -633,9 +670,9 @@ async def index_document(
     import asyncio as _aio
 
     loop = _aio.get_running_loop()
-    extraction_cb = _make_extraction_progress_callback(pool, file_id, loop)
+    extraction_cb = _make_extraction_progress_callback(pool, org_slug, file_id, loop)
 
-    await _update_progress(pool, file_id, "extracting", "")
+    await _update_progress(pool, org_slug, file_id, "extracting", "")
 
     prepared = await prepare_document(
         content_bytes,
@@ -658,6 +695,7 @@ async def index_document(
 
     await _update_progress(
         pool,
+        org_slug,
         file_id,
         "embedding",
         f"{len(prepared.chunks)} chunks",
@@ -670,10 +708,11 @@ async def index_document(
             source_modified_at=source_modified_at or prepared.source_modified_at,
         )
 
-    await _update_progress(pool, file_id, "storing", "")
+    await _update_progress(pool, org_slug, file_id, "storing", "")
 
     return await store_prepared_document(
         pool,
+        org_slug,
         file_id,
         filename,
         prepared,
diff --git a/services/rag/app/services/rag_service.py b/services/rag/app/services/rag_service.py
index 57cb60e9e2..d6bd5cefab 100644
--- a/services/rag/app/services/rag_service.py
+++ b/services/rag/app/services/rag_service.py
@@ -1,12 +1,19 @@
 """Main RAG service.
 
-Provides: add_document, search, generate, delete_document.
-All operations use the private_knowledge schema in tale_knowledge database.
+Provides: add_document, search, generate, delete_document,
+get_document_content, get_document_statuses, compare_documents,
+compare_files.
 
-Multi-org: each public method requires an `org_slug` so the LLM /
-embedding / vision clients used for the call come from THAT org's
-provider catalog at `<TALE_CONFIG_DIR>/<org>/providers/`. Per-org client
-state is built lazily and cached for `_CONFIG_CHECK_INTERVAL` seconds.
+All public methods take `org_slug` as their first argument so the SQL
+layer can scope by `org_slug` and the per-org LLM / embedding / vision
+clients can be loaded from THAT org's provider catalog at
+`<TALE_CONFIG_DIR>/<org>/providers/`. Per-org client state is built
+lazily and cached for `_CONFIG_CHECK_INTERVAL` seconds.
+
+Tenant isolation is enforced at the data layer: `documents` and `chunks`
+both carry an `org_slug` column (NOT NULL DEFAULT 'default') and a
+composite FK ties chunks.org_slug to documents.org_slug. Every SELECT /
+UPDATE / DELETE / INSERT filters by `org_slug`.
 
 Embedding **dimensions** are global: the underlying knowledge DB uses
 one vector column, so all orgs sharing this RAG instance must use the
@@ -327,6 +334,7 @@ async def add_document(
 
         return await index_document(
             self._pool,
+            org_slug,
             file_id,
             content,
             filename,
@@ -347,41 +355,39 @@ async def search(
         similarity_threshold: float | None = None,
         file_ids: list[str] | None = None,
     ) -> tuple[list[dict[str, Any]], Any]:
-        """Search the knowledge base using hybrid BM25 + vector search.
+        """Search the knowledge base scoped to `org_slug`.
 
-        Returns a `(results, embedding_usage)` tuple so the per-call
-        embedding usage is propagated alongside the results — earlier
-        this hung on a mutable `self.last_search_usage` attribute that
-        concurrent calls overwrote, mis-attributing tokens across
-        callers under any real QPS.
+        Returns a `(results, embedding_usage)` tuple — the underlying
+        `RagSearchService.search` returns the tuple directly so there's
+        no shared singleton attribution race across concurrent callers.
         """
         clients = await self._ensure_org_clients(org_slug)
 
         effective_top_k = top_k if top_k is not None else settings.top_k
         threshold = similarity_threshold if similarity_threshold is not None else settings.similarity_threshold
 
-        results = await clients.search_service.search(
+        results, usage = await clients.search_service.search(
+            org_slug,
             query,
             file_ids=file_ids,
             top_k=effective_top_k,
             similarity_threshold=threshold,
         )
-        usage = getattr(clients.search_service, "last_search_usage", None)
 
         # If no results and some files are still indexing, wait and retry once
         if not results and file_ids:
-            statuses = await self.get_document_statuses(file_ids)
+            statuses = await self.get_document_statuses(org_slug, file_ids)
             has_processing = any(s is not None and s.get("status") == "processing" for s in statuses.values())
             if has_processing:
                 logger.info("No results and some files still indexing, retrying in 3s")
                 await asyncio.sleep(3)
-                results = await clients.search_service.search(
+                results, usage = await clients.search_service.search(
+                    org_slug,
                     query,
                     file_ids=file_ids,
                     top_k=effective_top_k,
                     similarity_threshold=threshold,
                 )
-                usage = getattr(clients.search_service, "last_search_usage", None)
 
         return results, usage
 
@@ -479,17 +485,18 @@ async def generate(
 
     async def get_document_content(
         self,
+        org_slug: str,
         file_id: str,
         *,
         chunk_start: int = 1,
         chunk_end: int | None = None,
         return_chunks: bool = False,
     ) -> dict[str, Any] | None:
-        """Retrieve document content by reassembling stored chunks.
+        """Retrieve document content by reassembling stored chunks, scoped to org.
 
-        Does not require an org slug: documents are looked up by file_id
-        in the shared knowledge schema. Access control / tenancy is
-        enforced at the platform → RAG boundary.
+        Returns None for documents that don't exist in `org_slug` — including
+        documents that exist for a different org (no cross-tenant disclosure
+        via 200 vs 404 differential).
         """
         if not self.initialized:
             await self.initialize()
@@ -500,14 +507,15 @@ async def get_document_content(
         if chunk_end is None:
             chunk_end = chunk_start + self.MAX_CHUNK_WINDOW - 1
 
-        where = "file_id = $1"
-        params: list[Any] = [file_id]
-
         async with acquire_with_retry(self._pool) as conn:
             doc = await conn.fetchrow(
-                f"SELECT id, file_id, filename, chunks_count, source_created_at, source_modified_at"
-                f" FROM {SCHEMA}.documents WHERE {where} LIMIT 1",
-                *params,
+                f"""SELECT id, file_id, filename, chunks_count,
+                           source_created_at, source_modified_at
+                    FROM {SCHEMA}.documents
+                    WHERE org_slug = $1 AND file_id = $2
+                    LIMIT 1""",
+                org_slug,
+                file_id,
             )
 
             if doc is None:
@@ -517,14 +525,18 @@ async def get_document_content(
             total_chunks = doc["chunks_count"]
 
             # Convert 1-indexed API params to 0-indexed chunk_index
-            chunk_params: list[Any] = [doc_uuid, chunk_start - 1, chunk_end - 1]
-
             rows = await conn.fetch(
-                f"SELECT chunk_index, chunk_content, core_content "
-                f"FROM {SCHEMA}.chunks "
-                f"WHERE document_id = $1 AND chunk_index >= $2 AND chunk_index <= $3 "
-                f"ORDER BY chunk_index ASC",
-                *chunk_params,
+                f"""SELECT chunk_index, chunk_content, core_content
+                    FROM {SCHEMA}.chunks
+                    WHERE org_slug = $1
+                      AND document_id = $2
+                      AND chunk_index >= $3
+                      AND chunk_index <= $4
+                    ORDER BY chunk_index ASC""",
+                org_slug,
+                doc_uuid,
+                chunk_start - 1,
+                chunk_end - 1,
             )
 
         if not rows:
@@ -568,12 +580,14 @@ async def get_document_content(
 
     async def get_document_statuses(
         self,
+        org_slug: str,
         file_ids: list[str],
     ) -> dict[str, dict[str, Any] | None]:
-        """Get statuses for multiple documents by file_id.
+        """Get statuses for multiple documents by file_id, scoped to org.
 
-        Returns a dict mapping file_id to status info or None if not found.
-        Org-agnostic (status lookup uses the shared knowledge schema).
+        Returns a dict mapping file_id → status info, or None for IDs that
+        don't exist in `org_slug` (including IDs that exist for a different
+        org — those return None too, to avoid cross-tenant disclosure).
         """
         if not self.initialized:
             await self.initialize()
@@ -588,7 +602,7 @@ async def get_document_statuses(
                     file_id, status, error, progress_phase, progress_detail,
                     source_created_at, source_modified_at, ocr_applied
                 FROM {SCHEMA}.documents
-                WHERE file_id = ANY($1)
+                WHERE org_slug = $1 AND file_id = ANY($2)
                 ORDER BY file_id,
                     CASE status
                         WHEN 'processing' THEN 0
@@ -598,6 +612,7 @@ async def get_document_statuses(
                     END,
                     updated_at DESC
                 """,
+                org_slug,
                 file_ids,
             )
 
@@ -617,12 +632,16 @@ async def get_document_statuses(
 
     async def delete_document(
         self,
+        org_slug: str,
         file_id: str,
     ) -> dict[str, Any]:
-        """Delete a document and its chunks from the knowledge base.
+        """Delete a document (and its chunks via FK CASCADE) within `org_slug`.
 
-        Org-agnostic: file_id is globally unique in this schema. Access
-        control is enforced at the platform → RAG boundary.
+        Scoped to `org_slug`: a foreign-org file_id will return zero
+        deletions rather than touching another tenant's data. The composite
+        FK on (document_id, org_slug) means chunks cascade automatically,
+        but we still scope the DELETE on chunks first to keep the
+        transaction explicit.
         """
         if not self.initialized:
             await self.initialize()
@@ -634,7 +653,9 @@ async def delete_document(
 
         async with acquire_with_retry(self._pool) as conn:
             rows = await conn.fetch(
-                f"SELECT id FROM {SCHEMA}.documents WHERE file_id = $1",
+                f"""SELECT id FROM {SCHEMA}.documents
+                    WHERE org_slug = $1 AND file_id = $2""",
+                org_slug,
                 file_id,
             )
 
@@ -652,11 +673,15 @@ async def delete_document(
 
         async with acquire_with_retry(self._pool) as conn, conn.transaction():
             await conn.execute(
-                f"DELETE FROM {SCHEMA}.chunks WHERE document_id = ANY($1)",
+                f"""DELETE FROM {SCHEMA}.chunks
+                    WHERE org_slug = $1 AND document_id = ANY($2)""",
+                org_slug,
                 ids_to_delete,
             )
             await conn.execute(
-                f"DELETE FROM {SCHEMA}.documents WHERE id = ANY($1)",
+                f"""DELETE FROM {SCHEMA}.documents
+                    WHERE org_slug = $1 AND id = ANY($2)""",
+                org_slug,
                 ids_to_delete,
             )
 
@@ -672,20 +697,18 @@ async def delete_document(
 
     async def compare_documents(
         self,
+        org_slug: str,
         base_file_id: str,
         comparison_file_id: str,
         *,
         max_changes: int = 500,
     ) -> dict[str, Any] | None:
-        """Compare two documents using deterministic paragraph-level diffing.
-
-        Org-agnostic — operates on stored documents by file_id.
-        """
+        """Compare two stored documents (both must belong to `org_slug`)."""
         from .diff_service import compute_diff
 
         base, comp = await asyncio.gather(
-            self.get_document_content(base_file_id),
-            self.get_document_content(comparison_file_id),
+            self.get_document_content(org_slug, base_file_id),
+            self.get_document_content(org_slug, comparison_file_id),
         )
 
         if base is None:
diff --git a/services/rag/app/services/search_service.py b/services/rag/app/services/search_service.py
index 1b9d00117b..bd4c70dac1 100644
--- a/services/rag/app/services/search_service.py
+++ b/services/rag/app/services/search_service.py
@@ -1,7 +1,9 @@
 """Hybrid search service for the RAG pipeline.
 
 BM25 full-text (pg_search) + pgvector similarity with RRF fusion.
-Scoping via file_ids. Optional semantic caching and cross-encoder re-ranking.
+Per-tenant scoping by `org_slug` (always applied), optional further
+restriction by `file_ids`. Optional semantic caching and cross-encoder
+re-ranking.
 """
 
 from __future__ import annotations
@@ -50,41 +52,49 @@ def __init__(self, pool: asyncpg.Pool, embedding_service: EmbeddingService):
 
     async def search(
         self,
+        org_slug: str,
         query: str,
         *,
         file_ids: list[str] | None = None,
         top_k: int = 10,
         similarity_threshold: float = 0.0,
-    ) -> list[dict[str, Any]]:
-        """Hybrid BM25 + vector search with document scoping.
+    ) -> tuple[list[dict[str, Any]], EmbeddingUsage]:
+        """Hybrid BM25 + vector search scoped to `org_slug`.
 
         Args:
+            org_slug: Tenant slug — ALWAYS used to filter `chunks.org_slug`
+                so no cross-tenant rows can match.
             query: Search query text.
-            file_ids: Optional file IDs to restrict search to.
+            file_ids: Optional file IDs to further restrict search to. The
+                org filter is independent — file_ids only narrows within
+                the org.
             top_k: Maximum number of results to return.
             similarity_threshold: Minimum cosine similarity for vector results.
                 Results below this threshold are discarded before RRF merge.
 
         Returns:
-            List of result dicts with content, score, file_id.
-            Embedding token usage available via `self.last_search_usage` after call.
+            ``(results, embedding_usage)`` — usage returned alongside the
+            results so concurrent callers can't trample each other's
+            attribution via a shared singleton.
         """
         query_embedding: list[float] | None = None
-        self.last_search_usage = EmbeddingUsage(model=self._embedding._model)
+        usage = EmbeddingUsage(model=self._embedding._model)
         try:
             t0 = time.time()
             query_result, fts_results = await asyncio.gather(
                 _timed("embed", self._embedding.embed_query_with_usage(query)),
-                _timed("fts", self._fts_search(query, file_ids, top_k * 3)),
+                _timed("fts", self._fts_search(query, org_slug, file_ids, top_k * 3)),
             )
             query_embedding = query_result.embedding
-            self.last_search_usage = query_result.usage
+            usage = query_result.usage
             logger.debug("PERF embed+FTS total: {:.1f}ms", (time.time() - t0) * 1000)
 
-            # Semantic cache: check for a cached result before vector search
+            # Semantic cache: check for a cached result before vector search.
+            # Cache is org-scoped — see SemanticCache.lookup.
             if self._semantic_cache and query_embedding:
                 cache_t0 = time.time()
                 cached = await self._semantic_cache.lookup(
+                    org_slug,
                     query_embedding,
                     threshold=settings.semantic_cache_similarity_threshold,
                 )
@@ -95,12 +105,12 @@ async def search(
                         cached_results = json.loads(cached.response_text)
                         for r in cached_results:
                             r["cached"] = True
-                        return cached_results
+                        return cached_results, usage
                     except (json.JSONDecodeError, TypeError):
                         logger.warning("Invalid cached response format, performing fresh search")
 
             vec_t0 = time.time()
-            vector_results = await self._vector_search(query_embedding, file_ids, top_k * 3)
+            vector_results = await self._vector_search(query_embedding, org_slug, file_ids, top_k * 3)
             vec_ms = (time.time() - vec_t0) * 1000
             logger.debug("PERF vector search: {:.1f}ms", vec_ms)
 
@@ -120,10 +130,10 @@ async def search(
                         top_score,
                     )
                 if pre_count > 0 and not vector_results:
-                    return []
+                    return [], usage
 
             if not fts_results and not vector_results:
-                return []
+                return [], usage
 
             merged = merge_rrf([fts_results, vector_results], top_k)
 
@@ -166,10 +176,11 @@ async def search(
                 for item in merged
             ]
 
-            # Semantic cache: store results for future lookups
+            # Semantic cache: store results for future lookups (org-scoped).
             if self._semantic_cache and query_embedding and results:
                 result_file_ids = [r["file_id"] for r in results if r.get("file_id")]
                 await self._semantic_cache.store(
+                    org_slug,
                     query,
                     query_embedding,
                     json.dumps(results, default=str),
@@ -177,14 +188,14 @@ async def search(
                     file_ids=result_file_ids,
                 )
 
-            return results
+            return results, usage
 
         except asyncpg.UndefinedTableError:
             logger.info("Tables not yet created, returning empty results")
-            return []
+            return [], usage
         except asyncpg.UndefinedColumnError:
             logger.info("Schema not ready, returning empty results")
-            return []
+            return [], usage
         except (asyncpg.InternalServerError, asyncpg.DataCorruptedError) as e:
             is_bm25 = "bm25" in str(e).lower()
             is_corruption = isinstance(e, asyncpg.DataCorruptedError)
@@ -199,8 +210,8 @@ async def search(
 
                 if query_embedding is None:
                     query_embedding = await self._embedding.embed_query(query)
-                vector_results = await self._vector_search(query_embedding, file_ids, top_k)
-                return [
+                vector_results = await self._vector_search(query_embedding, org_slug, file_ids, top_k)
+                results = [
                     {
                         "content": item.get("core_content") or item.get("chunk_content") or "",
                         "score": item["score"],
@@ -211,16 +222,37 @@ async def search(
                     }
                     for item in vector_results
                 ]
+                return results, usage
             raise
 
-    def _build_scope_clause(self, file_ids: list[str] | None, param_offset: int) -> tuple[str, list[Any]]:
-        """Build WHERE clause for document scoping."""
-        if not file_ids:
-            return "", []
+    def _build_scope_clause(
+        self,
+        org_slug: str,
+        file_ids: list[str] | None,
+        param_offset: int,
+    ) -> tuple[str, list[Any]]:
+        """Build WHERE clause for per-tenant + optional file scoping.
+
+        `org_slug` is ALWAYS added as `AND c.org_slug = $N`. When `file_ids`
+        is non-empty, a secondary clause restricts the documents subquery
+        AND is itself org-scoped (defense in depth — even if the outer
+        chunks filter were ever bypassed by a code mistake, the inner
+        documents lookup also has org filter).
+        """
+        org_param = param_offset + 1
+        clause = f" AND c.org_slug = ${org_param}"
+        params: list[Any] = [org_slug]
+
+        if file_ids:
+            file_param = param_offset + 2
+            clause += (
+                f" AND c.document_id IN ("
+                f"SELECT id FROM {SCHEMA}.documents "
+                f"WHERE org_slug = ${org_param} AND file_id = ANY(${file_param}))"
+            )
+            params.append(file_ids)
 
-        idx = param_offset + 1
-        clause = f" AND c.document_id IN (SELECT id FROM {SCHEMA}.documents WHERE file_id = ANY(${idx}))"
-        return clause, [file_ids]
+        return clause, params
 
     async def _rebuild_bm25_index(self) -> None:
         """Rebuild the BM25 index after corruption. Runs as a background task."""
@@ -235,10 +267,11 @@ async def _rebuild_bm25_index(self) -> None:
     async def _fts_search(
         self,
         query: str,
+        org_slug: str,
         file_ids: list[str] | None,
         limit: int,
     ) -> list[dict[str, Any]]:
-        tenant_clause, tenant_params = self._build_scope_clause(file_ids, 1)
+        tenant_clause, tenant_params = self._build_scope_clause(org_slug, file_ids, 1)
 
         sql = f"""
             SELECT c.id, c.chunk_content, c.core_content, c.chunk_index, c.document_id,
@@ -268,11 +301,12 @@ async def _fts_search(
     async def _vector_search(
         self,
         embedding: list[float],
+        org_slug: str,
         file_ids: list[str] | None,
         limit: int,
     ) -> list[dict[str, Any]]:
         vec_str = json.dumps(embedding)
-        tenant_clause, tenant_params = self._build_scope_clause(file_ids, 1)
+        tenant_clause, tenant_params = self._build_scope_clause(org_slug, file_ids, 1)
 
         sql = f"""
             SELECT c.id, c.chunk_content, c.core_content, c.chunk_index, c.document_id,
diff --git a/services/rag/app/services/semantic_cache.py b/services/rag/app/services/semantic_cache.py
index 6e08fa2505..2380645d8b 100644
--- a/services/rag/app/services/semantic_cache.py
+++ b/services/rag/app/services/semantic_cache.py
@@ -1,8 +1,7 @@
 """Semantic cache for RAG search results.
 
-Two-tier approach: exact-match on query text, then cosine similarity
-on query embeddings. Stores results with TTL and supports invalidation
-by file IDs.
+Per-tenant cosine-similarity lookup keyed on `org_slug`. Stores results
+with TTL and supports invalidation by file IDs.
 """
 
 from __future__ import annotations
@@ -40,11 +39,17 @@ def __init__(self, pool: asyncpg.Pool):
         self._pool = pool
 
     async def ensure_table(self) -> None:
-        """Create the semantic_cache table if it does not exist."""
+        """Create the semantic_cache table if it does not exist.
+
+        Schema mirrors what the dbmate migration produces so a fresh
+        deployment that runs this before the migration still gets an
+        org-scoped table.
+        """
         async with acquire_with_retry(self._pool) as conn:
             await conn.execute(f"""
                 CREATE TABLE IF NOT EXISTS {SCHEMA}.semantic_cache (
                     id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+                    org_slug TEXT NOT NULL DEFAULT 'default',
                     query_text TEXT NOT NULL,
                     query_embedding vector NOT NULL,
                     response_text TEXT NOT NULL,
@@ -55,16 +60,16 @@ async def ensure_table(self) -> None:
                     file_ids TEXT[] DEFAULT '{{}}'
                 )
             """)
-            # Create HNSW index for cosine similarity lookups
+            # HNSW index for cosine similarity lookups
             await conn.execute(f"""
                 CREATE INDEX IF NOT EXISTS idx_semantic_cache_embedding
                 ON {SCHEMA}.semantic_cache
                 USING hnsw (query_embedding vector_cosine_ops)
             """)
-            # Index for expiration cleanup
+            # Index for org-scoped expiration cleanup
             await conn.execute(f"""
-                CREATE INDEX IF NOT EXISTS idx_semantic_cache_expires_at
-                ON {SCHEMA}.semantic_cache (expires_at)
+                CREATE INDEX IF NOT EXISTS idx_pk_semcache_org_expires
+                ON {SCHEMA}.semantic_cache (org_slug, expires_at)
             """)
             # Index for file-based invalidation
             await conn.execute(f"""
@@ -74,18 +79,25 @@ async def ensure_table(self) -> None:
 
     async def lookup(
         self,
+        org_slug: str,
         query_embedding: list[float],
         *,
         threshold: float = 0.95,
     ) -> CacheEntry | None:
-        """Find a cached result by cosine similarity.
+        """Find a cached result by cosine similarity within `org_slug`.
+
+        Tenant scoping: ALWAYS filters by `org_slug = $org`. Two orgs with
+        semantically identical queries get independent cache entries — no
+        cross-tenant response leak via embedding similarity.
 
         Args:
+            org_slug: Tenant slug.
             query_embedding: Embedding vector for the query.
             threshold: Minimum cosine similarity (0.0 to 1.0).
 
         Returns:
-            CacheEntry if a sufficiently similar cached query exists, else None.
+            CacheEntry if a sufficiently similar cached query exists in
+            this org, else None.
         """
         vec_str = json.dumps(query_embedding)
         now = datetime.now(UTC)
@@ -97,12 +109,14 @@ async def lookup(
                     SELECT query_text, response_text, metadata, hit_count, created_at,
                            1 - (query_embedding <=> $1::vector) AS similarity
                     FROM {SCHEMA}.semantic_cache
-                    WHERE expires_at > $2
-                      AND 1 - (query_embedding <=> $1::vector) >= $3
+                    WHERE org_slug = $2
+                      AND expires_at > $3
+                      AND 1 - (query_embedding <=> $1::vector) >= $4
                     ORDER BY query_embedding <=> $1::vector
                     LIMIT 1
                     """,
                     vec_str,
+                    org_slug,
                     now,
                     threshold,
                 )
@@ -110,13 +124,14 @@ async def lookup(
                 if row is None:
                     return None
 
-                # Increment hit count (fire-and-forget)
+                # Increment hit count (org-scoped; fire-and-forget)
                 await conn.execute(
                     f"""
                     UPDATE {SCHEMA}.semantic_cache
                     SET hit_count = hit_count + 1
-                    WHERE query_text = $1 AND expires_at > $2
+                    WHERE org_slug = $1 AND query_text = $2 AND expires_at > $3
                     """,
+                    org_slug,
                     row["query_text"],
                     now,
                 )
@@ -141,6 +156,7 @@ async def lookup(
 
     async def store(
         self,
+        org_slug: str,
         query: str,
         embedding: list[float],
         response: str,
@@ -149,9 +165,10 @@ async def store(
         ttl_hours: int = 24,
         file_ids: list[str] | None = None,
     ) -> None:
-        """Store a query-response pair in the cache.
+        """Store a query-response pair in the cache, scoped to `org_slug`.
 
         Args:
+            org_slug: Tenant slug.
             query: Original query text.
             embedding: Query embedding vector.
             response: Response text to cache.
@@ -169,9 +186,11 @@ async def store(
                 await conn.execute(
                     f"""
                     INSERT INTO {SCHEMA}.semantic_cache
-                        (query_text, query_embedding, response_text, metadata, expires_at, file_ids)
-                    VALUES ($1, $2::vector, $3, $4::jsonb, $5, $6)
+                        (org_slug, query_text, query_embedding, response_text,
+                         metadata, expires_at, file_ids)
+                    VALUES ($1, $2, $3::vector, $4, $5::jsonb, $6, $7)
                     """,
+                    org_slug,
                     query,
                     vec_str,
                     response,
@@ -184,10 +203,14 @@ async def store(
         except Exception:
             logger.warning("Semantic cache store failed", exc_info=True)
 
-    async def invalidate(self, file_ids: list[str]) -> int:
-        """Remove cache entries referencing any of the given file IDs.
+    async def invalidate(self, org_slug: str, file_ids: list[str]) -> int:
+        """Remove this org's cache entries referencing any of the given file IDs.
+
+        Org-scoped: org A invalidating a file_id never touches org B's cache,
+        even if both orgs happen to use the same file_id string.
 
         Args:
+            org_slug: Tenant slug.
             file_ids: File IDs whose cached entries should be purged.
 
         Returns:
@@ -201,13 +224,19 @@ async def invalidate(self, file_ids: list[str]) -> int:
                 result = await conn.execute(
                     f"""
                     DELETE FROM {SCHEMA}.semantic_cache
-                    WHERE file_ids && $1
+                    WHERE org_slug = $1 AND file_ids && $2
                     """,
+                    org_slug,
                     file_ids,
                 )
                 count = int(result.split()[-1]) if result else 0
                 if count > 0:
-                    logger.info("Invalidated {} semantic cache entries for file_ids={}", count, file_ids)
+                    logger.info(
+                        "Invalidated {} semantic cache entries for org={} file_ids={}",
+                        count,
+                        org_slug,
+                        file_ids,
+                    )
                 return count
         except (asyncpg.UndefinedTableError, asyncpg.UndefinedColumnError):
             return 0
@@ -215,25 +244,43 @@ async def invalidate(self, file_ids: list[str]) -> int:
             logger.warning("Semantic cache invalidation failed", exc_info=True)
             return 0
 
-    async def cleanup(self) -> int:
+    async def cleanup(self, org_slug: str | None = None) -> int:
         """Remove expired cache entries.
 
+        When `org_slug` is provided, only purges that org's expired rows.
+        When `None`, purges expired rows across all orgs — intended for
+        operator-side periodic GC; per-org lazy cleanup must pass a slug.
+
         Returns:
             Number of entries deleted.
         """
         now = datetime.now(UTC)
         try:
             async with acquire_with_retry(self._pool) as conn:
-                result = await conn.execute(
-                    f"""
-                    DELETE FROM {SCHEMA}.semantic_cache
-                    WHERE expires_at <= $1
-                    """,
-                    now,
-                )
+                if org_slug is None:
+                    result = await conn.execute(
+                        f"""
+                        DELETE FROM {SCHEMA}.semantic_cache
+                        WHERE expires_at <= $1
+                        """,
+                        now,
+                    )
+                else:
+                    result = await conn.execute(
+                        f"""
+                        DELETE FROM {SCHEMA}.semantic_cache
+                        WHERE org_slug = $1 AND expires_at <= $2
+                        """,
+                        org_slug,
+                        now,
+                    )
                 count = int(result.split()[-1]) if result else 0
                 if count > 0:
-                    logger.info("Cleaned up {} expired semantic cache entries", count)
+                    logger.info(
+                        "Cleaned up {} expired semantic cache entries (org={})",
+                        count,
+                        org_slug or "<all>",
+                    )
                 return count
         except (asyncpg.UndefinedTableError, asyncpg.UndefinedColumnError):
             return 0
diff --git a/services/rag/app/utils/sanitize.py b/services/rag/app/utils/sanitize.py
index db47e3b39a..e62d397a88 100644
--- a/services/rag/app/utils/sanitize.py
+++ b/services/rag/app/utils/sanitize.py
@@ -1,27 +1,27 @@
-"""Team ID sanitization for multi-tenant document storage."""
+"""Org slug sanitization for multi-tenant document storage."""
 
 import re
 
 
-def sanitize_team_id(team_id: str) -> str:
-    """Sanitize a team_id by replacing invalid characters.
+def sanitize_org_slug(org_slug: str) -> str:
+    """Sanitize an org_slug by replacing invalid characters.
 
     - Spaces and dots replaced with underscores
     - Non-alphanumeric/underscore/hyphen characters removed
     - Collapses multiple underscores, strips leading/trailing underscores
 
     Raises:
-        ValueError: If team_id sanitizes to empty string.
+        ValueError: If org_slug sanitizes to empty string.
     """
-    if not team_id:
-        raise ValueError("team_id must not be empty")
+    if not org_slug:
+        raise ValueError("org_slug must not be empty")
 
-    result = team_id.replace(" ", "_").replace(".", "_")
+    result = org_slug.replace(" ", "_").replace(".", "_")
     result = re.sub(r"[^a-zA-Z0-9_-]", "", result)
     result = re.sub(r"_+", "_", result)
     result = result.strip("_")
 
     if not result:
-        raise ValueError(f"team_id '{team_id}' sanitized to empty string")
+        raise ValueError(f"org_slug '{org_slug}' sanitized to empty string")
 
     return result
diff --git a/services/rag/migrations/20260528000001_enforce_org_slug.sql b/services/rag/migrations/20260528000001_enforce_org_slug.sql
new file mode 100644
index 0000000000..22b86be26a
--- /dev/null
+++ b/services/rag/migrations/20260528000001_enforce_org_slug.sql
@@ -0,0 +1,138 @@
+-- migrate:up
+-- Enforce per-tenant data isolation at the data layer.
+--
+-- The legacy `team_id` column existed in `documents` and `chunks` but was
+-- never populated; rename it to `org_slug`, backfill the implicit
+-- single-tenant 'default' value, make it NOT NULL DEFAULT 'default', and
+-- add a cross-table FK so chunks.org_slug must match documents.org_slug.
+-- Add `org_slug` to `semantic_cache` so the embedding-similarity lookup
+-- cannot pull another org's cached response.
+--
+-- Every statement is idempotent: each DDL guards on current state via
+-- information_schema / pg_constraint so re-running this migration (manually
+-- with psql, or after a backup restore that lost the schema_migrations
+-- record) is a no-op.
+
+----------------------------------------------------------------
+-- 1) documents: backfill NULL team_id, then rename → org_slug
+----------------------------------------------------------------
+DO $$
+BEGIN
+    IF EXISTS (
+        SELECT 1 FROM information_schema.columns
+        WHERE table_schema = 'private_knowledge'
+          AND table_name   = 'documents'
+          AND column_name  = 'team_id'
+    ) THEN
+        UPDATE private_knowledge.documents
+        SET    team_id = 'default'
+        WHERE  team_id IS NULL;
+
+        ALTER TABLE private_knowledge.documents
+        RENAME COLUMN team_id TO org_slug;
+    END IF;
+END;
+$$;
+
+-- ALTER ... SET DEFAULT / SET NOT NULL are idempotent in Postgres
+-- (setting to the same value is a no-op). Safe to re-run.
+ALTER TABLE private_knowledge.documents
+    ALTER COLUMN org_slug SET DEFAULT 'default';
+ALTER TABLE private_knowledge.documents
+    ALTER COLUMN org_slug SET NOT NULL;
+
+----------------------------------------------------------------
+-- 2) chunks: same pattern
+----------------------------------------------------------------
+DO $$
+BEGIN
+    IF EXISTS (
+        SELECT 1 FROM information_schema.columns
+        WHERE table_schema = 'private_knowledge'
+          AND table_name   = 'chunks'
+          AND column_name  = 'team_id'
+    ) THEN
+        UPDATE private_knowledge.chunks
+        SET    team_id = 'default'
+        WHERE  team_id IS NULL;
+
+        ALTER TABLE private_knowledge.chunks
+        RENAME COLUMN team_id TO org_slug;
+    END IF;
+END;
+$$;
+
+ALTER TABLE private_knowledge.chunks
+    ALTER COLUMN org_slug SET DEFAULT 'default';
+ALTER TABLE private_knowledge.chunks
+    ALTER COLUMN org_slug SET NOT NULL;
+
+----------------------------------------------------------------
+-- 3) Drop stale indexes that referenced team_id (idempotent via IF EXISTS)
+----------------------------------------------------------------
+DROP INDEX IF EXISTS private_knowledge.idx_pk_docs_unique_scope;  -- old (file_id, COALESCE(team_id, ''))
+DROP INDEX IF EXISTS private_knowledge.idx_pk_docs_team;          -- old WHERE team_id IS NOT NULL
+DROP INDEX IF EXISTS private_knowledge.idx_pk_chunks_team;        -- old WHERE team_id IS NOT NULL
+
+----------------------------------------------------------------
+-- 4) Create org-scoped indexes (idempotent via IF NOT EXISTS)
+----------------------------------------------------------------
+CREATE UNIQUE INDEX IF NOT EXISTS idx_pk_docs_unique_org_file
+    ON private_knowledge.documents(org_slug, file_id);
+
+CREATE INDEX IF NOT EXISTS idx_pk_chunks_org_doc
+    ON private_knowledge.chunks(org_slug, document_id);
+
+----------------------------------------------------------------
+-- 5) UNIQUE (id, org_slug) on documents — FK target for chunks
+----------------------------------------------------------------
+DO $$
+BEGIN
+    IF NOT EXISTS (
+        SELECT 1 FROM pg_constraint
+        WHERE  conname  = 'documents_id_org_unique'
+          AND  conrelid = 'private_knowledge.documents'::regclass
+    ) THEN
+        ALTER TABLE private_knowledge.documents
+        ADD CONSTRAINT documents_id_org_unique UNIQUE (id, org_slug);
+    END IF;
+END;
+$$;
+
+----------------------------------------------------------------
+-- 6) Replace single-column chunks FK with composite (idempotent)
+----------------------------------------------------------------
+-- DROP IF EXISTS is safe on re-run; after first success the old name
+-- no longer exists and this is a no-op.
+ALTER TABLE private_knowledge.chunks
+    DROP CONSTRAINT IF EXISTS chunks_document_id_fkey;
+
+DO $$
+BEGIN
+    IF NOT EXISTS (
+        SELECT 1 FROM pg_constraint
+        WHERE  conname  = 'chunks_document_id_org_fkey'
+          AND  conrelid = 'private_knowledge.chunks'::regclass
+    ) THEN
+        ALTER TABLE private_knowledge.chunks
+        ADD CONSTRAINT chunks_document_id_org_fkey
+        FOREIGN KEY (document_id, org_slug)
+        REFERENCES  private_knowledge.documents(id, org_slug)
+        ON DELETE CASCADE;
+    END IF;
+END;
+$$;
+
+----------------------------------------------------------------
+-- 7) semantic_cache: add org_slug column + index (both idempotent)
+----------------------------------------------------------------
+ALTER TABLE private_knowledge.semantic_cache
+    ADD COLUMN IF NOT EXISTS org_slug TEXT NOT NULL DEFAULT 'default';
+
+CREATE INDEX IF NOT EXISTS idx_pk_semcache_org_expires
+    ON private_knowledge.semantic_cache(org_slug, expires_at);
+
+-- migrate:down
+-- Intentionally empty: re-allowing NULL org_slug would resurrect the
+-- cross-tenant leak this migration exists to close. Operators needing
+-- to roll back the entire branch should restore from backup.
diff --git a/services/rag/tests/test_background_ingest.py b/services/rag/tests/test_background_ingest.py
index f7357dd15b..05759c6708 100644
--- a/services/rag/tests/test_background_ingest.py
+++ b/services/rag/tests/test_background_ingest.py
@@ -92,7 +92,7 @@ async def test_returns_status_for_found_documents(self):
         )
 
         with patch("app.services.rag_service.acquire_with_retry", return_value=_async_ctx(mock_conn)):
-            result = await service.get_document_statuses(["doc-1", "doc-2", "doc-3"])
+            result = await service.get_document_statuses(TEST_ORG, ["doc-1", "doc-2", "doc-3"])
 
         assert result["doc-1"]["status"] == "completed"
         assert result["doc-2"]["status"] == "processing"
@@ -121,7 +121,7 @@ async def test_returns_error_field_for_failed_documents(self):
         )
 
         with patch("app.services.rag_service.acquire_with_retry", return_value=_async_ctx(mock_conn)):
-            result = await service.get_document_statuses(["doc-1"])
+            result = await service.get_document_statuses(TEST_ORG, ["doc-1"])
 
         assert result["doc-1"]["status"] == "failed"
         assert result["doc-1"]["error"] == "Embedding failed"
@@ -136,7 +136,7 @@ async def test_missing_documents_return_none(self):
         mock_conn = _mock_conn(fetch_return=[])
 
         with patch("app.services.rag_service.acquire_with_retry", return_value=_async_ctx(mock_conn)):
-            result = await service.get_document_statuses(["nonexistent"])
+            result = await service.get_document_statuses(TEST_ORG, ["nonexistent"])
 
         assert result["nonexistent"] is None
 
@@ -148,7 +148,7 @@ async def test_raises_if_pool_is_none(self):
         service._pool = None
 
         with pytest.raises(RuntimeError, match="database pool is None"):
-            await service.get_document_statuses(["doc-1"])
+            await service.get_document_statuses(TEST_ORG, ["doc-1"])
 
 
 # ============================================================================
@@ -232,7 +232,7 @@ async def test_skipped_content_marks_completed(self):
                 "test.txt",
             )
 
-        mock_mark.assert_awaited_once_with("doc-1")
+        mock_mark.assert_awaited_once_with(TEST_ORG, "doc-1")
 
     async def test_non_skipped_does_not_call_mark_completed(self):
         from app.routers.documents import _background_ingest
@@ -276,7 +276,8 @@ async def test_ingestion_failure_records_sanitized_error(self):
             )
 
         mock_fail.assert_awaited_once()
-        error_arg = mock_fail.call_args[0][2]
+        # signature: _record_failure(org_slug, file_id, filename, error)
+        error_arg = mock_fail.call_args[0][3]
         assert len(error_arg) <= 503  # 500 + "..."
 
     async def test_record_failure_error_does_not_propagate(self):
diff --git a/services/rag/tests/test_document_content.py b/services/rag/tests/test_document_content.py
index 9af7fb1ef9..cb18f4d656 100644
--- a/services/rag/tests/test_document_content.py
+++ b/services/rag/tests/test_document_content.py
@@ -14,6 +14,8 @@
 
 pytestmark = pytest.mark.asyncio
 
+TEST_ORG = "test-org"
+
 
 def _make_service():
     """Create a RagService with all internal dependencies pre-mocked."""
@@ -87,7 +89,7 @@ async def test_returns_full_content(self):
         mock_conn = _mock_conn(fetchrow_return=DOC_ROW, fetch_return=CHUNK_ROWS)
 
         with patch("app.services.rag_service.acquire_with_retry", return_value=_async_ctx(mock_conn)):
-            result = await service.get_document_content("doc-1")
+            result = await service.get_document_content(TEST_ORG, "doc-1")
 
         assert result is not None
         assert result["file_id"] == "doc-1"
@@ -107,7 +109,7 @@ async def test_legacy_rows_use_double_newline_join(self):
         )
 
         with patch("app.services.rag_service.acquire_with_retry", return_value=_async_ctx(mock_conn)):
-            result = await service.get_document_content("doc-1")
+            result = await service.get_document_content(TEST_ORG, "doc-1")
 
         assert result["content"] == "AAA\n\nBBB"
 
@@ -127,7 +129,7 @@ async def test_migrated_rows_use_core_content_join(self):
         )
 
         with patch("app.services.rag_service.acquire_with_retry", return_value=_async_ctx(mock_conn)):
-            result = await service.get_document_content("doc-1")
+            result = await service.get_document_content(TEST_ORG, "doc-1")
 
         assert result["content"] == "ABCDEF"
 
@@ -136,7 +138,7 @@ async def test_returns_none_for_nonexistent_document(self):
         mock_conn = _mock_conn(fetchrow_return=None)
 
         with patch("app.services.rag_service.acquire_with_retry", return_value=_async_ctx(mock_conn)):
-            result = await service.get_document_content("nonexistent")
+            result = await service.get_document_content(TEST_ORG, "nonexistent")
 
         assert result is None
 
@@ -149,7 +151,7 @@ async def test_chunk_range_filters_correctly(self):
         mock_conn = _mock_conn(fetchrow_return=DOC_ROW, fetch_return=filtered_chunks)
 
         with patch("app.services.rag_service.acquire_with_retry", return_value=_async_ctx(mock_conn)):
-            result = await service.get_document_content("doc-1", chunk_start=2, chunk_end=3)
+            result = await service.get_document_content(TEST_ORG, "doc-1", chunk_start=2, chunk_end=3)
 
         assert result is not None
         assert result["chunk_range"] == {"start": 2, "end": 3}
@@ -163,7 +165,7 @@ async def test_chunk_start_only_returns_from_start_to_end(self):
         mock_conn = _mock_conn(fetchrow_return=DOC_ROW, fetch_return=chunks)
 
         with patch("app.services.rag_service.acquire_with_retry", return_value=_async_ctx(mock_conn)):
-            result = await service.get_document_content("doc-1", chunk_start=4)
+            result = await service.get_document_content(TEST_ORG, "doc-1", chunk_start=4)
 
         assert result is not None
         assert result["chunk_range"] == {"start": 4, "end": 5}
@@ -173,7 +175,7 @@ async def test_empty_chunks_returns_empty_content(self):
         mock_conn = _mock_conn(fetchrow_return=DOC_ROW, fetch_return=[])
 
         with patch("app.services.rag_service.acquire_with_retry", return_value=_async_ctx(mock_conn)):
-            result = await service.get_document_content("doc-1", chunk_start=100)
+            result = await service.get_document_content(TEST_ORG, "doc-1", chunk_start=100)
 
         assert result is not None
         assert result["content"] == ""
@@ -187,7 +189,7 @@ async def test_single_chunk_document(self):
         )
 
         with patch("app.services.rag_service.acquire_with_retry", return_value=_async_ctx(mock_conn)):
-            result = await service.get_document_content("doc-1")
+            result = await service.get_document_content(TEST_ORG, "doc-1")
 
         assert result is not None
         assert result["content"] == "Only chunk."
@@ -199,10 +201,11 @@ async def test_max_chunk_window_caps_unbounded_request(self):
         mock_conn = _mock_conn(fetchrow_return=DOC_ROW, fetch_return=CHUNK_ROWS)
 
         with patch("app.services.rag_service.acquire_with_retry", return_value=_async_ctx(mock_conn)):
-            await service.get_document_content("doc-1", chunk_start=1)
+            await service.get_document_content(TEST_ORG, "doc-1", chunk_start=1)
 
         fetch_call = mock_conn.fetch.call_args
         sql = fetch_call[0][0]
-        assert "chunk_index <= $3" in sql
-        chunk_end_param = fetch_call[0][3]
+        # org_slug at $1, document_id at $2, chunk_index >= $3, chunk_index <= $4.
+        assert "chunk_index <= $4" in sql
+        chunk_end_param = fetch_call[0][4]
         assert chunk_end_param == service.MAX_CHUNK_WINDOW - 1
diff --git a/services/rag/tests/test_file_dates.py b/services/rag/tests/test_file_dates.py
index da9a63e2a2..69be4f8c19 100644
--- a/services/rag/tests/test_file_dates.py
+++ b/services/rag/tests/test_file_dates.py
@@ -27,6 +27,8 @@
 
 pytestmark = pytest.mark.asyncio
 
+TEST_ORG = "test-org"
+
 
 class TestParsePdfDate:
     """PDF date format ``D:YYYYMMDDHHmmSSOHH'mm'`` parsing."""
@@ -309,6 +311,7 @@ async def test_caller_dates_override_file_dates(self):
         ):
             result = await index_document(
                 pool,
+                TEST_ORG,
                 "doc-1",
                 b"content",
                 "test.pdf",
@@ -318,14 +321,14 @@ async def test_caller_dates_override_file_dates(self):
 
         assert result["success"] is True
 
-        # The UPSERT query has 7 positional args:
-        # $1=file_id, $2=filename, $3=content_hash, $4=chunks_count,
-        # $5=source_created_at, $6=source_modified_at, $7=ocr_applied
+        # _do_store UPSERT now has 8 positional args (org_slug prepended):
+        # $1=org_slug, $2=file_id, $3=filename, $4=content_hash,
+        # $5=chunks_count, $6=source_created_at, $7=source_modified_at, $8=ocr_applied
         # call_args_list[0] = early-dedup check, [1] = UPSERT
         insert_call = mock_conn.fetchrow.call_args_list[1]
         args = insert_call[0]
         # args[0] is the SQL string, positional params start at args[1]
-        source_created_arg = args[5]  # $5
+        source_created_arg = args[6]  # $6
         assert source_created_arg == caller_created
 
 
@@ -368,6 +371,7 @@ async def test_clone_uses_caller_dates_over_source(self):
         with patch("app.services.indexing_service.acquire_with_retry", return_value=ctx):
             result = await _do_clone(
                 pool,
+                TEST_ORG,
                 source_doc_id=42,
                 file_id="clone-test",
                 filename="test.pdf",
@@ -378,13 +382,13 @@ async def test_clone_uses_caller_dates_over_source(self):
 
         assert result is not None
 
-        # INSERT has 6 positional args:
-        # $1=file_id, $2=filename, $3=content_hash, $4=chunks_count,
-        # $5=source_created_at, $6=source_modified_at
+        # _do_clone INSERT now has 7 positional args (org_slug prepended):
+        # $1=org_slug, $2=file_id, $3=filename, $4=content_hash,
+        # $5=chunks_count, $6=source_created_at, $7=source_modified_at
         insert_call = mock_conn.fetchrow.call_args_list[1]
         args = insert_call[0]
-        assert args[5] == caller_created
-        assert args[6] == caller_modified
+        assert args[6] == caller_created
+        assert args[7] == caller_modified
 
     async def test_clone_falls_back_to_source_dates_when_no_override(self):
         from app.services.indexing_service import _do_clone
@@ -420,6 +424,7 @@ async def test_clone_falls_back_to_source_dates_when_no_override(self):
         with patch("app.services.indexing_service.acquire_with_retry", return_value=ctx):
             result = await _do_clone(
                 pool,
+                TEST_ORG,
                 source_doc_id=42,
                 file_id="clone-test",
                 filename="test.pdf",
@@ -430,8 +435,8 @@ async def test_clone_falls_back_to_source_dates_when_no_override(self):
 
         insert_call = mock_conn.fetchrow.call_args_list[1]
         args = insert_call[0]
-        assert args[5] == source_created
-        assert args[6] == source_modified
+        assert args[6] == source_created
+        assert args[7] == source_modified
 
 
 class TestResponseModelDateFields:
diff --git a/services/rag/tests/test_indexing_service.py b/services/rag/tests/test_indexing_service.py
index 0ee289befc..c0bfbe62b3 100644
--- a/services/rag/tests/test_indexing_service.py
+++ b/services/rag/tests/test_indexing_service.py
@@ -89,6 +89,7 @@ def _async_ctx(mock_conn):
 SAMPLE_FILENAME = "test.txt"
 SAMPLE_DOC_ID = "doc-123"
 SAMPLE_HASH = "abcdef1234567890"
+TEST_ORG = "test-org"
 DIFFERENT_HASH = "ffffffffffffffff"
 
 SAMPLE_CHUNKS = [
@@ -124,6 +125,7 @@ async def test_indexes_new_document(self):
         ):
             result = await index_document(
                 pool,
+                TEST_ORG,
                 SAMPLE_DOC_ID,
                 SAMPLE_CONTENT,
                 SAMPLE_FILENAME,
@@ -153,6 +155,7 @@ async def test_embed_texts_called_with_chunk_contents(self):
         ):
             await index_document(
                 pool,
+                TEST_ORG,
                 SAMPLE_DOC_ID,
                 SAMPLE_CONTENT,
                 SAMPLE_FILENAME,
@@ -178,6 +181,7 @@ async def test_chunk_insert_called_per_chunk(self):
         ):
             await index_document(
                 pool,
+                TEST_ORG,
                 SAMPLE_DOC_ID,
                 SAMPLE_CONTENT,
                 SAMPLE_FILENAME,
@@ -207,6 +211,7 @@ async def test_passes_vision_client_to_extract(self):
         ):
             await index_document(
                 pool,
+                TEST_ORG,
                 SAMPLE_DOC_ID,
                 SAMPLE_CONTENT,
                 SAMPLE_FILENAME,
@@ -234,6 +239,7 @@ async def test_custom_chunk_size_and_overlap(self):
         ):
             await index_document(
                 pool,
+                TEST_ORG,
                 SAMPLE_DOC_ID,
                 SAMPLE_CONTENT,
                 SAMPLE_FILENAME,
@@ -265,6 +271,7 @@ async def test_skips_unchanged_content(self):
         ):
             result = await index_document(
                 pool,
+                TEST_ORG,
                 SAMPLE_DOC_ID,
                 SAMPLE_CONTENT,
                 SAMPLE_FILENAME,
@@ -305,6 +312,7 @@ async def test_reindexes_when_content_changed(self):
         ):
             result = await index_document(
                 pool,
+                TEST_ORG,
                 SAMPLE_DOC_ID,
                 SAMPLE_CONTENT,
                 SAMPLE_FILENAME,
@@ -359,6 +367,7 @@ async def track_execute(sql, *args, **kwargs):
         ):
             await index_document(
                 pool,
+                TEST_ORG,
                 SAMPLE_DOC_ID,
                 SAMPLE_CONTENT,
                 SAMPLE_FILENAME,
@@ -395,6 +404,7 @@ async def test_replacement_uses_transaction(self):
         ):
             await index_document(
                 pool,
+                TEST_ORG,
                 SAMPLE_DOC_ID,
                 SAMPLE_CONTENT,
                 SAMPLE_FILENAME,
@@ -421,6 +431,7 @@ async def test_no_text_extracted_returns_skipped(self):
         ):
             result = await index_document(
                 pool,
+                TEST_ORG,
                 SAMPLE_DOC_ID,
                 SAMPLE_CONTENT,
                 SAMPLE_FILENAME,
@@ -446,6 +457,7 @@ async def test_whitespace_only_text_returns_skipped(self):
         ):
             result = await index_document(
                 pool,
+                TEST_ORG,
                 SAMPLE_DOC_ID,
                 SAMPLE_CONTENT,
                 SAMPLE_FILENAME,
@@ -471,6 +483,7 @@ async def test_no_chunks_produced_returns_skipped(self):
         ):
             result = await index_document(
                 pool,
+                TEST_ORG,
                 SAMPLE_DOC_ID,
                 SAMPLE_CONTENT,
                 SAMPLE_FILENAME,
@@ -493,6 +506,7 @@ async def test_none_text_extracted_returns_skipped(self):
         ):
             result = await index_document(
                 pool,
+                TEST_ORG,
                 SAMPLE_DOC_ID,
                 SAMPLE_CONTENT,
                 SAMPLE_FILENAME,
@@ -524,6 +538,7 @@ async def test_unicode_decode_error_raises_value_error(self):
             with pytest.raises(ValueError, match="Could not decode file"):
                 await index_document(
                     pool,
+                    TEST_ORG,
                     SAMPLE_DOC_ID,
                     SAMPLE_CONTENT,
                     "binary.xyz",
@@ -548,6 +563,7 @@ async def test_unicode_error_message_includes_filename(self):
             with pytest.raises(ValueError, match="my-file.bin"):
                 await index_document(
                     pool,
+                    TEST_ORG,
                     SAMPLE_DOC_ID,
                     SAMPLE_CONTENT,
                     "my-file.bin",
@@ -599,6 +615,7 @@ async def fail_then_succeed(*args, **kwargs):
         ):
             result = await store_prepared_document(
                 pool,
+                TEST_ORG,
                 SAMPLE_DOC_ID,
                 SAMPLE_FILENAME,
                 prepared,
@@ -641,6 +658,7 @@ async def test_raises_on_second_hnsw_corruption(self):
             with pytest.raises(asyncpg.exceptions.InternalServerError):
                 await store_prepared_document(
                     pool,
+                    TEST_ORG,
                     SAMPLE_DOC_ID,
                     SAMPLE_FILENAME,
                     prepared,
@@ -670,6 +688,7 @@ async def test_non_hnsw_internal_error_not_retried(self):
             with pytest.raises(asyncpg.exceptions.InternalServerError, match="some other internal error"):
                 await store_prepared_document(
                     pool,
+                    TEST_ORG,
                     SAMPLE_DOC_ID,
                     SAMPLE_FILENAME,
                     prepared,
@@ -694,7 +713,7 @@ async def test_find_existing_by_hash_returns_id_when_found(self):
         mock_conn.fetchrow = AsyncMock(return_value={"id": 42})
 
         with _patch_acquire(mock_conn):
-            result = await find_existing_by_hash(pool, SAMPLE_HASH)
+            result = await find_existing_by_hash(pool, TEST_ORG, SAMPLE_HASH)
 
         assert result == 42
 
@@ -705,7 +724,7 @@ async def test_find_existing_by_hash_returns_none_when_not_found(self):
         mock_conn.fetchrow = AsyncMock(return_value=None)
 
         with _patch_acquire(mock_conn):
-            result = await find_existing_by_hash(pool, SAMPLE_HASH)
+            result = await find_existing_by_hash(pool, TEST_ORG, SAMPLE_HASH)
 
         assert result is None
 
@@ -718,6 +737,7 @@ async def test_clone_skips_when_target_has_same_hash(self):
         with _patch_acquire(mock_conn):
             result = await clone_from_existing(
                 pool,
+                TEST_ORG,
                 42,
                 SAMPLE_DOC_ID,
                 SAMPLE_FILENAME,
@@ -744,6 +764,7 @@ async def test_clone_copies_chunks_from_source(self):
         with _patch_acquire(mock_conn):
             result = await clone_from_existing(
                 pool,
+                TEST_ORG,
                 42,
                 SAMPLE_DOC_ID,
                 SAMPLE_FILENAME,
@@ -764,6 +785,7 @@ async def test_clone_returns_none_when_source_vanished(self):
         with _patch_acquire(mock_conn):
             result = await clone_from_existing(
                 pool,
+                TEST_ORG,
                 42,
                 SAMPLE_DOC_ID,
                 SAMPLE_FILENAME,
@@ -802,6 +824,7 @@ async def test_index_document_uses_clone_when_hash_exists(self):
         ):
             result = await index_document(
                 pool,
+                TEST_ORG,
                 SAMPLE_DOC_ID,
                 SAMPLE_CONTENT,
                 SAMPLE_FILENAME,
@@ -842,6 +865,7 @@ async def test_index_document_falls_back_when_clone_returns_none(self):
         ):
             result = await index_document(
                 pool,
+                TEST_ORG,
                 SAMPLE_DOC_ID,
                 SAMPLE_CONTENT,
                 SAMPLE_FILENAME,
diff --git a/services/rag/tests/test_org_isolation.py b/services/rag/tests/test_org_isolation.py
new file mode 100644
index 0000000000..e35246a52a
--- /dev/null
+++ b/services/rag/tests/test_org_isolation.py
@@ -0,0 +1,208 @@
+"""Cross-org isolation tests for RagService.
+
+The data layer is now per-tenant: `documents` and `chunks` both carry an
+`org_slug` column (NOT NULL DEFAULT 'default'), every SELECT/UPDATE/DELETE
+filters by it, and chunks.org_slug is FK-tied to documents.org_slug.
+
+These tests pin the invariant down at the application layer by verifying
+that the SQL the service issues actually carries `org_slug` and that the
+methods route caller-supplied slugs into the parameter list.
+"""
+
+from __future__ import annotations
+
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+pytestmark = pytest.mark.asyncio
+
+ORG_A = "org-a"
+ORG_B = "org-b"
+
+
+def _async_ctx(mock_conn):
+    ctx = AsyncMock()
+    ctx.__aenter__ = AsyncMock(return_value=mock_conn)
+    ctx.__aexit__ = AsyncMock(return_value=False)
+    return ctx
+
+
+def _mock_conn(*, fetch_return=None, fetchrow_return=None):
+    conn = AsyncMock()
+    conn.fetch = AsyncMock(return_value=fetch_return or [])
+    conn.fetchrow = AsyncMock(return_value=fetchrow_return)
+    conn.execute = AsyncMock()
+
+    tx = AsyncMock()
+    tx.__aenter__ = AsyncMock(return_value=tx)
+    tx.__aexit__ = AsyncMock(return_value=False)
+    conn.transaction = MagicMock(return_value=tx)
+    return conn
+
+
+def _make_service():
+    """Build a RagService with mocked pool — bypass initialize()."""
+    from app.services.rag_service import RagService
+
+    service = RagService()
+    service.initialized = True
+    service._pool = MagicMock()
+    return service
+
+
+class TestSearchScopeClause:
+    """`_build_scope_clause` always emits the org filter."""
+
+    def test_empty_file_ids_still_scopes_by_org(self):
+        from app.services.search_service import RagSearchService
+
+        svc = RagSearchService(MagicMock(), MagicMock())
+        clause, params = svc._build_scope_clause(ORG_A, [], 1)
+        # P0-1 fix: empty file_ids no longer drops the WHERE clause.
+        assert "org_slug" in clause
+        assert params == [ORG_A]
+
+    def test_none_file_ids_still_scopes_by_org(self):
+        from app.services.search_service import RagSearchService
+
+        svc = RagSearchService(MagicMock(), MagicMock())
+        clause, params = svc._build_scope_clause(ORG_B, None, 1)
+        assert "org_slug" in clause
+        assert "file_id" not in clause
+        assert params == [ORG_B]
+
+    def test_non_empty_file_ids_adds_doc_filter_within_org(self):
+        from app.services.search_service import RagSearchService
+
+        svc = RagSearchService(MagicMock(), MagicMock())
+        clause, params = svc._build_scope_clause(ORG_A, ["d1", "d2"], 1)
+        # Inner documents subquery also scoped by org_slug — defense in depth.
+        assert clause.count("org_slug") == 2
+        assert "file_id = ANY" in clause
+        assert params == [ORG_A, ["d1", "d2"]]
+
+
+class TestDeleteDocumentScopedByOrg:
+    """`delete_document` only deletes within `org_slug`."""
+
+    async def test_delete_passes_org_slug_to_select(self):
+        service = _make_service()
+        mock_conn = _mock_conn(fetch_return=[])  # nothing matches → no-op
+        with patch(
+            "app.services.rag_service.acquire_with_retry",
+            return_value=_async_ctx(mock_conn),
+        ):
+            await service.delete_document(ORG_A, "doc-1")
+
+        # SELECT query must carry org_slug AND file_id.
+        sql, *params = mock_conn.fetch.call_args[0]
+        assert "org_slug = $1" in sql
+        assert "file_id = $2" in sql
+        assert params == [ORG_A, "doc-1"]
+
+    async def test_delete_no_match_returns_zero(self):
+        """Foreign-org file_id returns 0 deletions, not the other org's data."""
+        service = _make_service()
+        mock_conn = _mock_conn(fetch_return=[])
+
+        with patch(
+            "app.services.rag_service.acquire_with_retry",
+            return_value=_async_ctx(mock_conn),
+        ):
+            result = await service.delete_document(ORG_B, "doc-owned-by-org-a")
+
+        assert result["success"] is True
+        assert result["deleted_count"] == 0
+        # No transaction opened when nothing to delete.
+        mock_conn.transaction.assert_not_called()
+
+    async def test_delete_match_scopes_chunks_and_documents_to_org(self):
+        service = _make_service()
+        mock_conn = _mock_conn(fetch_return=[{"id": "uuid-1"}])
+
+        with patch(
+            "app.services.rag_service.acquire_with_retry",
+            return_value=_async_ctx(mock_conn),
+        ):
+            await service.delete_document(ORG_A, "doc-1")
+
+        # Both DELETEs must be scoped by org_slug. asyncpg.execute signature:
+        # execute(sql, *args). We assert both calls.
+        execute_calls = mock_conn.execute.call_args_list
+        assert len(execute_calls) == 2  # chunks then documents
+
+        chunks_sql, *chunks_params = execute_calls[0][0]
+        assert "chunks" in chunks_sql
+        assert "org_slug = $1" in chunks_sql
+        assert chunks_params[0] == ORG_A
+
+        docs_sql, *docs_params = execute_calls[1][0]
+        assert "documents" in docs_sql
+        assert "org_slug = $1" in docs_sql
+        assert docs_params[0] == ORG_A
+
+
+class TestGetDocumentContentScopedByOrg:
+    """`get_document_content` returns None for foreign-org documents."""
+
+    async def test_foreign_org_returns_none(self):
+        service = _make_service()
+        # fetchrow returns None — simulates the SQL not matching because
+        # org_slug $1 filters out the foreign-org row.
+        mock_conn = _mock_conn(fetchrow_return=None)
+
+        with patch(
+            "app.services.rag_service.acquire_with_retry",
+            return_value=_async_ctx(mock_conn),
+        ):
+            result = await service.get_document_content(ORG_B, "doc-owned-by-org-a")
+
+        assert result is None
+        sql, *params = mock_conn.fetchrow.call_args[0]
+        assert "org_slug = $1" in sql
+        assert params[0] == ORG_B
+
+
+class TestGetDocumentStatusesScopedByOrg:
+    """`get_document_statuses` filters by org."""
+
+    async def test_org_filter_threaded_into_sql(self):
+        service = _make_service()
+        mock_conn = _mock_conn(fetch_return=[])
+
+        with patch(
+            "app.services.rag_service.acquire_with_retry",
+            return_value=_async_ctx(mock_conn),
+        ):
+            result = await service.get_document_statuses(ORG_A, ["f1", "f2"])
+
+        # All requested ids resolve to None (foreign-org or unknown).
+        assert result == {"f1": None, "f2": None}
+        sql, *params = mock_conn.fetch.call_args[0]
+        assert "org_slug = $1" in sql
+        assert params[0] == ORG_A
+        assert params[1] == ["f1", "f2"]
+
+
+class TestCrossOrgDedupDisabled:
+    """`find_existing_by_hash` is org-scoped — no cross-org content probing."""
+
+    async def test_same_hash_in_different_org_not_returned(self):
+        from app.services.indexing_service import find_existing_by_hash
+
+        # Mock returning None even when the hash matches — the caller's
+        # org_slug filter is what produces the None.
+        pool = MagicMock()
+        mock_conn = _mock_conn(fetchrow_return=None)
+
+        with patch(
+            "app.services.indexing_service.acquire_with_retry",
+            return_value=_async_ctx(mock_conn),
+        ):
+            result = await find_existing_by_hash(pool, ORG_B, "shared-hash")
+
+        assert result is None
+        sql, *params = mock_conn.fetchrow.call_args[0]
+        assert "org_slug = $1" in sql
+        assert params == [ORG_B, "shared-hash"]
diff --git a/services/rag/tests/test_rag_service.py b/services/rag/tests/test_rag_service.py
index 568d33b8dd..cf7685d55b 100644
--- a/services/rag/tests/test_rag_service.py
+++ b/services/rag/tests/test_rag_service.py
@@ -199,11 +199,15 @@ class TestSearch:
 
     async def test_delegates_to_search_service(self):
         service = _make_service()
+        usage_obj = MagicMock(name="usage")
         service._search_service.search = AsyncMock(
-            return_value=[
-                {"content": "hit 1", "score": 0.9, "file_id": "doc-1"},
-                {"content": "hit 2", "score": 0.8, "file_id": "doc-2"},
-            ]
+            return_value=(
+                [
+                    {"content": "hit 1", "score": 0.9, "file_id": "doc-1"},
+                    {"content": "hit 2", "score": 0.8, "file_id": "doc-2"},
+                ],
+                usage_obj,
+            )
         )
 
         with patch("app.services.rag_service.settings") as mock_settings:
@@ -212,11 +216,11 @@ async def test_delegates_to_search_service(self):
             results, usage = await service.search(TEST_ORG, "test query", file_ids=["doc-1"])
 
         assert len(results) == 2
-        # `search` returns a (results, usage) tuple now — the usage
-        # value is the per-call embedding usage attached to the search
-        # service, not a shared singleton.
-        assert usage is service._search_service.last_search_usage
+        # `search_service.search` now returns the `(results, usage)`
+        # tuple directly — no shared singleton attribute to read.
+        assert usage is usage_obj
         service._search_service.search.assert_awaited_once_with(
+            TEST_ORG,
             "test query",
             file_ids=["doc-1"],
             top_k=10,
@@ -225,7 +229,7 @@ async def test_delegates_to_search_service(self):
 
     async def test_applies_similarity_threshold(self):
         service = _make_service()
-        service._search_service.search = AsyncMock(return_value=[])
+        service._search_service.search = AsyncMock(return_value=([], MagicMock()))
 
         with patch("app.services.rag_service.settings") as mock_settings:
             mock_settings.top_k = 10
@@ -234,6 +238,7 @@ async def test_applies_similarity_threshold(self):
 
         # Threshold is now passed to search_service for vector pre-filtering
         service._search_service.search.assert_awaited_once_with(
+            TEST_ORG,
             "query",
             file_ids=None,
             top_k=10,
@@ -242,7 +247,7 @@ async def test_applies_similarity_threshold(self):
 
     async def test_custom_top_k_overrides_settings(self):
         service = _make_service()
-        service._search_service.search = AsyncMock(return_value=[])
+        service._search_service.search = AsyncMock(return_value=([], MagicMock()))
 
         with patch("app.services.rag_service.settings") as mock_settings:
             mock_settings.top_k = 5
@@ -250,6 +255,7 @@ async def test_custom_top_k_overrides_settings(self):
             await service.search(TEST_ORG, "query", top_k=20)
 
         service._search_service.search.assert_awaited_once_with(
+            TEST_ORG,
             "query",
             file_ids=None,
             top_k=20,
@@ -259,9 +265,10 @@ async def test_custom_top_k_overrides_settings(self):
     async def test_custom_threshold_overrides_settings(self):
         service = _make_service()
         service._search_service.search = AsyncMock(
-            return_value=[
-                {"content": "mid", "score": 0.5, "file_id": "d1"},
-            ]
+            return_value=(
+                [{"content": "mid", "score": 0.5, "file_id": "d1"}],
+                MagicMock(),
+            )
         )
 
         with patch("app.services.rag_service.settings") as mock_settings:
@@ -274,9 +281,10 @@ async def test_custom_threshold_overrides_settings(self):
     async def test_zero_threshold_returns_all(self):
         service = _make_service()
         service._search_service.search = AsyncMock(
-            return_value=[
-                {"content": "a", "score": 0.01, "file_id": "d1"},
-            ]
+            return_value=(
+                [{"content": "a", "score": 0.01, "file_id": "d1"}],
+                MagicMock(),
+            )
         )
 
         with patch("app.services.rag_service.settings") as mock_settings:
@@ -288,7 +296,7 @@ async def test_zero_threshold_returns_all(self):
 
     async def test_passes_file_ids(self):
         service = _make_service()
-        service._search_service.search = AsyncMock(return_value=[])
+        service._search_service.search = AsyncMock(return_value=([], MagicMock()))
         service.get_document_statuses = AsyncMock(return_value={"doc-1": None, "doc-2": None})
 
         with patch("app.services.rag_service.settings") as mock_settings:
@@ -297,6 +305,7 @@ async def test_passes_file_ids(self):
             await service.search(TEST_ORG, "q", file_ids=["doc-1", "doc-2"])
 
         service._search_service.search.assert_awaited_once_with(
+            TEST_ORG,
             "q",
             file_ids=["doc-1", "doc-2"],
             top_k=10,
@@ -498,7 +507,7 @@ async def test_deletes_document(self):
         )
 
         with patch("app.services.rag_service.acquire_with_retry", return_value=_async_ctx(mock_conn)):
-            result = await service.delete_document("doc-1")
+            result = await service.delete_document(TEST_ORG, "doc-1")
 
         assert result["success"] is True
         assert result["deleted_count"] == 1
@@ -514,7 +523,7 @@ async def test_deletes_multiple_matching_docs(self):
         )
 
         with patch("app.services.rag_service.acquire_with_retry", return_value=_async_ctx(mock_conn)):
-            result = await service.delete_document("doc-1")
+            result = await service.delete_document(TEST_ORG, "doc-1")
 
         assert result["deleted_count"] == 2
 
@@ -523,7 +532,7 @@ async def test_no_documents_found_returns_zero_deleted(self):
         mock_conn = _mock_conn(fetch_return=[])
 
         with patch("app.services.rag_service.acquire_with_retry", return_value=_async_ctx(mock_conn)):
-            result = await service.delete_document("nonexistent")
+            result = await service.delete_document(TEST_ORG, "nonexistent")
 
         assert result["success"] is True
         assert result["deleted_count"] == 0
@@ -551,7 +560,7 @@ async def track_execute(sql, *args, **kwargs):
         mock_conn.execute = AsyncMock(side_effect=track_execute)
 
         with patch("app.services.rag_service.acquire_with_retry", return_value=_async_ctx(mock_conn)):
-            await service.delete_document("doc-1")
+            await service.delete_document(TEST_ORG, "doc-1")
 
         assert call_order == ["delete_chunks", "delete_documents"]
 
@@ -565,7 +574,7 @@ async def test_delete_uses_transaction(self):
         )
 
         with patch("app.services.rag_service.acquire_with_retry", return_value=_async_ctx(mock_conn)):
-            await service.delete_document("doc-1")
+            await service.delete_document(TEST_ORG, "doc-1")
 
         mock_conn.transaction.assert_called_once()
 
@@ -574,7 +583,7 @@ async def test_processing_time_is_reported(self):
         mock_conn = _mock_conn(fetch_return=[])
 
         with patch("app.services.rag_service.acquire_with_retry", return_value=_async_ctx(mock_conn)):
-            result = await service.delete_document("doc-1")
+            result = await service.delete_document(TEST_ORG, "doc-1")
 
         assert "processing_time_ms" in result
         assert result["processing_time_ms"] >= 0
diff --git a/services/rag/tests/test_search_service.py b/services/rag/tests/test_search_service.py
index f237d2ea91..f58a635888 100644
--- a/services/rag/tests/test_search_service.py
+++ b/services/rag/tests/test_search_service.py
@@ -21,6 +21,8 @@
 
 pytestmark = pytest.mark.asyncio
 
+TEST_ORG = "test-org"
+
 
 def _make_row(
     row_id: int, chunk_content: str, file_id: str, score: float = 1.0, chunk_index: int = 0
@@ -115,7 +117,7 @@ async def side_effect_fn(*_args, **_kwargs):
             service._fts_search = AsyncMock(return_value=fts_rows)
             service._vector_search = AsyncMock(return_value=vector_rows)
 
-            results = await service.search("test query", file_ids=["doc-1"])
+            results, _usage = await service.search(TEST_ORG, "test query", file_ids=["doc-1"])
 
         assert len(results) > 0
         for r in results:
@@ -129,7 +131,7 @@ async def test_returns_empty_when_both_channels_empty(self):
         service._fts_search = AsyncMock(return_value=[])
         service._vector_search = AsyncMock(return_value=[])
 
-        results = await service.search("nothing here")
+        results, _usage = await service.search(TEST_ORG, "nothing here")
 
         assert results == []
 
@@ -143,7 +145,7 @@ async def test_fts_only_results(self):
         service._fts_search = AsyncMock(return_value=fts_rows)
         service._vector_search = AsyncMock(return_value=[])
 
-        results = await service.search("fts query")
+        results, _usage = await service.search(TEST_ORG, "fts query")
 
         assert len(results) == 2
         assert results[0]["content"] == "Only FTS hit"
@@ -155,7 +157,7 @@ async def test_vector_only_results(self):
         service._fts_search = AsyncMock(return_value=[])
         service._vector_search = AsyncMock(return_value=vector_rows)
 
-        results = await service.search("vector query")
+        results, _usage = await service.search(TEST_ORG, "vector query")
 
         assert len(results) == 1
         assert results[0]["content"] == "Vector hit"
@@ -168,7 +170,7 @@ async def test_top_k_limits_results(self):
         service._fts_search = AsyncMock(return_value=fts_rows)
         service._vector_search = AsyncMock(return_value=vector_rows)
 
-        results = await service.search("query", top_k=5)
+        results, _usage = await service.search(TEST_ORG, "query", top_k=5)
 
         assert len(results) <= 5
 
@@ -177,7 +179,7 @@ async def test_embedding_service_called_with_query(self):
         service._fts_search = AsyncMock(return_value=[])
         service._vector_search = AsyncMock(return_value=[])
 
-        await service.search("my search query")
+        await service.search(TEST_ORG, "my search query")
 
         embed_svc.embed_query_with_usage.assert_awaited_once_with("my search query")
 
@@ -192,11 +194,14 @@ def test_build_scope_clause_with_file_ids(self):
         embed = MagicMock()
         service = RagSearchService(pool, embed)
 
-        clause, params = service._build_scope_clause(["doc-a", "doc-b"], 1)
+        clause, params = service._build_scope_clause(TEST_ORG, ["doc-a", "doc-b"], 1)
 
+        # org filter is ALWAYS present; file_id filter is additive.
+        assert "org_slug" in clause
+        assert "$2" in clause  # org param at offset+1
         assert "file_id" in clause
-        assert "ANY($2)" in clause
-        assert params == [["doc-a", "doc-b"]]
+        assert "ANY($3)" in clause  # file_ids at offset+2
+        assert params == [TEST_ORG, ["doc-a", "doc-b"]]
 
     def test_build_scope_clause_without_file_ids(self):
         from app.services.search_service import RagSearchService
@@ -205,10 +210,12 @@ def test_build_scope_clause_without_file_ids(self):
         embed = MagicMock()
         service = RagSearchService(pool, embed)
 
-        clause, params = service._build_scope_clause(None, 1)
+        clause, params = service._build_scope_clause(TEST_ORG, None, 1)
 
-        assert clause == ""
-        assert params == []
+        # Empty/None file_ids now produces an org-only filter (not "").
+        assert "org_slug" in clause
+        assert "file_id" not in clause
+        assert params == [TEST_ORG]
 
     def test_build_scope_clause_respects_param_offset(self):
         from app.services.search_service import RagSearchService
@@ -217,24 +224,30 @@ def test_build_scope_clause_respects_param_offset(self):
         embed = MagicMock()
         service = RagSearchService(pool, embed)
 
-        clause, params = service._build_scope_clause(["doc-a"], 3)
+        clause, params = service._build_scope_clause(TEST_ORG, ["doc-a"], 3)
 
+        # org param at offset+1 = $4, file_ids at offset+2 = $5.
         assert "$4" in clause
+        assert "$5" in clause
 
     async def test_search_passes_file_ids_to_fts_and_vector(self):
         service, *_ = _build_service()
         service._fts_search = AsyncMock(return_value=[])
         service._vector_search = AsyncMock(return_value=[])
 
-        await service.search("query", file_ids=["doc-1", "doc-2"])
+        await service.search(TEST_ORG, "query", file_ids=["doc-1", "doc-2"])
 
+        # _fts_search signature is now (query, org_slug, file_ids, limit)
         service._fts_search.assert_awaited_once()
         fts_args = service._fts_search.call_args
-        assert fts_args[0][1] == ["doc-1", "doc-2"]
+        assert fts_args[0][1] == TEST_ORG
+        assert fts_args[0][2] == ["doc-1", "doc-2"]
 
+        # _vector_search signature is (embedding, org_slug, file_ids, limit)
         service._vector_search.assert_awaited_once()
         vec_args = service._vector_search.call_args
-        assert vec_args[0][1] == ["doc-1", "doc-2"]
+        assert vec_args[0][1] == TEST_ORG
+        assert vec_args[0][2] == ["doc-1", "doc-2"]
 
 
 class TestGracefulFallback:
@@ -250,7 +263,7 @@ async def test_undefined_table_returns_empty(self):
         with patch.object(service, "_fts_search", side_effect=asyncpg.UndefinedTableError("no table")):
             with patch.object(service, "_vector_search", return_value=[]):
                 with patch.object(service._embedding, "embed_query", return_value=[0.1]):
-                    results = await service.search("query")
+                    results, _usage = await service.search(TEST_ORG, "query")
 
         assert results == []
 
@@ -260,7 +273,7 @@ async def test_undefined_column_returns_empty(self):
         with patch.object(service, "_fts_search", side_effect=asyncpg.UndefinedColumnError("column missing")):
             with patch.object(service, "_vector_search", return_value=[]):
                 with patch.object(service._embedding, "embed_query", return_value=[0.1]):
-                    results = await service.search("query")
+                    results, _usage = await service.search(TEST_ORG, "query")
 
         assert results == []
 
@@ -278,7 +291,7 @@ async def raise_bm25(*args, **kwargs):
         with patch.object(service, "_fts_search", side_effect=raise_bm25):
             with patch.object(service, "_vector_search", return_value=vector_rows):
                 with patch.object(service._embedding, "embed_query", return_value=[0.1]):
-                    results = await service.search("query")
+                    results, _usage = await service.search(TEST_ORG, "query")
 
         assert len(results) == 2
         assert results[0]["content"] == "vec result 1"
@@ -293,7 +306,7 @@ async def test_non_bm25_exception_propagates(self):
             with patch.object(service, "_vector_search", return_value=[]):
                 with patch.object(service._embedding, "embed_query", return_value=[0.1]):
                     with pytest.raises(RuntimeError, match="unexpected db error"):
-                        await service.search("query")
+                        await service.search(TEST_ORG, "query")
 
 
 class TestDataCorruptionRecovery:
@@ -307,7 +320,7 @@ async def test_data_corrupted_error_falls_back_to_vector_only(self):
             with patch.object(service, "_vector_search", return_value=vector_rows):
                 with patch.object(service._embedding, "embed_query", return_value=[0.1]):
                     with patch.object(service, "_rebuild_bm25_index", new_callable=AsyncMock):
-                        results = await service.search("query")
+                        results, _usage = await service.search(TEST_ORG, "query")
 
         assert len(results) == 1
         assert results[0]["content"] == "vec result"
@@ -322,7 +335,7 @@ async def test_data_corrupted_error_triggers_rebuild(self):
             with patch.object(service, "_vector_search", return_value=[]):
                 with patch.object(service._embedding, "embed_query", return_value=[0.1]):
                     with patch.object(service, "_rebuild_bm25_index", new_callable=AsyncMock) as mock_rebuild:
-                        await service.search("query")
+                        await service.search(TEST_ORG, "query")
                         await _asyncio.sleep(0)
 
         mock_rebuild.assert_awaited_once()
@@ -342,7 +355,7 @@ async def test_fts_data_corrupted_error_returns_empty(self):
             mock_acq.return_value.__aenter__ = AsyncMock(return_value=mock_conn)
             mock_acq.return_value.__aexit__ = AsyncMock(return_value=False)
 
-            results = await service._fts_search("query", None, 10)
+            results = await service._fts_search("query", TEST_ORG, None, 10)
 
         assert results == []
 
@@ -404,7 +417,7 @@ async def test_fts_bm25_failure_returns_empty(self):
             mock_acq.return_value.__aenter__ = AsyncMock(return_value=mock_conn)
             mock_acq.return_value.__aexit__ = AsyncMock(return_value=False)
 
-            results = await service._fts_search("query", None, 10)
+            results = await service._fts_search("query", TEST_ORG, None, 10)
 
         assert results == []
 
@@ -423,7 +436,7 @@ async def test_fts_internal_server_error_returns_empty(self):
             mock_acq.return_value.__aenter__ = AsyncMock(return_value=mock_conn)
             mock_acq.return_value.__aexit__ = AsyncMock(return_value=False)
 
-            results = await service._fts_search("query", ["doc-1"], 10)
+            results = await service._fts_search("query", TEST_ORG, ["doc-1"], 10)
 
         assert results == []
 
@@ -443,7 +456,7 @@ async def test_fts_non_db_error_propagates(self):
             mock_acq.return_value.__aexit__ = AsyncMock(return_value=False)
 
             with pytest.raises(RuntimeError, match="connection refused"):
-                await service._fts_search("query", None, 10)
+                await service._fts_search("query", TEST_ORG, None, 10)
 
 
 class TestApplyRecencyBoost:
@@ -580,7 +593,7 @@ async def test_recency_boost_applied_when_enabled(self):
             mock_settings.vector_quality_threshold = 0
             mock_settings.reranking_enabled = False
 
-            results = await service.search("query")
+            results, _usage = await service.search(TEST_ORG, "query")
 
         assert len(results) == 2
         new_doc = next(r for r in results if r["file_id"] == "doc-2")
@@ -602,7 +615,7 @@ async def test_recency_boost_skipped_when_disabled(self):
             mock_settings.reranking_enabled = False
 
             with patch("app.services.search_service._apply_recency_boost") as mock_boost:
-                results = await service.search("query")
+                results, _usage = await service.search(TEST_ORG, "query")
 
                 mock_boost.assert_not_called()
 
diff --git a/services/rag/tests/test_semantic_cache_isolation.py b/services/rag/tests/test_semantic_cache_isolation.py
new file mode 100644
index 0000000000..4a9f53ccb5
--- /dev/null
+++ b/services/rag/tests/test_semantic_cache_isolation.py
@@ -0,0 +1,145 @@
+"""Cross-org isolation tests for SemanticCache.
+
+The semantic cache is shared at the table level (single Postgres table)
+but every SELECT/INSERT/DELETE filters by `org_slug`. Two orgs with
+semantically identical queries get independent cache entries.
+"""
+
+from __future__ import annotations
+
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+pytestmark = pytest.mark.asyncio
+
+ORG_A = "org-a"
+ORG_B = "org-b"
+
+
+def _async_ctx(conn):
+    ctx = AsyncMock()
+    ctx.__aenter__ = AsyncMock(return_value=conn)
+    ctx.__aexit__ = AsyncMock(return_value=False)
+    return ctx
+
+
+class TestLookupOrgScoped:
+    async def test_lookup_threads_org_into_sql(self):
+        from app.services.semantic_cache import SemanticCache
+
+        conn = AsyncMock()
+        conn.fetchrow = AsyncMock(return_value=None)
+        conn.execute = AsyncMock()
+
+        cache = SemanticCache(MagicMock())
+        with patch(
+            "app.services.semantic_cache.acquire_with_retry",
+            return_value=_async_ctx(conn),
+        ):
+            result = await cache.lookup(ORG_A, [0.1, 0.2, 0.3])
+
+        assert result is None
+        sql, *params = conn.fetchrow.call_args[0]
+        # WHERE org_slug = $2 (after $1 vec_str).
+        assert "org_slug = $2" in sql
+        assert params[1] == ORG_A
+
+    async def test_lookup_other_org_does_not_match(self):
+        """Even when a cached row exists for ORG_A, lookups by ORG_B miss."""
+        from app.services.semantic_cache import SemanticCache
+
+        # Simulate the SQL returning None — that's exactly what org filtering
+        # produces against the foreign-org row at the DB layer.
+        conn = AsyncMock()
+        conn.fetchrow = AsyncMock(return_value=None)
+        conn.execute = AsyncMock()
+
+        cache = SemanticCache(MagicMock())
+        with patch(
+            "app.services.semantic_cache.acquire_with_retry",
+            return_value=_async_ctx(conn),
+        ):
+            entry = await cache.lookup(ORG_B, [0.1, 0.2, 0.3])
+
+        assert entry is None
+
+
+class TestStoreOrgScoped:
+    async def test_store_writes_org_slug(self):
+        from app.services.semantic_cache import SemanticCache
+
+        conn = AsyncMock()
+        conn.execute = AsyncMock()
+
+        cache = SemanticCache(MagicMock())
+        with patch(
+            "app.services.semantic_cache.acquire_with_retry",
+            return_value=_async_ctx(conn),
+        ):
+            await cache.store(ORG_A, "q", [0.1], "response", file_ids=["doc-1"])
+
+        sql, *params = conn.execute.call_args[0]
+        assert "org_slug" in sql
+        # The INSERT positional list starts with org_slug at $1.
+        assert params[0] == ORG_A
+
+
+class TestInvalidateOrgScoped:
+    async def test_invalidate_scoped_to_org(self):
+        from app.services.semantic_cache import SemanticCache
+
+        conn = AsyncMock()
+        conn.execute = AsyncMock(return_value="DELETE 2")
+
+        cache = SemanticCache(MagicMock())
+        with patch(
+            "app.services.semantic_cache.acquire_with_retry",
+            return_value=_async_ctx(conn),
+        ):
+            count = await cache.invalidate(ORG_A, ["doc-x"])
+
+        assert count == 2
+        sql, *params = conn.execute.call_args[0]
+        assert "org_slug = $1" in sql
+        assert params[0] == ORG_A
+        assert params[1] == ["doc-x"]
+
+
+class TestCleanup:
+    async def test_cleanup_with_org_scoped(self):
+        from app.services.semantic_cache import SemanticCache
+
+        conn = AsyncMock()
+        conn.execute = AsyncMock(return_value="DELETE 3")
+
+        cache = SemanticCache(MagicMock())
+        with patch(
+            "app.services.semantic_cache.acquire_with_retry",
+            return_value=_async_ctx(conn),
+        ):
+            count = await cache.cleanup(ORG_A)
+
+        assert count == 3
+        sql, *params = conn.execute.call_args[0]
+        assert "org_slug = $1" in sql
+        assert params[0] == ORG_A
+
+    async def test_cleanup_with_no_org_runs_global(self):
+        """Passing None explicitly runs a global GC — operator path only."""
+        from app.services.semantic_cache import SemanticCache
+
+        conn = AsyncMock()
+        conn.execute = AsyncMock(return_value="DELETE 10")
+
+        cache = SemanticCache(MagicMock())
+        with patch(
+            "app.services.semantic_cache.acquire_with_retry",
+            return_value=_async_ctx(conn),
+        ):
+            count = await cache.cleanup(None)
+
+        assert count == 10
+        sql, *_params = conn.execute.call_args[0]
+        # Global cleanup omits the org filter on purpose.
+        assert "org_slug" not in sql

From ee5a6d14330302c1a6051202ac0cb21152bdb8ac Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Thu, 28 May 2026 21:04:38 +0800
Subject: [PATCH 08/41] fix(cli): inline migrate-config-layout script so it
 survives bun --compile
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes P0-3 from the multi-agent review. The compiled binary at
tools/cli/dist/tale ENOENT'd on `tale migrate config-layout` because
the script was loaded via `readFile(import.meta.url + '../migrate-
config-layout/script.sh')`. Bun's `--compile` does not bundle runtime
asset reads — only assets imported with `with { type: 'file' }` or
listed in `Bun.build({entrypoints})`. From source it works; from the
shipped binary the entire upgrade runbook was broken.

Fix mirrors the canonical pattern at reseed-all-orgs.ts:58-71
(RESEED_SCRIPT inline template literal). Bash `${VAR}` collides with
TS template-literal `${...}` interpolation, so each literal `${` in
the script is escaped as `\${`; plain `$VAR` (without braces) stays
unchanged. Removed the now-orphaned tools/cli/src/lib/migrate-config-
layout/script.sh and its parent directory — keeping the .sh on disk
would invite a future contributor to revive `loadScript()`.

Regression guard: new tools/cli/scripts/check-bundle.ts greps the
compiled binary for distinctive markers (`MIGRATE_PLAN`,
`MIGRATE_SUMMARY`, the reseed convex function ref) and exits non-zero
if any are missing. Wired into `bun run build` and `build:windows` as
a post-compile step so CI catches a recurrence of this regression
class loud and early.

Verified: `bun run build` → `check-bundle: OK (3 markers present)`;
compiled binary's `tale migrate config-layout --dry-run` now reaches
the project-validation phase instead of ENOENTing on the script load.
---
 tools/cli/package.json                        |   5 +-
 tools/cli/scripts/check-bundle.ts             |  78 ++++++
 .../src/lib/actions/migrate-config-layout.ts  | 250 ++++++++++++++++--
 .../src/lib/migrate-config-layout/script.sh   | 210 ---------------
 4 files changed, 311 insertions(+), 232 deletions(-)
 create mode 100644 tools/cli/scripts/check-bundle.ts
 delete mode 100644 tools/cli/src/lib/migrate-config-layout/script.sh

diff --git a/tools/cli/package.json b/tools/cli/package.json
index 78e49c31b0..cc2a40e376 100644
--- a/tools/cli/package.json
+++ b/tools/cli/package.json
@@ -5,10 +5,11 @@
   "scripts": {
     "generate": "bun run scripts/generate-embedded.ts",
     "setup": "bun run generate",
-    "build": "bun run generate && bun run build:linux && bun run build:mac",
+    "build": "bun run generate && bun run build:linux && bun run build:mac && bun run check:bundle",
     "build:linux": "bun build --compile --target=bun-linux-x64 --outfile=dist/tale src/index.ts",
     "build:mac": "bun build --compile --target=bun-darwin-arm64 --outfile=dist/tale src/index.ts",
-    "build:windows": "bun run generate && bun build --compile --target=bun-windows-x64 --outfile=dist/tale.exe src/index.ts",
+    "build:windows": "bun run generate && bun build --compile --target=bun-windows-x64 --outfile=dist/tale.exe src/index.ts && bun run scripts/check-bundle.ts dist/tale.exe",
+    "check:bundle": "bun run scripts/check-bundle.ts dist/tale",
     "lint": "bunx oxlint",
     "lint:fix": "bunx oxlint --fix",
     "typecheck": "bunx tsc --noEmit"
diff --git a/tools/cli/scripts/check-bundle.ts b/tools/cli/scripts/check-bundle.ts
new file mode 100644
index 0000000000..3d847da477
--- /dev/null
+++ b/tools/cli/scripts/check-bundle.ts
@@ -0,0 +1,78 @@
+/**
+ * Post-build assertion: greps the compiled binary for distinctive markers
+ * that MUST be embedded for runtime correctness. Catches the regression
+ * class where a `fs.readFile(...)`-from-`import.meta.url` pattern slips
+ * past local-source testing but ENOENTs from the shipped binary.
+ *
+ * History: `tale migrate config-layout` shipped broken in the binary
+ * because the migrate script was loaded at runtime via `readFile` instead
+ * of inlined as a TS template literal. `bun --compile` does not bundle
+ * runtime fs reads. This check makes a recurrence loud.
+ *
+ * Run: bun run scripts/check-bundle.ts dist/tale[.exe]
+ */
+
+import { existsSync, readFileSync } from 'node:fs';
+import { resolve } from 'node:path';
+
+// Distinctive substrings that must be embedded in the compiled binary.
+// Pair each substring with the action whose script contains it so
+// the failure message points the operator at the right place.
+const REQUIRED_MARKERS: ReadonlyArray<readonly [string, string]> = [
+  ['MIGRATE_PLAN', 'migrate-config-layout.ts (MIGRATE_SCRIPT)'],
+  ['MIGRATE_SUMMARY', 'migrate-config-layout.ts (MIGRATE_SCRIPT)'],
+  // reseed-all-orgs.ts already inlines its bash via RESEED_SCRIPT; pin it
+  // so a future refactor that splits it back out also fails this check.
+  [
+    'reseed_all_orgs:reseedAllOrgsFromBuiltin',
+    'reseed-all-orgs.ts (RESEED_SCRIPT)',
+  ],
+];
+
+function main(): void {
+  const binaryArg = process.argv[2];
+  if (!binaryArg) {
+    console.error('usage: bun run scripts/check-bundle.ts <path/to/dist/tale>');
+    process.exit(2);
+  }
+
+  const binaryPath = resolve(process.cwd(), binaryArg);
+  if (!existsSync(binaryPath)) {
+    console.error(`check-bundle: binary not found at ${binaryPath}`);
+    process.exit(2);
+  }
+
+  // Read as raw bytes and decode latin1: every byte maps 1:1 to a code
+  // point so substring search works regardless of the binary's true
+  // encoding. Strings embedded as UTF-8 (the default) survive this view.
+  const bytes = readFileSync(binaryPath);
+  const text = bytes.toString('latin1');
+
+  const missing: Array<[string, string]> = [];
+  for (const [marker, owner] of REQUIRED_MARKERS) {
+    if (!text.includes(marker)) {
+      missing.push([marker, owner]);
+    }
+  }
+
+  if (missing.length === 0) {
+    console.log(
+      `check-bundle: OK (${REQUIRED_MARKERS.length} markers present in ${binaryPath})`,
+    );
+    return;
+  }
+
+  console.error('check-bundle: FAILED — missing required markers:');
+  for (const [marker, owner] of missing) {
+    console.error(`  - ${JSON.stringify(marker)}  (owner: ${owner})`);
+  }
+  console.error(
+    '\nThe likely cause is a runtime `fs.readFile(import.meta.url + ...)` ' +
+      'or similar — Bun --compile does not bundle runtime asset reads. ' +
+      'Inline the asset as a TS template literal (see reseed-all-orgs.ts ' +
+      'RESEED_SCRIPT and migrate-config-layout.ts MIGRATE_SCRIPT).',
+  );
+  process.exit(1);
+}
+
+main();
diff --git a/tools/cli/src/lib/actions/migrate-config-layout.ts b/tools/cli/src/lib/actions/migrate-config-layout.ts
index 37def9547a..fe5d02de64 100644
--- a/tools/cli/src/lib/actions/migrate-config-layout.ts
+++ b/tools/cli/src/lib/actions/migrate-config-layout.ts
@@ -17,10 +17,6 @@
  *      (sha-verifies new == old, then unlinks the olds)
  */
 
-import { readFile } from 'node:fs/promises';
-import { dirname, join } from 'node:path';
-import { fileURLToPath } from 'node:url';
-
 import { getProjectId } from '../../utils/load-env';
 import * as logger from '../../utils/logger';
 import { exec } from '../docker/exec';
@@ -32,22 +28,238 @@ interface MigrateConfigLayoutOptions {
 }
 
 /**
- * Read the migrate script next to this module. The .sh file is the source
- * of truth (also runnable in the shell-script integration harness), and
- * Bun's source-file colocation makes runtime loading work in both `bun
- * run`-from-source and the compiled binary (Bun bundles imported assets).
+ * The migrate script is inlined here so it survives `bun build --compile`.
+ *
+ * Earlier this lived at `tools/cli/src/lib/migrate-config-layout/script.sh`
+ * and was read at runtime via `readFile(fileURLToPath(import.meta.url) +
+ * '../migrate-config-layout/script.sh')`. That works under `bun run` from
+ * source but Bun's `--compile` does NOT bundle runtime `fs.readFile` reads
+ * — only assets imported with `with { type: 'file' }` or explicitly listed
+ * in `Bun.build({entrypoints: [...]})`. The compiled binary then ENOENTed
+ * at `/$bunfs/migrate-config-layout/script.sh`, breaking the entire
+ * upgrade runbook for any operator running a release binary.
+ *
+ * Embedding as a TS template literal is the canonical pattern used by
+ * sibling `reseed-all-orgs.ts:58-71` — bundle-safe by construction. The
+ * `scripts/check-bundle.ts` post-build assertion greps for the
+ * `MIGRATE_PLAN` marker so this regression cannot silently recur.
+ *
+ * Bash uses `${VAR}` for parameter expansion which collides with TS
+ * template-literal `${...}` interpolation. Every literal `${` in the
+ * script must be escaped as `\${`.
  */
-async function loadScript(): Promise<string> {
-  const moduleDir = dirname(fileURLToPath(import.meta.url));
-  const scriptPath = join(
-    moduleDir,
-    '..',
-    'migrate-config-layout',
-    'script.sh',
-  );
-  return await readFile(scriptPath, 'utf-8');
+const MIGRATE_SCRIPT = `#!/bin/bash
+# Migrate providers/*.secrets.json from old per-domain layout to new
+# org-first layout. Idempotent. Uses cp (not mv) so old paths remain
+# readable until the operator runs \`tale migrate config-layout --cleanup-old\`.
+#
+# Old → new mapping:
+#   $DATA/providers/<name>.secrets.json
+#     → $DATA/default/providers/<name>.secrets.json
+#   $DATA/providers/<org>/<name>.secrets.json
+#     → $DATA/<org>/providers/<name>.secrets.json
+#
+# Scope: providers/*.secrets.json ONLY. Non-secret config is reseeded by
+# \`tale deploy --override-all\` against the builtin catalog; non-provider
+# .history/ trails under old paths are intentionally abandoned (the user's
+# "secrets only" runbook trade-off).
+#
+# Designed to run against the CURRENTLY-running convex container (old
+# image, old code paths still active). cp leaves old paths in place so
+# old code keeps reading providers correctly until the operator runs
+# \`tale deploy --override-all -y\` to recreate convex with the new code.
+set -euo pipefail
+
+DRY_RUN=0
+CLEANUP_OLD=0
+for arg in "$@"; do
+  case "$arg" in
+    --dry-run) DRY_RUN=1 ;;
+    --cleanup-old) CLEANUP_OLD=1 ;;
+    *) echo "Unknown arg: $arg" >&2; exit 2 ;;
+  esac
+done
+
+# Defense in depth: \`set -u\` already aborts on unset $DATA, but \${VAR:?…}
+# gives a clearer error message and won't be defeated by a future \`set
+# +u\` somewhere downstream. Critical because some branches below build
+# absolute paths from $DATA and rm them — a silent empty would operate
+# from the container's filesystem root.
+DATA="\${TALE_CONFIG_DIR:-/app/data}"
+: "\${DATA:?DATA must be a non-empty absolute path}"
+APP_UID=1001
+APP_GID=1001
+
+planned=0
+copied=0
+skipped=0
+removed=0
+errors=0
+conflicts=()
+
+# Move a single .secrets.json from old to new path. cp -a preserves mode +
+# ownership (encrypted secrets are 0600 owner:app). Idempotent: if the
+# destination already exists, verify byte-for-byte equality (then skip)
+# rather than overwriting — protects a concurrent UI-side \`atomicWriteSecret\`
+# that landed at the new path between this script's check and its copy.
+copy_secret() {
+  local src="$1" dst="$2"
+  local dst_dir; dst_dir="$(dirname "$dst")"
+  if [ -e "$dst" ]; then
+    if cmp -s "$src" "$dst" 2>/dev/null; then
+      # SKIP belongs to stdout (informational, expected on re-run);
+      # only true ERROR lines go to stderr so the CLI wrapper can
+      # distinguish noise from real failures.
+      skipped=$((skipped+1)); echo "SKIP (already migrated): $src"
+      return 0
+    else
+      conflicts+=("$src != $dst")
+      errors=$((errors+1))
+      echo "ERROR: $dst exists but differs from $src; refusing to overwrite" >&2
+      return 0
+    fi
+  fi
+  if [ "$DRY_RUN" = 1 ]; then
+    echo "MIGRATE_PLAN: mkdir -p $dst_dir && cp -a $src $dst"
+    planned=$((planned+1))
+    return 0
+  fi
+  mkdir -p "$dst_dir"
+  chown "$APP_UID:$APP_GID" "$dst_dir" 2>/dev/null || true
+  cp -a "$src" "$dst"
+  copied=$((copied+1))
+  echo "OK: $src -> $dst"
+}
+
+# Remove an old-path secret IF the new-path copy exists and matches
+# byte-for-byte. Refuses any mismatch — operator must reconcile manually.
+remove_old_secret() {
+  local old="$1" new="$2"
+  if [ ! -e "$old" ]; then return 0; fi
+  if [ ! -e "$new" ]; then
+    conflicts+=("missing new-path counterpart for $old (expected $new)")
+    errors=$((errors+1))
+    echo "ERROR: $new does not exist; refusing to remove $old" >&2
+    return 0
+  fi
+  if ! cmp -s "$old" "$new" 2>/dev/null; then
+    conflicts+=("$old != $new")
+    errors=$((errors+1))
+    echo "ERROR: $old and $new differ; refusing to remove $old" >&2
+    return 0
+  fi
+  if [ "$DRY_RUN" = 1 ]; then
+    echo "CLEANUP_PLAN: rm $old"
+    planned=$((planned+1))
+    return 0
+  fi
+  rm -f "$old"
+  removed=$((removed+1))
+  echo "REMOVED: $old"
 }
 
+# ---------------------------------------------------------------------------
+# Enumeration
+# ---------------------------------------------------------------------------
+
+# Pre-scan: flag when both the flat path (providers/foo.secrets.json)
+# and the nested path (providers/default/foo.secrets.json) would map to
+# the same destination. Without this, copy_secret's per-pair cmp -s
+# would surface only one of the two as an error, leaving the operator
+# guessing which source was the "real" one.
+detect_default_dst_collisions() {
+  [ -d "$DATA/providers/default" ] || return 0
+  for f in "$DATA"/providers/*.secrets.json; do
+    [ -f "$f" ] || continue
+    local base nested
+    base="$(basename "$f")"
+    nested="$DATA/providers/default/$base"
+    if [ -f "$nested" ]; then
+      conflicts+=("dst collision: $f and $nested both map to $DATA/default/providers/$base")
+      errors=$((errors+1))
+      echo "ERROR: $f and $nested both target $DATA/default/providers/$base; manual reconcile required" >&2
+    fi
+  done
+}
+
+process_secret() {
+  local src="$1" dst="$2"
+  if [ "$CLEANUP_OLD" = 1 ]; then
+    remove_old_secret "$src" "$dst"
+  else
+    copy_secret "$src" "$dst"
+  fi
+}
+
+detect_default_dst_collisions
+
+# Default org: top-level $DATA/providers/*.secrets.json → $DATA/default/providers/
+if [ -d "$DATA/providers" ]; then
+  for f in "$DATA"/providers/*.secrets.json; do
+    [ -f "$f" ] || continue
+    process_secret "$f" "$DATA/default/providers/$(basename "$f")"
+  done
+fi
+
+# Non-default orgs: $DATA/providers/<org>/*.secrets.json → $DATA/<org>/providers/
+if [ -d "$DATA/providers" ]; then
+  for d in "$DATA"/providers/*/; do
+    [ -d "$d" ] || continue
+    org="$(basename "$d")"
+    case "$org" in
+      .*) continue ;;
+    esac
+    # Validate against ORG_SLUG_REGEX (keep in sync with
+    # services/platform/lib/shared/constants/org-slug.ts). No length
+    # cap here — the canonical validator imposes none, and silently
+    # dropping long-but-valid slugs would lose their secrets on
+    # --cleanup-old. Anything that fails the shape is recorded as an
+    # error + conflict so the summary surfaces it (legacy slugs from a
+    # prior, more-permissive regime get an actionable diagnostic
+    # rather than disappearing).
+    if ! [[ "$org" =~ ^[a-z0-9][a-z0-9_-]*$ ]]; then
+      conflicts+=("invalid org slug under providers/: $org")
+      errors=$((errors+1))
+      echo "ERROR: providers/$org/ has invalid slug shape; manual reconcile required" >&2
+      continue
+    fi
+    for f in "$d"*.secrets.json; do
+      [ -f "$f" ] || continue
+      process_secret "$f" "$DATA/$org/providers/$(basename "$f")"
+    done
+  done
+fi
+
+# ---------------------------------------------------------------------------
+# Summary
+# ---------------------------------------------------------------------------
+echo
+if [ "$CLEANUP_OLD" = 1 ]; then
+  if [ "$DRY_RUN" = 1 ]; then
+    echo "MIGRATE_SUMMARY: planned=$planned removed=0 errors=$errors (cleanup-old --dry-run)"
+  else
+    echo "MIGRATE_SUMMARY: removed=$removed errors=$errors (cleanup-old)"
+  fi
+else
+  if [ "$DRY_RUN" = 1 ]; then
+    echo "MIGRATE_SUMMARY: planned=$planned copied=0 skipped=$skipped errors=$errors (--dry-run)"
+  else
+    echo "MIGRATE_SUMMARY: copied=$copied skipped=$skipped errors=$errors"
+  fi
+  if [ "$copied" -gt 0 ] || [ "$planned" -gt 0 ]; then
+    echo "Next: run 'tale deploy --override-all -y' to recreate convex with the new code and seed non-default orgs."
+  fi
+fi
+if [ "\${#conflicts[@]}" -gt 0 ]; then
+  echo
+  echo "Unresolved conflicts (require manual reconciliation):"
+  for c in "\${conflicts[@]}"; do
+    echo "  - $c"
+  done
+fi
+[ "$errors" -eq 0 ] || exit 1
+`;
+
 export async function migrateConfigLayout(
   options: MigrateConfigLayoutOptions,
 ): Promise<void> {
@@ -69,8 +281,6 @@ export async function migrateConfigLayout(
     );
   }
 
-  const script = await loadScript();
-
   const scriptArgs: string[] = [];
   if (dryRun) scriptArgs.push('--dry-run');
   if (cleanupOld) scriptArgs.push('--cleanup-old');
@@ -95,7 +305,7 @@ export async function migrateConfigLayout(
   const result = await exec(
     'docker',
     ['exec', '-i', containerName, 'bash', '-s', '--', ...scriptArgs],
-    { stdin: script },
+    { stdin: MIGRATE_SCRIPT },
   );
 
   if (result.stdout) logger.info(result.stdout);
diff --git a/tools/cli/src/lib/migrate-config-layout/script.sh b/tools/cli/src/lib/migrate-config-layout/script.sh
deleted file mode 100644
index da030cf639..0000000000
--- a/tools/cli/src/lib/migrate-config-layout/script.sh
+++ /dev/null
@@ -1,210 +0,0 @@
-#!/bin/bash
-# Migrate providers/*.secrets.json from old per-domain layout to new
-# org-first layout. Idempotent. Uses cp (not mv) so old paths remain
-# readable until the operator runs `tale migrate config-layout --cleanup-old`.
-#
-# Old → new mapping:
-#   $DATA/providers/<name>.secrets.json
-#     → $DATA/default/providers/<name>.secrets.json
-#   $DATA/providers/<org>/<name>.secrets.json
-#     → $DATA/<org>/providers/<name>.secrets.json
-#
-# Scope: providers/*.secrets.json ONLY. Non-secret config is reseeded by
-# `tale deploy --override-all` against the builtin catalog; non-provider
-# .history/ trails under old paths are intentionally abandoned (the user's
-# "secrets only" runbook trade-off).
-#
-# Designed to run against the CURRENTLY-running convex container (old
-# image, old code paths still active). cp leaves old paths in place so
-# old code keeps reading providers correctly until the operator runs
-# `tale deploy --override-all -y` to recreate convex with the new code.
-set -euo pipefail
-
-DRY_RUN=0
-CLEANUP_OLD=0
-for arg in "$@"; do
-  case "$arg" in
-    --dry-run) DRY_RUN=1 ;;
-    --cleanup-old) CLEANUP_OLD=1 ;;
-    *) echo "Unknown arg: $arg" >&2; exit 2 ;;
-  esac
-done
-
-# Defense in depth: `set -u` already aborts on unset $DATA, but ${VAR:?…}
-# gives a clearer error message and won't be defeated by a future `set
-# +u` somewhere downstream. Critical because some branches below build
-# absolute paths from $DATA and rm them — a silent empty would operate
-# from the container's filesystem root.
-DATA="${TALE_CONFIG_DIR:-/app/data}"
-: "${DATA:?DATA must be a non-empty absolute path}"
-APP_UID=1001
-APP_GID=1001
-
-planned=0
-copied=0
-skipped=0
-removed=0
-errors=0
-conflicts=()
-
-# Move a single .secrets.json from old to new path. cp -a preserves mode +
-# ownership (encrypted secrets are 0600 owner:app). Idempotent: if the
-# destination already exists, verify byte-for-byte equality (then skip)
-# rather than overwriting — protects a concurrent UI-side `atomicWriteSecret`
-# that landed at the new path between this script's check and its copy.
-copy_secret() {
-  local src="$1" dst="$2"
-  local dst_dir; dst_dir="$(dirname "$dst")"
-  if [ -e "$dst" ]; then
-    if cmp -s "$src" "$dst" 2>/dev/null; then
-      # SKIP belongs to stdout (informational, expected on re-run);
-      # only true ERROR lines go to stderr so the CLI wrapper can
-      # distinguish noise from real failures.
-      skipped=$((skipped+1)); echo "SKIP (already migrated): $src"
-      return 0
-    else
-      conflicts+=("$src != $dst")
-      errors=$((errors+1))
-      echo "ERROR: $dst exists but differs from $src; refusing to overwrite" >&2
-      return 0
-    fi
-  fi
-  if [ "$DRY_RUN" = 1 ]; then
-    echo "MIGRATE_PLAN: mkdir -p $dst_dir && cp -a $src $dst"
-    planned=$((planned+1))
-    return 0
-  fi
-  mkdir -p "$dst_dir"
-  chown "$APP_UID:$APP_GID" "$dst_dir" 2>/dev/null || true
-  cp -a "$src" "$dst"
-  copied=$((copied+1))
-  echo "OK: $src -> $dst"
-}
-
-# Remove an old-path secret IF the new-path copy exists and matches
-# byte-for-byte. Refuses any mismatch — operator must reconcile manually.
-remove_old_secret() {
-  local old="$1" new="$2"
-  if [ ! -e "$old" ]; then return 0; fi
-  if [ ! -e "$new" ]; then
-    conflicts+=("missing new-path counterpart for $old (expected $new)")
-    errors=$((errors+1))
-    echo "ERROR: $new does not exist; refusing to remove $old" >&2
-    return 0
-  fi
-  if ! cmp -s "$old" "$new" 2>/dev/null; then
-    conflicts+=("$old != $new")
-    errors=$((errors+1))
-    echo "ERROR: $old and $new differ; refusing to remove $old" >&2
-    return 0
-  fi
-  if [ "$DRY_RUN" = 1 ]; then
-    echo "CLEANUP_PLAN: rm $old"
-    planned=$((planned+1))
-    return 0
-  fi
-  rm -f "$old"
-  removed=$((removed+1))
-  echo "REMOVED: $old"
-}
-
-# ---------------------------------------------------------------------------
-# Enumeration
-# ---------------------------------------------------------------------------
-
-# Pre-scan: flag when both the flat path (providers/foo.secrets.json)
-# and the nested path (providers/default/foo.secrets.json) would map to
-# the same destination. Without this, copy_secret's per-pair cmp -s
-# would surface only one of the two as an error, leaving the operator
-# guessing which source was the "real" one.
-detect_default_dst_collisions() {
-  [ -d "$DATA/providers/default" ] || return 0
-  for f in "$DATA"/providers/*.secrets.json; do
-    [ -f "$f" ] || continue
-    local base nested
-    base="$(basename "$f")"
-    nested="$DATA/providers/default/$base"
-    if [ -f "$nested" ]; then
-      conflicts+=("dst collision: $f and $nested both map to $DATA/default/providers/$base")
-      errors=$((errors+1))
-      echo "ERROR: $f and $nested both target $DATA/default/providers/$base; manual reconcile required" >&2
-    fi
-  done
-}
-
-process_secret() {
-  local src="$1" dst="$2"
-  if [ "$CLEANUP_OLD" = 1 ]; then
-    remove_old_secret "$src" "$dst"
-  else
-    copy_secret "$src" "$dst"
-  fi
-}
-
-detect_default_dst_collisions
-
-# Default org: top-level $DATA/providers/*.secrets.json → $DATA/default/providers/
-if [ -d "$DATA/providers" ]; then
-  for f in "$DATA"/providers/*.secrets.json; do
-    [ -f "$f" ] || continue
-    process_secret "$f" "$DATA/default/providers/$(basename "$f")"
-  done
-fi
-
-# Non-default orgs: $DATA/providers/<org>/*.secrets.json → $DATA/<org>/providers/
-if [ -d "$DATA/providers" ]; then
-  for d in "$DATA"/providers/*/; do
-    [ -d "$d" ] || continue
-    org="$(basename "$d")"
-    case "$org" in
-      .*) continue ;;
-    esac
-    # Validate against ORG_SLUG_REGEX (keep in sync with
-    # services/platform/lib/shared/constants/org-slug.ts). No length
-    # cap here — the canonical validator imposes none, and silently
-    # dropping long-but-valid slugs would lose their secrets on
-    # --cleanup-old. Anything that fails the shape is recorded as an
-    # error + conflict so the summary surfaces it (legacy slugs from a
-    # prior, more-permissive regime get an actionable diagnostic
-    # rather than disappearing).
-    if ! [[ "$org" =~ ^[a-z0-9][a-z0-9_-]*$ ]]; then
-      conflicts+=("invalid org slug under providers/: $org")
-      errors=$((errors+1))
-      echo "ERROR: providers/$org/ has invalid slug shape; manual reconcile required" >&2
-      continue
-    fi
-    for f in "$d"*.secrets.json; do
-      [ -f "$f" ] || continue
-      process_secret "$f" "$DATA/$org/providers/$(basename "$f")"
-    done
-  done
-fi
-
-# ---------------------------------------------------------------------------
-# Summary
-# ---------------------------------------------------------------------------
-echo
-if [ "$CLEANUP_OLD" = 1 ]; then
-  if [ "$DRY_RUN" = 1 ]; then
-    echo "MIGRATE_SUMMARY: planned=$planned removed=0 errors=$errors (cleanup-old --dry-run)"
-  else
-    echo "MIGRATE_SUMMARY: removed=$removed errors=$errors (cleanup-old)"
-  fi
-else
-  if [ "$DRY_RUN" = 1 ]; then
-    echo "MIGRATE_SUMMARY: planned=$planned copied=0 skipped=$skipped errors=$errors (--dry-run)"
-  else
-    echo "MIGRATE_SUMMARY: copied=$copied skipped=$skipped errors=$errors"
-  fi
-  if [ "$copied" -gt 0 ] || [ "$planned" -gt 0 ]; then
-    echo "Next: run 'tale deploy --override-all -y' to recreate convex with the new code and seed non-default orgs."
-  fi
-fi
-if [ "${#conflicts[@]}" -gt 0 ]; then
-  echo
-  echo "Unresolved conflicts (require manual reconciliation):"
-  for c in "${conflicts[@]}"; do
-    echo "  - $c"
-  done
-fi
-[ "$errors" -eq 0 ] || exit 1

From 4ac82a33f069dc0fc7c6219873df0c9d6bc08e1c Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Thu, 28 May 2026 21:07:03 +0800
Subject: [PATCH 09/41] fix(cli): stop admin-key leaking via reseed stdout +
 drop dead JSON parser
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes P0-4 and P1-35 from the multi-agent review.

P0-4 — admin-key leak. RESEED_SCRIPT did `source /app/generate-admin-
key.sh`, which doesn't just define helpers — it `echo`s a dashboard
banner including `   Admin Key:      <key>` (3-space indent, capital
"K"). The line-based grep filter was anchored on `^Admin key` so it
mis-matched both the case AND the leading whitespace, and the key
landed in `result.stdout` which is then `logger.info`-ed on both the
failure path (line 179) and the success-no-parse path (line 230).
Reseed runs on `tale deploy --override-all`, which is exactly what
operators execute in CI — the leak path was through CI logs.

Fix is two-layered:
- Structural: drop the `source generate-admin-key.sh` line. The
  function `ensure_instance_secret` and the binary `generate_key` are
  already available from sourcing `env.sh`, and the inline `ADMIN_KEY=
  $(generate_key …)` re-derives the key without firing the banner.
  Mirrors scripts/2026-03-28-migrate-convex-data.sh:120-131.
- Defense in depth: new `redactAdminKey(text)` regex-strips any
  `[Aa]dmin [Kk]ey: <12+ chars>` pattern from stdout/stderr before
  it reaches the logger. Catches any future upstream banner (env.sh
  diagnostic mode, Convex CLI change, etc.) without breaking the
  primary fix. Applied at both surviving log sites.

P1-35 — dead JSON parser on failure path. The convex action
`reseed_all_orgs.ts:147-167` only `console.log`s human-readable strings
then throws; `bunx convex run` does not emit the action's return value
on the throw path. The CLI's `parseTrailingJson` failure-branch
(lines 196-212) therefore always returned null and the special-case
error message never rendered. Removed the dead branch; the generic
"reseed action raised" path already surfaces per-org detail via the
stdout log just above (now redacted).

Verification:
- 7 new unit tests cover the redactor (case, indent, missing colon,
  multiple occurrences, no false positives on short tokens).
- `bun run check`: 36/36 tasks green.
---
 .../src/lib/actions/reseed-all-orgs.test.ts   | 53 +++++++++++++++++
 tools/cli/src/lib/actions/reseed-all-orgs.ts  | 59 +++++++++++--------
 2 files changed, 88 insertions(+), 24 deletions(-)
 create mode 100644 tools/cli/src/lib/actions/reseed-all-orgs.test.ts

diff --git a/tools/cli/src/lib/actions/reseed-all-orgs.test.ts b/tools/cli/src/lib/actions/reseed-all-orgs.test.ts
new file mode 100644
index 0000000000..6b57fdeb53
--- /dev/null
+++ b/tools/cli/src/lib/actions/reseed-all-orgs.test.ts
@@ -0,0 +1,53 @@
+import { describe, expect, test } from 'bun:test';
+
+import { redactAdminKey } from './reseed-all-orgs';
+
+describe('redactAdminKey', () => {
+  test('redacts the dashboard banner line', () => {
+    const input = '   Admin Key:      sk_abcdef1234567890+/=';
+    expect(redactAdminKey(input)).toBe('   Admin Key: <redacted>');
+  });
+
+  test('matches case-insensitively', () => {
+    const lower = 'admin key: 1234567890abcdef';
+    expect(redactAdminKey(lower)).toBe('admin key: <redacted>');
+  });
+
+  test('redacts even without leading whitespace', () => {
+    const input = 'Admin Key: AKIAIOSFODNN7EXAMPLEKEY';
+    expect(redactAdminKey(input)).toBe('Admin Key: <redacted>');
+  });
+
+  test('redacts when colon is missing', () => {
+    const input = 'Admin Key     AKIAIOSFODNN7EXAMPLEKEY';
+    expect(redactAdminKey(input)).toBe('Admin Key: <redacted>');
+  });
+
+  test('redacts multiple occurrences in the same stream', () => {
+    const input = [
+      'Setting up...',
+      '   Admin Key:      sk_first_key_1234567890',
+      'Done. Admin Key: sk_second_key_abcdef1234',
+    ].join('\n');
+    const out = redactAdminKey(input);
+    expect(out).not.toContain('sk_first_key');
+    expect(out).not.toContain('sk_second_key');
+    expect(out.match(/<redacted>/g)?.length).toBe(2);
+  });
+
+  test('leaves non-admin-key text alone', () => {
+    const input = [
+      'Reseeded 5/5 orgs from builtin catalog.',
+      'Per-org status:',
+      '  - default: ok',
+    ].join('\n');
+    expect(redactAdminKey(input)).toBe(input);
+  });
+
+  test('does not redact short tokens (avoids false positives)', () => {
+    // The 12-char minimum stops common patterns like `Admin key: ok` or
+    // `Admin Key: TBD` from being scrubbed and looking suspicious.
+    const input = 'Admin Key: TBD';
+    expect(redactAdminKey(input)).toBe('Admin Key: TBD');
+  });
+});
diff --git a/tools/cli/src/lib/actions/reseed-all-orgs.ts b/tools/cli/src/lib/actions/reseed-all-orgs.ts
index b11dfc1fa4..37805b7bc0 100644
--- a/tools/cli/src/lib/actions/reseed-all-orgs.ts
+++ b/tools/cli/src/lib/actions/reseed-all-orgs.ts
@@ -55,10 +55,20 @@ const RESEED_TIMEOUT_EXIT = 124;
 // outcome (grep exits 1) does not poison `set -o pipefail`. The real
 // signal is `bunx convex run`'s exit code, captured before the grep
 // strips banner lines.
+//
+// `generate-admin-key.sh` is intentionally NOT sourced here even though
+// it provides a complete admin-key derivation routine — the script also
+// echoes a dashboard banner including a `   Admin Key:      <key>` line
+// that would land in our captured stdout. Sourcing once leaked the key
+// past the line-based grep filter (the grep anchored on `^Admin key`
+// which mis-matched the lower-case 'k' AND the 3-space indent). Re-
+// derive ADMIN_KEY inline from env.sh's helpers (`ensure_instance_secret`
+// is exported by env.sh; `generate_key` is the binary on $PATH that the
+// official Convex Docker image uses) — see scripts/2026-03-28-migrate-
+// convex-data.sh:120-131 for the exact same pattern.
 const RESEED_SCRIPT = `set -eo pipefail
 source /app/env.sh
 env_normalize_common
-source /app/generate-admin-key.sh
 ensure_instance_secret
 ADMIN_KEY=$(generate_key "$INSTANCE_NAME" "$INSTANCE_SECRET")
 cd /app
@@ -70,6 +80,20 @@ HOME=/home/app timeout ${RESEED_TIMEOUT_S} bunx convex run \\
   | { grep -v "^Admin key\\|^📋\\|^✅ Admin\\|^━\\|^🌐\\|^$\\|Steps:\\|Open\\|Enter\\|Paste" || true; }
 `;
 
+/**
+ * Defense-in-depth redactor for the captured `bunx convex run` stdout.
+ * If anything upstream (env.sh's diagnostic mode, a future Convex CLI
+ * banner, etc.) prints an admin-key line that slips past the bash grep,
+ * this regex strips it before the value reaches the logger. Case-
+ * insensitive, anchors on any leading whitespace, and is intentionally
+ * conservative on the value charset (admin keys are base64/hex-like).
+ */
+const ADMIN_KEY_RE = /\b([Aa]dmin\s+[Kk]ey)\s*:?\s*[A-Za-z0-9+/=._-]{12,}/g;
+
+export function redactAdminKey(text: string): string {
+  return text.replace(ADMIN_KEY_RE, '$1: <redacted>');
+}
+
 const CONFIRM_MESSAGE =
   '--override-all will factory-reset every registered org from the builtin catalog. ' +
   '*.secrets.json files, .history/ trails, and uploaded branding/images/ are preserved; ' +
@@ -176,10 +200,10 @@ export async function reseedAllOrgsFromBuiltin(
   // exit code, which becomes `result.success === false` here.
   if (!result.success) {
     if (result.stdout) {
-      logger.info(result.stdout.trim());
+      logger.info(redactAdminKey(result.stdout.trim()));
     }
     if (result.stderr) {
-      logger.error(result.stderr.trim());
+      logger.error(redactAdminKey(result.stderr.trim()));
     }
 
     // Special-case `timeout(1)`'s SIGTERM exit so the operator sees
@@ -193,24 +217,11 @@ export async function reseedAllOrgsFromBuiltin(
       );
     }
 
-    // Parse the trailing JSON payload on the failure branch too — the
-    // action emits it before throwing so per-org slug detail survives
-    // the non-zero exit and reaches CI logs as structured data.
-    const failed = parseTrailingJson(result.stdout);
-    if (failed) {
-      const failedSlugs = failed.results
-        .filter(
-          (r): r is { slug: string; status: 'error'; error: string } =>
-            r.status === 'error',
-        )
-        .map((r) => `${r.slug}: ${r.error.split('\n')[0]}`)
-        .join('; ');
-      throw new Error(
-        `--override-all failed: ${failed.failed}/${failed.total} orgs raised — ${failedSlugs}. ` +
-          `Re-run after addressing the listed orgs (the action is idempotent).`,
-      );
-    }
-
+    // The convex-side action `console.log`s a human-readable failure
+    // summary then `throw`s — `bunx convex run` does NOT emit the
+    // action's return value on the throw path, so any attempt to parse
+    // structured failure detail here is dead code. The stdout logged
+    // above already surfaces the per-slug detail to the operator / CI.
     throw new Error(
       `--override-all failed: reseed action raised in ${container}. ` +
         `Per-org detail above; partial state on disk — re-run --override-all ` +
@@ -225,9 +236,9 @@ export async function reseedAllOrgsFromBuiltin(
       `Reseeded ${parsed.succeeded}/${parsed.total} orgs from builtin catalog.`,
     );
   } else if (result.stdout) {
-    // Couldn't parse — surface raw stdout so the operator isn't flying
-    // blind. Should be rare given the grep strip above.
-    logger.info(result.stdout.trim());
+    // Couldn't parse — surface raw stdout (redacted) so the operator
+    // isn't flying blind. Should be rare given the grep strip above.
+    logger.info(redactAdminKey(result.stdout.trim()));
   }
 
   logger.success('Reseed complete.');

From 2fb41df66e37c71b824b9d53ee7d7ef46d65950e Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Thu, 28 May 2026 21:15:53 +0800
Subject: [PATCH 10/41] fix(platform): auth + per-org filter on /events/file
 SSE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes P0-2. The watcher fan-out at server.ts:43-57 used to push every
config-change event to every connected client with zero auth and zero
per-org filter; client-side filters in `app/hooks/use-file-events.ts`
only suppressed the cache invalidation, not the wire payload. When
`TALE_FILE_EVENTS=true` (operator opt-in), an unauthenticated peer or
a cross-org member got a real-time inventory of every org's config
items (agent slugs, workflow names, integration ids, etc.).

Fix is two layers:

- New convex httpAction `/api/sse/auth` validates the Better Auth
  session cookie via the same `auth.api.getSession({ headers })`
  pattern as `/api/tts-audio`, then walks the `member` table to
  resolve the user's org slugs. Returns `{userId, orgSlugs}` on
  success or 401 with `Vary: Cookie` on missing/invalid session.

- The Bun `/events/file` handler forwards the request's Cookie to
  this endpoint (using `CONVEX_SITE_PROXY_URL` if set, else deriving
  `:3211` from `CONVEX_URL`'s `:3210`). On 401 the SSE connection is
  closed with 401 + `WWW-Authenticate: Cookie`. On success the
  resolved org-slug set is stored alongside the controller; the
  watcher's fan-out drops events whose `orgSlug` falls outside the
  client's allowed set BEFORE the payload reaches the wire.

The existing client-side filter in use-file-events.ts is now
redundant defense in depth — kept since wire-side filter requires
the server to know the active org, which it does, but layering
makes a future "send a global event to everyone" code change safer.

Verified:
- 3 new server.test.ts cases: no-cookie → 401, convex-rejects → 401,
  valid session + mocked convex response → 200 text/event-stream
  with Vary: Cookie. The pre-existing FILE_EVENTS_ENABLED=false 404
  case still passes.
- `bun run check`: 36/36 tasks green (70944 platform tests).
---
 services/platform/convex/http.ts |  74 ++++++++++++++++++++-
 services/platform/server.test.ts |  42 +++++++++++-
 services/platform/server.ts      | 110 ++++++++++++++++++++++++++++---
 3 files changed, 213 insertions(+), 13 deletions(-)

diff --git a/services/platform/convex/http.ts b/services/platform/convex/http.ts
index 0203163a92..fcc7af544f 100644
--- a/services/platform/convex/http.ts
+++ b/services/platform/convex/http.ts
@@ -1,6 +1,7 @@
 import { httpRouter } from 'convex/server';
 
-import { internal } from './_generated/api';
+import { getString, isRecord } from '../lib/utils/type-guards';
+import { components, internal } from './_generated/api';
 import { httpAction } from './_generated/server';
 import {
   listAgents as listAgentsRest,
@@ -301,6 +302,77 @@ http.route({
   }),
 });
 
+/**
+ * Resolve which org slugs a session-authenticated user is allowed to see
+ * events for. Consumed by the Bun-side `/events/file` SSE handler so the
+ * fan-out can drop events whose `orgSlug` is not in the caller's
+ * membership set — before any wire payload reaches the client.
+ *
+ * Returns `{ userId, orgSlugs }` on success or 401 on missing/invalid
+ * session. The 401 carries `Vary: Cookie` so a TLS-terminating proxy
+ * can't cache the response against the URL and starve a freshly-logged-
+ * in user.
+ */
+http.route({
+  path: '/api/sse/auth',
+  method: 'GET',
+  handler: httpAction(async (ctx, req) => {
+    const auth = createAuth(ctx);
+    const session = await auth.api.getSession({ headers: req.headers });
+    if (!session?.user) {
+      return new Response('Unauthenticated', {
+        status: 401,
+        headers: {
+          'Cache-Control': 'no-store',
+          Vary: 'Cookie',
+          'WWW-Authenticate': 'Cookie',
+        },
+      });
+    }
+
+    const memberships = await ctx.runQuery(
+      components.betterAuth.adapter.findMany,
+      {
+        model: 'member',
+        // Cap matches the platform's hard limit on per-user org membership.
+        paginationOpts: { cursor: null, numItems: 256 },
+        where: [{ field: 'userId', value: session.user.id, operator: 'eq' }],
+      },
+    );
+
+    const memberRows: unknown[] = Array.isArray(memberships?.page)
+      ? memberships.page
+      : [];
+    const orgIds: string[] = memberRows
+      .map((row) =>
+        isRecord(row) ? getString(row, 'organizationId') : undefined,
+      )
+      .filter((s): s is string => typeof s === 'string' && s.length > 0);
+
+    const slugs: string[] = [];
+    for (const orgId of orgIds) {
+      const orgRow = await ctx.runQuery(components.betterAuth.adapter.findOne, {
+        model: 'organization',
+        where: [{ field: '_id', value: orgId, operator: 'eq' }],
+      });
+      const slug = isRecord(orgRow) ? getString(orgRow, 'slug') : undefined;
+      if (typeof slug === 'string' && slug.length > 0) slugs.push(slug);
+    }
+
+    return new Response(
+      JSON.stringify({ userId: session.user.id, orgSlugs: slugs }),
+      {
+        status: 200,
+        headers: {
+          'Content-Type': 'application/json',
+          'Cache-Control': 'no-store',
+          Vary: 'Cookie',
+        },
+      },
+    );
+  }),
+});
+
 authComponent.registerRoutes(http, createAuth);
 
 // Integration OAuth2 Callback
diff --git a/services/platform/server.test.ts b/services/platform/server.test.ts
index a6dd6e1463..0ae010f445 100644
--- a/services/platform/server.test.ts
+++ b/services/platform/server.test.ts
@@ -1,6 +1,6 @@
 import * as vm from 'node:vm';
 
-import { describe, expect, test } from 'vitest';
+import { describe, expect, test, vi } from 'vitest';
 
 import { wrapCanvasPreviewHtml } from './lib/canvas-preview-shell';
 import { createApp } from './server';
@@ -360,14 +360,52 @@ describe('GET /status.json', () => {
 });
 
 describe('SSE /events/file', () => {
-  test('preserves text/event-stream content type and no-cache', async () => {
+  test('returns 401 when no session cookie is present', async () => {
     const app = createApp(baseEnv);
+    // No cookie → convex auth is never even called; the handler short-
+    // circuits the early-null branch in resolveAllowedOrgSlugs.
     const res = await app.fetch(new Request('http://localhost/events/file'));
+    expect(res.status).toBe(401);
+    expect(res.headers.get('www-authenticate')).toBe('Cookie');
+    expect(res.headers.get('vary')).toBe('Cookie');
+  });
+
+  test('returns 401 when convex auth lookup rejects the session', async () => {
+    const fetchSpy = vi
+      .spyOn(globalThis, 'fetch')
+      .mockResolvedValueOnce(new Response('Unauthenticated', { status: 401 }));
+    const app = createApp(baseEnv);
+    const res = await app.fetch(
+      new Request('http://localhost/events/file', {
+        headers: { cookie: 'better-auth.session_token=invalid' },
+      }),
+    );
+    expect(res.status).toBe(401);
+    expect(fetchSpy).toHaveBeenCalledTimes(1);
+    fetchSpy.mockRestore();
+  });
+
+  test('streams text/event-stream when session resolves to org memberships', async () => {
+    const fetchSpy = vi.spyOn(globalThis, 'fetch').mockResolvedValueOnce(
+      new Response(JSON.stringify({ userId: 'u1', orgSlugs: ['acme'] }), {
+        status: 200,
+        headers: { 'content-type': 'application/json' },
+      }),
+    );
+    const app = createApp(baseEnv);
+    const res = await app.fetch(
+      new Request('http://localhost/events/file', {
+        headers: { cookie: 'better-auth.session_token=valid' },
+      }),
+    );
+    expect(res.status).toBe(200);
     expect(res.headers.get('content-type')).toBe('text/event-stream');
     expect(res.headers.get('cache-control')).toBe('no-cache');
+    expect(res.headers.get('vary')).toBe('Cookie');
     expect(res.body).toBeInstanceOf(ReadableStream);
     // Cancel to drop the SSE client and avoid leaking the controller.
     await res.body?.cancel();
+    fetchSpy.mockRestore();
   });
 
   test('returns 404 when FILE_EVENTS_ENABLED is false', async () => {
diff --git a/services/platform/server.ts b/services/platform/server.ts
index 56bda4d68e..f56f468e8e 100644
--- a/services/platform/server.ts
+++ b/services/platform/server.ts
@@ -33,7 +33,18 @@ const SHUTDOWN_MARKER = '/tmp/platform-shutting-down';
 // TanStack Query caches without a full page reload.
 // ---------------------------------------------------------------------------
 
-const sseClients = new Set<ReadableStreamDefaultController>();
+interface SseClient {
+  controller: ReadableStreamDefaultController;
+  // Org slugs the connected user is a member of. Watcher events whose
+  // `orgSlug` falls outside this set are dropped before the payload
+  // hits the wire — closes the cross-org metadata leak that
+  // unauthenticated / cross-org clients otherwise saw via the SSE
+  // stream. `null` means "platform-wide / org-agnostic event" (rare;
+  // currently only the `{type:"connected"}` ping).
+  allowedOrgSlugs: Set<string>;
+}
+
+const sseClients = new Set<SseClient>();
 
 const fileEventsEnabled = process.env.TALE_FILE_EVENTS === 'true';
 const configDir = process.env.TALE_CONFIG_DIR;
@@ -44,18 +55,75 @@ if (fileEventsEnabled && configDir && existsSync(configDir)) {
   const watcher = createConfigWatcher(configDir);
   watcher.onChange((event) => {
     const payload = `data: ${JSON.stringify(event)}\n\n`;
-    for (const controller of sseClients) {
+    // Per-event org filter. Every config-watcher event carries an
+    // `orgSlug` (see lib/config-watcher.ts: parseConfigChange always
+    // sets it for valid paths). If a future event type appears without
+    // a slug, default-deny — the legacy fan-out-to-everyone behavior is
+    // what this fix is closing.
+    const eventOrg =
+      typeof event === 'object' && event !== null && 'orgSlug' in event
+        ? (event as { orgSlug?: string }).orgSlug
+        : undefined;
+    for (const client of sseClients) {
+      if (eventOrg && !client.allowedOrgSlugs.has(eventOrg)) continue;
       try {
-        controller.enqueue(payload);
+        client.controller.enqueue(payload);
       } catch (err) {
         console.warn('SSE enqueue failed; dropping client', err);
-        sseClients.delete(controller);
+        sseClients.delete(client);
       }
     }
   });
   console.log(`Config file watcher active: ${configDir}`);
 }
 
+/**
+ * Resolve the org slugs the current session is allowed to receive
+ * events for by forwarding the request's Cookie header to Convex's
+ * `/api/sse/auth` httpAction. Returns null on missing/invalid session
+ * (the SSE handler then closes the connection with 401).
+ *
+ * `CONVEX_SITE_PROXY_URL` overrides the derived URL for dev — see
+ * vite.config.ts. In compose the convex HTTP-actions port is `:3211`
+ * on the same internal hostname as the WS API (`:3210` from
+ * CONVEX_URL).
+ */
+function convexHttpActionsBaseUrl(): string {
+  if (process.env.CONVEX_SITE_PROXY_URL) {
+    return process.env.CONVEX_SITE_PROXY_URL.replace(/\/$/, '');
+  }
+  const wsUrl = process.env.CONVEX_URL ?? 'http://convex:3210';
+  return wsUrl.replace(/:\d+$/, ':3211').replace(/\/$/, '');
+}
+
+async function resolveAllowedOrgSlugs(
+  cookieHeader: string | undefined,
+): Promise<Set<string> | null> {
+  if (!cookieHeader) return null;
+  try {
+    const res = await fetch(`${convexHttpActionsBaseUrl()}/api/sse/auth`, {
+      headers: { cookie: cookieHeader },
+    });
+    if (res.status === 401) return null;
+    if (!res.ok) {
+      console.warn(`[/events/file] convex auth lookup returned ${res.status}`);
+      return null;
+    }
+    const body: unknown = await res.json();
+    const slugs =
+      body && typeof body === 'object' && 'orgSlugs' in body
+        ? (body as { orgSlugs: unknown }).orgSlugs
+        : null;
+    if (!Array.isArray(slugs)) return new Set();
+    return new Set(
+      slugs.filter((s): s is string => typeof s === 'string' && s.length > 0),
+    );
+  } catch (err) {
+    console.warn('[/events/file] convex auth lookup failed', err);
+    return null;
+  }
+}
+
 // ---------------------------------------------------------------------------
 
 function escapeHtmlAttr(value: string) {
@@ -336,24 +404,46 @@ export function createApp(env: EnvConfig = getEnvConfig()): Hono {
     });
   });
 
-  app.get('/events/file', (c) => {
+  app.get('/events/file', async (c) => {
     if (!env.FILE_EVENTS_ENABLED) return c.notFound();
 
-    let ctrl: ReadableStreamDefaultController;
+    // Auth gate. SSE clients (EventSource) cannot set Authorization
+    // headers but DO send same-origin cookies, so we forward the
+    // request's Cookie to Convex's `/api/sse/auth` httpAction which
+    // validates the Better Auth session and returns the user's org
+    // memberships. Anonymous / cross-tenant fan-out used to leak
+    // every org's config-item names; per-client `allowedOrgSlugs`
+    // gates events at fan-out time so foreign-org payloads never
+    // reach the wire.
+    const cookieHeader = c.req.header('cookie');
+    const allowedOrgSlugs = await resolveAllowedOrgSlugs(cookieHeader);
+    if (allowedOrgSlugs === null) {
+      return new Response('Unauthenticated', {
+        status: 401,
+        headers: {
+          'Cache-Control': 'no-store',
+          Vary: 'Cookie',
+          'WWW-Authenticate': 'Cookie',
+        },
+      });
+    }
+
+    let client: SseClient;
     const stream = new ReadableStream({
       start(controller) {
-        ctrl = controller;
-        sseClients.add(ctrl);
-        ctrl.enqueue('data: {"type":"connected"}\n\n');
+        client = { controller, allowedOrgSlugs };
+        sseClients.add(client);
+        controller.enqueue('data: {"type":"connected"}\n\n');
       },
       cancel() {
-        sseClients.delete(ctrl);
+        sseClients.delete(client);
       },
     });
     return new Response(stream, {
       headers: {
         'Content-Type': 'text/event-stream',
         'Cache-Control': 'no-cache',
+        Vary: 'Cookie',
       },
     });
   });

From 6e7cc6841542d25bc7446a8aa8112ba3d8e13a86 Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Thu, 28 May 2026 21:28:02 +0800
Subject: [PATCH 11/41] fix(platform): close cross-tenant gaps on workflow +
 websites actions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes P1-1, P1-3, P1-4, P1-5 from the multi-agent review. Each is a
cross-tenant or content-injection hole the org-first refactor surfaced
or widened. None had auth tests; all four fixes are pattern-aligned to
the sibling code path that was already doing the right thing.

P1-1 — `rag_action.upload_document` skipped the org gate. Every other
op in `rag_action.ts` (delete/get_chunks/search) calls
`assertStorageIdsInOrg` first, but upload went straight to
`uploadDocument(ctx, fileId, ...)`. The helper derives orgSlug from
file-metadata, so an org-A workflow could force ingestion of an org-B
storage blob into org-B's RAG namespace — cost shift + content
injection under attacker-controlled fileName/contentType. One-line
gate added at the same spot as the sibling ops.

P1-5 — workflow `get_chunks` / `search` skipped
`stripReservedPromptTags`. The agent-tool path applies the SEC1
strip at rag_search_tool.ts:319/483; the workflow path passed
chunk content through untouched, letting an attacker-uploaded
`<system>…</system>` escape the surrounding workflow system prompt.
Strip is applied BEFORE the existing video-link `wrapUntrusted`
layer so both defenses compose correctly.

P1-4 — `websites/actions.ts` fetchPages/fetchChunks/searchContent
authenticated the user but never called `verifyOrganizationMembership`
on `website.organizationId`. Sibling actions (deleteWebsite,
updateWebsite, createWebsite) already do the check; the REST surface
(rest_api.ts:103-105) does too. Pre-existing on `main` but materially
widened by the org-first refactor: pre-branch the crawler was
effectively single-tenant per deployment so a leaked websiteId hit a
shared scope; on this branch the crawler is org-scoped via
website.organizationId resolved from the row, so cross-org calls now
return the *target org's* private pages/chunks/search hits to any
authenticated caller. Extracted a small `loadOwnedWebsite(ctx,
websiteId)` helper so the three handlers stay one-liners and the
auth pattern can't drift on the next addition. Errors uniformly
say "Website not found" to avoid existence disclosure across orgs.

P1-3 — workflow `retrieve` skipped the team-ACL gate that its
agent-tool sibling (retrieve_document.ts:42-58) enforces. The comment
at document_action.ts:278 even claimed parity: only the first half
(`findDocumentByFileId`) was implemented; `getAccessibleDocumentIds`
was missing. Same-org members of a different team could read foreign-
team documents. Same gap audited in `update` (line 219) and
`get_metadata` (line 540) — both also resolve by fileId via
`findDocumentByFileId` and were missing the team check.
`compare`/`extract_docx_structured`/`apply_docx_structured` operate
on storage IDs (not docs rows) so `verifyStorageIdsBelongToOrg`
covers them; they have no team field to ACL.

Extracted `assertDocumentAccessibleInWorkflow(ctx, organizationId,
userId, document, fileId)` helper in document_action.ts. When userId
is absent from `_variables` (system-triggered workflows), the team
gate degrades to org-only — consistent with the existing optional-
userId pattern at lines 424/651/746. For `get_metadata` the
accessibleIds list is loaded once and per-id filtering happens in
the map loop; team-private fields (sourceCreatedAt/Modified,
docMetadata) are dropped for inaccessible docs while fileName from
fileMetadata still surfaces so legitimate name-only workflows don't
break.

Verified: bun run check 36/36 tasks green (70944 platform tests).
---
 services/platform/convex/websites/actions.ts  | 61 +++++++-----
 .../action_defs/document/document_action.ts   | 94 +++++++++++++++++--
 .../action_defs/rag/rag_action.ts             | 28 +++++-
 3 files changed, 147 insertions(+), 36 deletions(-)

diff --git a/services/platform/convex/websites/actions.ts b/services/platform/convex/websites/actions.ts
index c4aac91d0d..9448dda6c1 100644
--- a/services/platform/convex/websites/actions.ts
+++ b/services/platform/convex/websites/actions.ts
@@ -2,6 +2,7 @@ import { v } from 'convex/values';
 
 import { internal } from '../_generated/api';
 import type { Id } from '../_generated/dataModel';
+import type { ActionCtx } from '../_generated/server';
 import { action } from '../_generated/server';
 import { authComponent } from '../auth';
 import { orgSlugFromId } from '../lib/helpers/org_slug';
@@ -16,6 +17,39 @@ import type {
   SearchContentResult,
 } from './types';
 
+/**
+ * Resolve a websiteId, verify the caller's org membership, return both.
+ * Centralises the auth pattern that every read-side action in this file
+ * needs (deleteWebsite / updateWebsite already do it inline; fetchPages /
+ * fetchChunks / searchContent used to skip it, returning the foreign
+ * org's private content to any authenticated caller — round-2 P1-4).
+ *
+ * Uses "Website not found" for both "no row" and "wrong org" so a cross-
+ * org caller can't probe website existence by status code.
+ */
+async function loadOwnedWebsite(ctx: ActionCtx, websiteId: Id<'websites'>) {
+  const authUser = await authComponent.getAuthUser(ctx);
+  if (!authUser) throw new Error('Unauthenticated');
+
+  const website = await ctx.runQuery(
+    internal.websites.internal_queries.getWebsite,
+    { websiteId },
+  );
+  if (!website) throw new Error('Website not found');
+
+  await ctx.runQuery(
+    internal.websites.internal_queries.verifyOrganizationMembership,
+    {
+      organizationId: website.organizationId,
+      userId: authUser._id,
+      email: authUser.email,
+      name: authUser.name,
+    },
+  );
+
+  return { website, authUser };
+}
+
 export const createWebsite = action({
   args: {
     organizationId: v.string(),
@@ -235,14 +269,7 @@ export const fetchPages = action({
     hasMore: v.boolean(),
   }),
   handler: async (ctx, args): Promise<FetchPagesResult> => {
-    const authUser = await authComponent.getAuthUser(ctx);
-    if (!authUser) throw new Error('Unauthenticated');
-
-    const website = await ctx.runQuery(
-      internal.websites.internal_queries.getWebsite,
-      { websiteId: args.websiteId },
-    );
-    if (!website) throw new Error('Website not found');
+    const { website } = await loadOwnedWebsite(ctx, args.websiteId);
 
     // Trigger async metadata sync from crawler
     await ctx.scheduler.runAfter(
@@ -273,14 +300,7 @@ export const fetchChunks = action({
     url: v.string(),
   },
   handler: async (ctx, args): Promise<FetchChunksResult> => {
-    const authUser = await authComponent.getAuthUser(ctx);
-    if (!authUser) throw new Error('Unauthenticated');
-
-    const website = await ctx.runQuery(
-      internal.websites.internal_queries.getWebsite,
-      { websiteId: args.websiteId },
-    );
-    if (!website) throw new Error('Website not found');
+    const { website } = await loadOwnedWebsite(ctx, args.websiteId);
 
     return await ctx.runAction(
       internal.websites.internal_actions.fetchPageChunks,
@@ -300,14 +320,7 @@ export const searchContent = action({
     limit: v.optional(v.number()),
   },
   handler: async (ctx, args): Promise<SearchContentResult> => {
-    const authUser = await authComponent.getAuthUser(ctx);
-    if (!authUser) throw new Error('Unauthenticated');
-
-    const website = await ctx.runQuery(
-      internal.websites.internal_queries.getWebsite,
-      { websiteId: args.websiteId },
-    );
-    if (!website) throw new Error('Website not found');
+    const { website } = await loadOwnedWebsite(ctx, args.websiteId);
 
     return await ctx.runAction(
       internal.websites.internal_actions.searchWebsiteContent,
diff --git a/services/platform/convex/workflow_engine/action_defs/document/document_action.ts b/services/platform/convex/workflow_engine/action_defs/document/document_action.ts
index 1b03bcc2ef..6abc07e777 100644
--- a/services/platform/convex/workflow_engine/action_defs/document/document_action.ts
+++ b/services/platform/convex/workflow_engine/action_defs/document/document_action.ts
@@ -48,6 +48,39 @@ async function resolveStorageUrl(
   return fileUrl;
 }
 
+/**
+ * Document-row access gate for workflow ops that resolve a `documents`
+ * row by `fileId`. Mirrors the agent-tool sibling
+ * `retrieve_document.ts:42-58`: same-org gate via `findDocumentByFileId`
+ * (already inline in callers) PLUS team-ACL gate via
+ * `getAccessibleDocumentIds` (was missing — same-org members of a
+ * different team could read foreign-team documents).
+ *
+ * When `userId` is absent from `_variables` (system-triggered workflows
+ * that don't impersonate a user), the team-ACL gate is skipped — the
+ * org-membership gate above already constrains scope and there's no
+ * member identity to scope further. Throws Error on access denied.
+ */
+async function assertDocumentAccessibleInWorkflow(
+  ctx: ActionCtx,
+  organizationId: string,
+  userId: string | undefined,
+  document: { _id: string },
+  fileId: string,
+): Promise<void> {
+  if (!userId) return;
+  const accessibleIds: string[] = await ctx.runQuery(
+    internal.documents.internal_queries.getAccessibleDocumentIds,
+    { organizationId, userId },
+  );
+  if (!accessibleIds.includes(document._id)) {
+    throw new Error(
+      `Access denied for document "${fileId}". ` +
+        "You may not have access to this document's team.",
+    );
+  }
+}
+
 async function resolveFileName(
   ctx: ActionCtx,
   fileId: string,
@@ -221,6 +254,8 @@ export const documentAction: ActionDefinition<DocumentActionParams> = {
           typeof _variables.organizationId === 'string'
             ? _variables.organizationId
             : undefined;
+        const userId =
+          typeof _variables.userId === 'string' ? _variables.userId : undefined;
 
         if (!organizationId) {
           throw new Error(
@@ -237,6 +272,14 @@ export const documentAction: ActionDefinition<DocumentActionParams> = {
           throw new Error(`Document not found for file ID "${params.fileId}"`);
         }
 
+        await assertDocumentAccessibleInWorkflow(
+          ctx,
+          organizationId,
+          userId,
+          document,
+          params.fileId,
+        );
+
         const documentId = document._id;
 
         await ctx.runMutation(
@@ -281,6 +324,8 @@ export const documentAction: ActionDefinition<DocumentActionParams> = {
           typeof _variables.organizationId === 'string'
             ? _variables.organizationId
             : undefined;
+        const userId =
+          typeof _variables.userId === 'string' ? _variables.userId : undefined;
         if (!organizationId) {
           throw new Error(
             'organizationId is required in workflow variables to retrieve a document',
@@ -295,6 +340,13 @@ export const documentAction: ActionDefinition<DocumentActionParams> = {
             `Document with file ID "${params.fileId}" not found in this organization`,
           );
         }
+        await assertDocumentAccessibleInWorkflow(
+          ctx,
+          organizationId,
+          userId,
+          ownsDocument,
+          params.fileId,
+        );
         const retrieveOrgSlug = await orgSlugFromId(ctx, organizationId);
         const result = await fetchDocumentContent(
           retrieveOrgSlug,
@@ -542,6 +594,21 @@ export const documentAction: ActionDefinition<DocumentActionParams> = {
           typeof _variables.organizationId === 'string'
             ? _variables.organizationId
             : undefined;
+        const userId =
+          typeof _variables.userId === 'string' ? _variables.userId : undefined;
+
+        // Team-ACL gate: load the caller's accessible documentIds once
+        // (cheaper than once per fileId) and filter in the per-id loop
+        // below. Only applies when both organizationId and userId are
+        // known — system-triggered workflows (no userId) get the
+        // org-membership gate only, consistent with the other doc ops.
+        const accessibleIds: string[] | null =
+          organizationId && userId
+            ? await ctx.runQuery(
+                internal.documents.internal_queries.getAccessibleDocumentIds,
+                { organizationId, userId },
+              )
+            : null;
 
         const results = await Promise.all(
           params.fileIds.map(async (fileId) => {
@@ -558,27 +625,38 @@ export const documentAction: ActionDefinition<DocumentActionParams> = {
                 : Promise.resolve(undefined),
             ]);
 
+            // Drop the docs-row if the caller doesn't have access to its
+            // team. fileMetadata + base name still surface so workflow
+            // steps that only need fileName don't break — but team-
+            // private fields (sourceCreatedAt/sourceModifiedAt/
+            // lastModified, docMetadata) are gated below.
+            const visibleDocument =
+              document &&
+              (accessibleIds === null || accessibleIds.includes(document._id))
+                ? document
+                : undefined;
+
             /* oxlint-disable typescript/no-unsafe-type-assertion -- metadata is a generic JSON record from Convex schema; runtime guard ensures it's an object before narrowing */
             const docMetadata =
-              document?.metadata != null &&
-              typeof document.metadata === 'object'
-                ? (document.metadata as DocumentMetadata)
+              visibleDocument?.metadata != null &&
+              typeof visibleDocument.metadata === 'object'
+                ? (visibleDocument.metadata as DocumentMetadata)
                 : undefined;
             /* oxlint-enable typescript/no-unsafe-type-assertion */
 
-            const lastModified = document
+            const lastModified = visibleDocument
               ? getDocumentEffectiveDate(
-                  document,
+                  visibleDocument,
                   docMetadata,
-                  document._creationTime,
+                  visibleDocument._creationTime,
                 )
               : undefined;
 
             return {
               fileId,
               fileName: fileMetadata?.fileName ?? 'Unknown',
-              sourceCreatedAt: document?.sourceCreatedAt,
-              sourceModifiedAt: document?.sourceModifiedAt,
+              sourceCreatedAt: visibleDocument?.sourceCreatedAt,
+              sourceModifiedAt: visibleDocument?.sourceModifiedAt,
               lastModified,
             };
           }),
diff --git a/services/platform/convex/workflow_engine/action_defs/rag/rag_action.ts b/services/platform/convex/workflow_engine/action_defs/rag/rag_action.ts
index 781c85f580..448f35e7bd 100644
--- a/services/platform/convex/workflow_engine/action_defs/rag/rag_action.ts
+++ b/services/platform/convex/workflow_engine/action_defs/rag/rag_action.ts
@@ -5,6 +5,7 @@ import { internal } from '../../../_generated/api';
 import type { ActionCtx } from '../../../_generated/server';
 import type { SearchResponse } from '../../../agent_tools/rag/format_search_results';
 import { fetchDocumentChunks } from '../../../agent_tools/rag/helpers/fetch_document_chunks';
+import { stripReservedPromptTags } from '../../../lib/agent_response/sanitize_prompt';
 import { UpstreamHttpError } from '../../../lib/errors/upstream_http_error';
 import { orgSlugFromId } from '../../../lib/helpers/org_slug';
 import { ragFetch } from '../../../lib/helpers/rag_config';
@@ -56,6 +57,12 @@ export const ragAction: ActionDefinition<RagActionParams> = {
 
     switch (migratedParams.operation) {
       case 'upload_document': {
+        // Cross-tenant gate: without this, org A's workflow can force
+        // ingestion of org B's storage blob (the helper would resolve
+        // org B's slug from file metadata and index into org B's RAG
+        // namespace — cost shift + content injection). Mirror the
+        // delete/get_chunks/search ops which all gate first.
+        await assertStorageIdsInOrg(ctx, _variables, [migratedParams.fileId]);
         const result = await uploadDocument(ctx, migratedParams.fileId, {
           sync: migratedParams.sync,
           fileName: migratedParams.fileName,
@@ -89,6 +96,15 @@ export const ragAction: ActionDefinition<RagActionParams> = {
           orgSlug,
           migratedParams.fileId,
         );
+        // SEC1: indexed-doc chunks may contain `<system>…</system>` or
+        // other reserved wrapper tags that would otherwise escape the
+        // workflow's downstream system prompt. Strip BEFORE any further
+        // wrapping (the video-link `wrapUntrusted` then layers on top).
+        // Mirrors `rag_search_tool.ts:319` (agent-tool retrieve path).
+        result.chunks = result.chunks.map((c) => ({
+          ...c,
+          content: stripReservedPromptTags(c.content),
+        }));
         // Prompt-injection defense: video-link-sourced chunks contain
         // attacker-controlled transcript text. Mirror the wrap that
         // `rag_search_tool.ts` applies on the agent-tool side.
@@ -146,10 +162,14 @@ export const ragAction: ActionDefinition<RagActionParams> = {
           }
 
           const result = await fetchJson<SearchResponse>(response);
-          // Prompt-injection defense: per-result content wrap for any
-          // file ids that map to a video-link source. Mirror the
-          // `rag_search_tool.ts` search-mode wrap.
-          let wrappedResults = result.results;
+          // SEC1: strip reserved wrapper tags from every search hit
+          // BEFORE further processing. Mirrors `rag_search_tool.ts:483`
+          // (agent-tool search path). The subsequent video-link
+          // `wrapUntrusted` layers on top of the stripped content.
+          let wrappedResults = result.results.map((r) => ({
+            ...r,
+            content: stripReservedPromptTags(r.content),
+          }));
           if (wrappedResults.length > 0) {
             const fileIds = wrappedResults
               .map((r) => r.file_id)

From 97917fdc04ebc080d5ed476b00e34bb56a314532 Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Thu, 28 May 2026 21:37:21 +0800
Subject: [PATCH 12/41] fix(platform): workflow + transcription correctness
 fixes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes P1-6, P1-7, P1-8, P1-9, P1-10, P1-11 from the multi-agent review.
Six related correctness bugs in the file_metadata / agent_response /
documents modules: retry classifier inverted in the wrong direction,
no single-flight on transcription, no source-state preconditions on
the user-facing retry/skip mutations, a stuck-transcription watchdog
that mis-kills freshly-retried runs, an abort-watcher leak on
guardrails-block early returns, and two documents-generation paths
that skipped the typed-error migration.

P1-7 — `extractFileMetadata` (`file_metadata/internal_actions.ts:206`).
The retry classifier read `isRetryable || !isUpstreamHttpError(error)`
which meant every non-UpstreamHttpError throw was treated as transient.
`orgSlugFromId` failures, malformed JSON, "Invalid response shape" —
all got rescheduled 3× before failing. Inverted: default permanent,
opt in only on `UpstreamHttpError && retryable`. Trade-off accepts
that a genuine network blip surfaces as permanent rather than self-
healing — the original deterministic-error retry storms were far more
damaging.

P1-8 / P1-9 / P1-10 — single-flight + state gates + watchdog key.
`transcribeAudio` had no atomic lock: two concurrent invocations on
the same storageId (retryTranscription double-click, scheduled retry
+ user-triggered retry) both proceeded. Double Whisper bill, double
`+=` on `recordTranscriptionUsage`, double RAG index.

  - New `acquireTranscriptionLock` mutation writes
    `transcriptionRunId`, `transcriptionLeaseExpiresAt`,
    `transcriptionStartedAt` atomically under the existing
    `by_storageId` index; returns the runId on win or null on loss.
  - `transcribeAudio` acquires the lock first; on loss it logs
    `transcription.deduplicated` and returns without compress /
    Whisper / ledger work.
  - Lock is released in the `finally` block via the new
    `releaseTranscriptionLock` mutation, which no-ops if a watchdog
    or another retry already claimed it.

`retryTranscription` and `skipTranscription` now precondition on
source state — retry only from `failed`/`skipped`, skip only from
`queued`/`running`. The pre-existing bug allowed Skip after
`completed` to clobber the transcript and cascade videoLinkJobs into
a failed state; Retry from `running` would have double-billed
Whisper (now blocked by the lock too, but the UI shouldn't surface
a Retry button for an in-flight row anyway). `skipTranscription`
now routes through `updateFileTranscription` so the videoLinkJobs
cascade (internal_mutations.ts:319-345) fires correctly for video-
link audio; the prior direct `db.patch` orphaned the linked job at
`transcribing_handoff`.

`recoverStuckTranscriptions` (the 5-min cron) keyed on
`row._creationTime` so a `retryTranscription` against an old
fileMetadata row could be killed within seconds of the next tick.
Now keys on `transcriptionStartedAt ?? _creationTime` with legacy-
row fallback, and clears the single-flight fields when breaking the
lock so a re-retry can acquire cleanly.

Schema: three new optional fields on `fileMetadata` —
`transcriptionRunId`, `transcriptionLeaseExpiresAt`,
`transcriptionStartedAt`.

P1-11 — abort watcher leak. `generate_response.ts:1166-1184` and
`:1254-1276` are guardrails-block early returns that skipped the
`abortWatcher?.stop()` call every other return path makes. Bounded
leak (the polling closure self-terminates within ~1.5s on
`streams.abort`), but it kept issuing redundant `check_cancelled`
Convex queries after the function returned. Mirror the canonical
stop call at line 580 (cancelledReturn) before each
`return buildBlockedReturn(...)`.

P1-6 — `documents/generate_document.ts` and `generate_docx.ts`
skipped the UpstreamHttpError migration. Crawler-side errors now
route through `UpstreamHttpError.fromResponse('crawler', ...)` so
the body snippet is sanitised, retryability is classified by
status, and the agent boundary sees the safe message instead of
raw upstream text. Storage-upload paths (Convex `_storage` via
`generateUploadUrl`) sit outside the `'rag' | 'crawler'` service
union; those throws are downgraded to a status-only Error after
scrubbing the response body via `sanitizeError`.

Verified: `bun run check`: 36/36 tasks green (70943 platform tests).
---
 .../convex/documents/generate_document.ts     | 27 ++++--
 .../convex/documents/generate_docx.ts         | 23 +++--
 .../convex/file_metadata/internal_actions.ts  | 21 ++---
 .../file_metadata/internal_mutations.ts       | 83 ++++++++++++++++++-
 .../convex/file_metadata/mutations.ts         | 47 ++++++++++-
 .../platform/convex/file_metadata/schema.ts   | 18 ++++
 .../convex/file_metadata/transcribe_audio.ts  | 37 ++++++++-
 .../lib/agent_response/generate_response.ts   |  9 ++
 8 files changed, 236 insertions(+), 29 deletions(-)

diff --git a/services/platform/convex/documents/generate_document.ts b/services/platform/convex/documents/generate_document.ts
index a26e637d30..2c4ffb5dd6 100644
--- a/services/platform/convex/documents/generate_document.ts
+++ b/services/platform/convex/documents/generate_document.ts
@@ -10,7 +10,9 @@ import { internal } from '../_generated/api';
 import type { Id } from '../_generated/dataModel';
 import type { ActionCtx } from '../_generated/server';
 import { createDebugLog } from '../lib/debug_log';
+import { UpstreamHttpError } from '../lib/errors/upstream_http_error';
 import { orgSlugFromId } from '../lib/helpers/org_slug';
+import { sanitizeError } from '../lib/utils/sanitize_secrets';
 import {
   buildDownloadUrl,
   buildRequestBody,
@@ -66,13 +68,15 @@ export async function generateDocument(
 
   if (!response.ok) {
     const errorText = await response.text().catch(() => '');
-    console.error('[documents.generateDocument] crawler error', {
-      status: response.status,
-      statusText: response.statusText,
+    // Route through the typed wrapper: sanitises the body snippet (no
+    // raw upstream payload in logs), classifies retryable vs permanent
+    // by status, and presents a safe message to the agent boundary
+    // without leaking the crawler's response details.
+    throw UpstreamHttpError.fromResponse(
+      'crawler',
+      response,
       errorText,
-    });
-    throw new Error(
-      `Crawler generateDocument failed: ${response.status} ${response.statusText}`,
+      endpointPath,
     );
   }
 
@@ -96,13 +100,20 @@ export async function generateDocument(
 
   if (!uploadResponse.ok) {
     const uploadErrorText = await uploadResponse.text().catch(() => '');
+    // Storage upload (Convex `_storage` via `generateUploadUrl`) is
+    // neither rag nor crawler — `UpstreamHttpError`'s service union
+    // doesn't cover it. Scrub via `sanitizeError` before logging so a
+    // signed URL or other secret in the response body can't leak to
+    // logs, then throw a generic Error with status only.
     console.error('[documents.generateDocument] upload error', {
       status: uploadResponse.status,
       statusText: uploadResponse.statusText,
-      uploadErrorText,
+      // The Convex storage response body should be terse JSON, but
+      // sanitize anyway as defense in depth.
+      errorText: sanitizeError(uploadErrorText, 400),
     });
     throw new Error(
-      `Failed to upload generated document: ${uploadResponse.status} ${uploadResponse.statusText}`,
+      `Failed to upload generated document: HTTP ${uploadResponse.status}`,
     );
   }
 
diff --git a/services/platform/convex/documents/generate_docx.ts b/services/platform/convex/documents/generate_docx.ts
index 32d4b5c96e..2f127e4928 100644
--- a/services/platform/convex/documents/generate_docx.ts
+++ b/services/platform/convex/documents/generate_docx.ts
@@ -12,7 +12,9 @@ import { internal } from '../_generated/api';
 import type { Id } from '../_generated/dataModel';
 import type { ActionCtx } from '../_generated/server';
 import { createDebugLog } from '../lib/debug_log';
+import { UpstreamHttpError } from '../lib/errors/upstream_http_error';
 import { orgSlugFromId } from '../lib/helpers/org_slug';
+import { sanitizeError } from '../lib/utils/sanitize_secrets';
 import { buildDownloadUrl, getCrawlerUrl } from './generate_document_helpers';
 
 const debugLog = createDebugLog('DEBUG_DOCUMENTS', '[Documents]');
@@ -85,11 +87,12 @@ export async function generateDocx(
 
   if (!response.ok) {
     const errorText = await response.text().catch(() => '');
-    console.error('[documents.generateDocx] crawler error', {
-      status: response.status,
+    throw UpstreamHttpError.fromResponse(
+      'crawler',
+      response,
       errorText,
-    });
-    throw new Error(`Crawler generateDocx failed: ${response.status}`);
+      '/api/v1/docx',
+    );
   }
 
   const result = await response.json();
@@ -112,7 +115,17 @@ export async function generateDocx(
   });
 
   if (!uploadResponse.ok) {
-    throw new Error(`Failed to upload DOCX: ${uploadResponse.status}`);
+    const uploadErrorText = await uploadResponse.text().catch(() => '');
+    // Storage upload (Convex `_storage`) is not in the UpstreamHttpError
+    // service union; scrub body via sanitizeError before logging so any
+    // signed URL or token in the response can't leak. Throw a status-
+    // only error to the caller.
+    console.error('[documents.generateDocx] upload error', {
+      status: uploadResponse.status,
+      statusText: uploadResponse.statusText,
+      errorText: sanitizeError(uploadErrorText, 400),
+    });
+    throw new Error(`Failed to upload DOCX: HTTP ${uploadResponse.status}`);
   }
 
   const { storageId } = await fetchJson<{ storageId: Id<'_storage'> }>(
diff --git a/services/platform/convex/file_metadata/internal_actions.ts b/services/platform/convex/file_metadata/internal_actions.ts
index 9c62488688..ac2db60a55 100644
--- a/services/platform/convex/file_metadata/internal_actions.ts
+++ b/services/platform/convex/file_metadata/internal_actions.ts
@@ -194,16 +194,17 @@ export const extractFileMetadata = internalAction({
         }
       } catch (error) {
         const message = error instanceof Error ? error.message : String(error);
-        // Classify the failure: only schedule retries when the upstream
-        // (crawler) reported a status the abstraction marks retryable
-        // (5xx / 408 / 429). 4xx classes (org-slug lookup failure,
-        // missing file, malformed payload) are permanent — retrying
-        // burns scheduler slots without progress.
-        const isRetryable = isUpstreamHttpError(error) && error.retryable;
-        // Non-UpstreamHttpError throws (e.g. network reset, blob fetch
-        // failure before we even hit the crawler) are also assumed
-        // transient — we have no signal otherwise.
-        const isTransient = isRetryable || !isUpstreamHttpError(error);
+        // Default to permanent. Only the upstream `UpstreamHttpError` path
+        // gives us a positive retry signal (`retryable` set from
+        // 5xx/408/429 classification). Everything else — `orgSlugFromId`
+        // throws, malformed JSON, "Invalid response shape", blob fetch
+        // failures before we even hit the crawler — is treated as
+        // non-transient: retrying 3× burns scheduler slots without
+        // progress. The trade-off (a genuine network blip surfaces as
+        // a permanent failure instead of self-healing) is acceptable
+        // because the original deterministic-error retry storms were
+        // far more damaging.
+        const isTransient = isUpstreamHttpError(error) && error.retryable;
         console.error(
           `[extractFileMetadata] Error for file ${args.storageId} (attempt ${attempt}, transient=${isTransient}): ${message}`,
         );
diff --git a/services/platform/convex/file_metadata/internal_mutations.ts b/services/platform/convex/file_metadata/internal_mutations.ts
index 42c8f250fd..46b94f1fc9 100644
--- a/services/platform/convex/file_metadata/internal_mutations.ts
+++ b/services/platform/convex/file_metadata/internal_mutations.ts
@@ -372,11 +372,89 @@ export const updateFileVisionMetadata = internalMutation({
   },
 });
 
+/**
+ * Atomic single-flight lock for the `transcribeAudio` action. Two
+ * concurrent invocations on the same storageId (e.g. a `retryTranscription`
+ * double-click) used to both proceed: double Whisper bill, double `+=`
+ * ledger write in `recordTranscriptionUsage`, double RAG index. Now the
+ * second caller sees an active lease and short-circuits.
+ *
+ * Returns the active `transcriptionRunId` (string) when this caller wins
+ * the race, or `null` when another invocation is in flight (caller MUST
+ * return without doing any work — no compress, no Whisper, no ledger).
+ *
+ * Stamps `transcriptionStartedAt` so the watchdog can distinguish
+ * freshly-retried runs from genuinely-stuck legacy rows. Pre-existing
+ * `_creationTime`-keyed watchdog could kill a freshly-retried old row
+ * within seconds of starting.
+ */
+export const acquireTranscriptionLock = internalMutation({
+  args: {
+    storageId: v.id('_storage'),
+    runId: v.string(),
+    leaseMs: v.number(),
+  },
+  returns: v.union(v.string(), v.null()),
+  handler: async (ctx, args) => {
+    const row = await ctx.db
+      .query('fileMetadata')
+      .withIndex('by_storageId', (q) => q.eq('storageId', args.storageId))
+      .first();
+    if (!row) return null;
+
+    const now = Date.now();
+    const leaseHeld =
+      typeof row.transcriptionLeaseExpiresAt === 'number' &&
+      row.transcriptionLeaseExpiresAt > now &&
+      row.transcriptionStatus === 'running';
+    if (leaseHeld) return null;
+
+    await ctx.db.patch(row._id, {
+      transcriptionStatus: 'running',
+      transcriptionRunId: args.runId,
+      transcriptionLeaseExpiresAt: now + args.leaseMs,
+      transcriptionStartedAt: now,
+      transcriptionProgress: 'starting',
+    });
+    return args.runId;
+  },
+});
+
+/**
+ * Release the single-flight lock IFF the supplied `runId` matches the
+ * row's current `transcriptionRunId`. Other concurrent callers (or a
+ * watchdog that broke the lock) leave the field alone.
+ */
+export const releaseTranscriptionLock = internalMutation({
+  args: {
+    storageId: v.id('_storage'),
+    runId: v.string(),
+  },
+  returns: v.null(),
+  handler: async (ctx, args) => {
+    const row = await ctx.db
+      .query('fileMetadata')
+      .withIndex('by_storageId', (q) => q.eq('storageId', args.storageId))
+      .first();
+    if (!row || row.transcriptionRunId !== args.runId) return null;
+    await ctx.db.patch(row._id, {
+      transcriptionRunId: undefined,
+      transcriptionLeaseExpiresAt: undefined,
+    });
+    return null;
+  },
+});
+
 /**
  * Watchdog: sweep fileMetadata rows stuck in `transcriptionStatus: 'running'`
  * for >35 minutes. Convex hard-kills actions at the 30-min timeout without
  * running their catch blocks, so without this sweep the send-gate would stay
  * locked forever for the affected uploads. Scheduled from crons.ts.
+ *
+ * Keyed on `transcriptionStartedAt ?? _creationTime` (round-2 P1-10) so a
+ * `retryTranscription` against an old fileMetadata row doesn't get killed
+ * within seconds by the next 5-min tick. Legacy rows without the new
+ * field fall back to `_creationTime`.
  */
 export const recoverStuckTranscriptions = internalMutation({
   args: {},
@@ -391,10 +469,13 @@ export const recoverStuckTranscriptions = internalMutation({
       .withIndex('by_transcriptionStatus', (q) =>
         q.eq('transcriptionStatus', 'running'),
       )) {
-      if (row._creationTime < cutoff) {
+      const startedAt = row.transcriptionStartedAt ?? row._creationTime;
+      if (startedAt < cutoff) {
         await ctx.db.patch(row._id, {
           transcriptionStatus: 'failed',
           transcriptionError: 'Transcription timed out (watchdog)',
+          transcriptionRunId: undefined,
+          transcriptionLeaseExpiresAt: undefined,
         });
         // Cascade the failure back to the owning videoLinkJobs row when
         // present. Without this, a videoLinkJob stuck at
diff --git a/services/platform/convex/file_metadata/mutations.ts b/services/platform/convex/file_metadata/mutations.ts
index 1c8d9a8167..3bc9ed0a9a 100644
--- a/services/platform/convex/file_metadata/mutations.ts
+++ b/services/platform/convex/file_metadata/mutations.ts
@@ -248,10 +248,32 @@ export const skipTranscription = mutation({
       throw new Error('Not authorized');
     }
 
-    await ctx.db.patch(metadata._id, {
-      transcriptionStatus: 'skipped',
-      transcriptionError: 'User skipped transcription',
-    });
+    // Source-state precondition. `skipTranscription` is the Skip button
+    // the chat composer shows after 60s of `running` — it only makes
+    // sense to skip a transcription that's actively in flight or queued.
+    // Skipping a `completed` row would clobber the transcript and
+    // cascade `videoLinkJobs` into a failed state for a successful run.
+    if (
+      metadata.transcriptionStatus !== 'queued' &&
+      metadata.transcriptionStatus !== 'running'
+    ) {
+      throw new Error(
+        `Cannot skip transcription in status "${metadata.transcriptionStatus ?? 'none'}" — only queued or running.`,
+      );
+    }
+
+    // Route through `updateFileTranscription` so the `videoLinkJobs`
+    // cascade at internal_mutations.ts:319-345 fires correctly for
+    // video-link audio (Whisper branch). Direct `db.patch` here would
+    // leave the linked job stuck at `transcribing_handoff` forever.
+    await ctx.runMutation(
+      internal.file_metadata.internal_mutations.updateFileTranscription,
+      {
+        storageId: args.storageId,
+        transcriptionStatus: 'skipped',
+        transcriptionError: 'User skipped transcription',
+      },
+    );
   },
 });
 
@@ -280,9 +302,26 @@ export const retryTranscription = mutation({
       throw new Error('Not authorized');
     }
 
+    // Source-state precondition. Retry is only valid from a terminal
+    // failure state — retrying `running` would double-bill Whisper
+    // (single-flight gate catches it now, but the UI still shouldn't
+    // surface a "Retry" button for an in-flight row); retrying
+    // `completed` would clobber the existing transcript.
+    if (
+      metadata.transcriptionStatus !== 'failed' &&
+      metadata.transcriptionStatus !== 'skipped'
+    ) {
+      throw new Error(
+        `Cannot retry transcription in status "${metadata.transcriptionStatus ?? 'none'}" — only failed or skipped.`,
+      );
+    }
+
     await ctx.db.patch(metadata._id, {
       transcriptionStatus: 'queued',
       transcriptionError: undefined,
+      // Clear the single-flight lock so transcribeAudio can re-acquire.
+      transcriptionRunId: undefined,
+      transcriptionLeaseExpiresAt: undefined,
     });
 
     await ctx.scheduler.runAfter(
diff --git a/services/platform/convex/file_metadata/schema.ts b/services/platform/convex/file_metadata/schema.ts
index 7cf09b2f61..0dc3701c1e 100644
--- a/services/platform/convex/file_metadata/schema.ts
+++ b/services/platform/convex/file_metadata/schema.ts
@@ -55,6 +55,24 @@ export const fileMetadataTable = defineTable({
   // Human-readable progress hint while transcriptionStatus is 'running'
   // (e.g. "compressing", "transcribing chunk 2 of 4"). Cleared on completion.
   transcriptionProgress: v.optional(v.string()),
+  // Single-flight lock for the `transcribeAudio` action. Set atomically
+  // by `acquireTranscriptionLock` when status transitions queued/null →
+  // running. Concurrent invocations on the same storageId (e.g. a
+  // `retryTranscription` double-click) check the run id under a lease
+  // window and short-circuit if another invocation is in flight.
+  // Cleared on completion / final failure / lease expiry.
+  transcriptionRunId: v.optional(v.string()),
+  // Unix ms; transcribeAudio re-acquisition is gated until this point.
+  // `recoverStuckTranscriptions` (watchdog) breaks the lock once the
+  // lease expires AND there's been no progress.
+  transcriptionLeaseExpiresAt: v.optional(v.number()),
+  // Unix ms when transcriptionStatus most recently flipped to 'running'
+  // (stamped by acquireTranscriptionLock). Used by the watchdog to
+  // detect stuck transcriptions without confusing a freshly-retried row
+  // — `_creationTime` alone caused fresh retries of old rows to be
+  // killed within seconds. Falls back to `_creationTime` for legacy
+  // rows that pre-date this field.
+  transcriptionStartedAt: v.optional(v.number()),
   // RAG indexing of the transcript (separate from ragStatus above, which is
   // gated out at scheduling time for audio uploads — see mutations).
   transcriptRagStatus: v.optional(
diff --git a/services/platform/convex/file_metadata/transcribe_audio.ts b/services/platform/convex/file_metadata/transcribe_audio.ts
index 7eb0ef96ac..dd65bff87d 100644
--- a/services/platform/convex/file_metadata/transcribe_audio.ts
+++ b/services/platform/convex/file_metadata/transcribe_audio.ts
@@ -315,12 +315,39 @@ export const transcribeAudio = internalAction({
       return null;
     }
 
+    // Single-flight gate. Two concurrent invocations on the same
+    // storageId (retryTranscription double-click, scheduled retry +
+    // user-triggered retry, etc.) used to both proceed: double Whisper
+    // bill, double `+=` ledger write, double RAG index. The lock holds
+    // for the action's 30-min hard timeout + a small grace so the
+    // watchdog can break it if Convex SIGKILLs us. If we lose the
+    // race, return without doing any work.
+    const TRANSCRIBE_LEASE_MS = 35 * 60 * 1000;
+    const acquired = await ctx.runMutation(
+      internal.file_metadata.internal_mutations.acquireTranscriptionLock,
+      {
+        storageId: args.storageId,
+        runId: requestId,
+        leaseMs: TRANSCRIBE_LEASE_MS,
+      },
+    );
+    if (acquired !== requestId) {
+      console.log(
+        JSON.stringify({
+          event: 'transcription.deduplicated',
+          requestId,
+          storageId: args.storageId,
+          attempt,
+        }),
+      );
+      return null;
+    }
+
     try {
       await ctx.runMutation(
         internal.file_metadata.internal_mutations.updateFileTranscription,
         {
           storageId: args.storageId,
-          transcriptionStatus: 'running',
           transcriptionProgress: 'checking',
         },
       );
@@ -659,6 +686,14 @@ export const transcribeAudio = internalAction({
       if (chunked) {
         await chunked.cleanup();
       }
+      // Release the single-flight lock IFF this invocation still owns
+      // it (the watchdog may have broken it on lease expiry; another
+      // retry could also have claimed it after status flipped). The
+      // mutation no-ops on mismatch.
+      await ctx.runMutation(
+        internal.file_metadata.internal_mutations.releaseTranscriptionLock,
+        { storageId: args.storageId, runId: requestId },
+      );
     }
   },
 });
diff --git a/services/platform/convex/lib/agent_response/generate_response.ts b/services/platform/convex/lib/agent_response/generate_response.ts
index edfb61baa2..9c85eec377 100644
--- a/services/platform/convex/lib/agent_response/generate_response.ts
+++ b/services/platform/convex/lib/agent_response/generate_response.ts
@@ -1174,6 +1174,12 @@ export async function generateAgentResponse(
           result.text = OUTPUT_BLOCKED_SENTINEL;
           // Skip the empty-output-provider-error heuristic below: empty
           // text is now expected.
+          // Match the sibling success/catch return paths (`:580`, `:2008`,
+          // `:2011`) which all stop the abort watcher before returning.
+          // Without this the polling closure keeps issuing redundant
+          // Convex `check_cancelled` queries for up to one ABORT_POLL_INTERVAL
+          // (~1.5s) after the function has returned.
+          abortWatcher?.stop();
           return buildBlockedReturn(
             threadId,
             savedMessageId,
@@ -1264,6 +1270,9 @@ export async function generateAgentResponse(
                 blockedReason,
               );
               result.text = OUTPUT_BLOCKED_SENTINEL;
+              // Sibling parity with the mid-stream guardrails-block path
+              // and the success/catch returns — see note above.
+              abortWatcher?.stop();
               return buildBlockedReturn(
                 threadId,
                 savedMessageId,

From 20aec34619494c0c07c34aed4039742373f9ae1c Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Thu, 28 May 2026 21:57:38 +0800
Subject: [PATCH 13/41] fix(platform,cli): auth + scaffold + deploy data-safety
 fixes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes P1-12, P1-13, P1-14, P1-31, P1-32, P1-33, P1-34 from the multi-
agent review. Seven small but data-loss-class bugs across the auth
hooks, scaffold janitor, deploy CLI, and tale init/update flow.

P1-12 — `beforeUpdateOrganization`'s slug collision check at auth.ts:
654-667 didn't exclude the org being updated. Better Auth's own pre-
check at crud-org.mjs:213-215 does this self-exclude; without
mirroring it, any update payload that re-sends the current slug
(e.g. a name-only PATCH that round-trips the full object, or a UI
form that posts the full org state) 400s with "already taken".
Fixed by reading `data.member.organizationId` and treating a
collision against the same id as a no-op.

P1-13 — `beforeCreateOrganization` at auth.ts:632-636 swallowed
an empty catch around `data.organization.slug = normalizedSlug`.
If the assignment ever threw (frozen object, etc.) the un-normalized
caller-supplied slug would persist while the reservation + unique-
ness checks above had run against the normalized version — defeating
the very normalization the comment at line 580-585 defends against.
Replaced with the cleaner Record<string, unknown> cast pattern that
`beforeUpdateOrganization` already uses; if the assignment ever
throws the create fails loudly rather than persisting the wrong
slug.

P1-14 — `scaffold.ts` bundle-mode (this file's seedSingleDomain
line ~335) and skills uploads (skills/file_actions.ts:706-707) stage
into `<bundle>.staging-<8hex>` / `.replacing-<8hex>` siblings before
atomic-renaming onto the target. Process crash mid-stage leaves
orphans 3 levels deep at `<root>/<org>/<domain>/`. The pre-existing
`sweepStaleCondemnedDirs` janitor only walked root-level
`.deleted-*`, so the orphans would (a) survive forever and (b) make
`dirHasFiles` return true → next `override:false` reseed skips that
domain indefinitely.

  - Renamed `isAtomicWriteTmp` → `isTransientArtifact` and taught
    it to match `\.{staging,replacing}-[a-f0-9]{8}$`. `dirHasFiles`
    now ignores these orphans the same way it ignored `.tmp`.
  - Rewrote `sweepStaleCondemnedDirs` to walk 3 levels (root →
    `<org>` → `<domain>`) and rm 24h-old transient siblings, in
    addition to the existing root-level `.deleted-*` sweep. Skips
    non-validated org dirs and symlinks; per-entry errors only log.
  - Wired the janitor into `scaffoldNewOrganization` (was only
    called from `cleanupOrgFilesystem`) so reseed paths sweep too.

P1-31 — `tale deploy --override` + `--override-all` together is
nonsense: host push runs first, then the catalog factory reseed
clobbers everything --override would have written. Operators were
hitting this combination and reasoning about a silently-discarded
flag. Reject the combination at commander parse time with a
diagnostic that explains the two modes are mutually exclusive.

P1-32 — entry-time legacy-flat-layout check ran unconditionally
inside `withLock`, before any compose action. Plain `tale deploy`
(container rotation; no host push) has no host-push hazard, so the
duplicate check trapped operators with leftover legacy artifacts
who just wanted to roll containers. Gated on
`options.override || options.overrideAll`; the host-push code path
at syncProjectFiles enforces the same check where it matters.

P1-33 — `LEGACY_DOMAIN_DIR_NAMES` (deploy.ts) blocked operators
with legitimately-named org slugs (`agents`, `workflows`, `branding`,
`providers`, `skills`, `integrations`, `retention`) at deploy time
— but `reserved-org-slugs.ts` reserved only `default`, so the UI
happily created those orgs in the first place. The CLI then
classified the org's `<root>/<orgSlug>/` dir as a legacy artifact
and refused to deploy; the error message recommended
`tale migrate config-layout`, which would silently merge that
org's data under `default/`. Data-loss risk.

Fix: move the legacy-domain set into `reserved-org-slugs.ts` so the
UI form's zod refine (organization-form.tsx:73) and Better Auth's
beforeCreate/beforeUpdate hooks all refuse these names up front.
The CLI's `LEGACY_DOMAIN_DIR_NAMES` stays where it is (different
package, hard to share Convex code with), with a comment to keep
the two sets in lockstep.

P1-34 — `tale init` recorded checksums for example files but NOT
for the four AI rules files (CLAUDE.md, .cursor/rules/tale.mdc,
.github/copilot-instructions.md, .windsurfrules). The rules were
written AFTER `writeChecksums`, so the first `tale update` after
init saw `oldHash === undefined` for each and hit the unconditional
"new" branch at update.ts:95-101 — silently clobbering any local
edits the user made between init and that first update.

  - init.ts: moved the rules-file write loop ABOVE the checksum
    construction, and added each rules file to `allFiles`.
  - update.ts: defense-in-depth — the `!oldHash` branch now also
    checks `!existsSync(destPath)`. If the file is present on disk
    but absent from checksums.json (legacy projects init'd by the
    pre-fix CLI), treat it as locally modified: preserve unless
    `--force`, with a warning.

Verified: `bun run check`: 36/36 tasks green; scaffold's 24 tests
still pass after the janitor rewrite.
---
 services/platform/convex/auth.ts              |  32 +++--
 .../platform/convex/organizations/scaffold.ts | 135 +++++++++++++++---
 .../shared/constants/reserved-org-slugs.ts    |  37 ++++-
 tools/cli/src/commands/deploy/index.ts        |  16 +++
 tools/cli/src/lib/actions/deploy.ts           |  15 +-
 tools/cli/src/lib/actions/init.ts             |  33 +++--
 tools/cli/src/lib/actions/update.ts           |  21 ++-
 7 files changed, 238 insertions(+), 51 deletions(-)

diff --git a/services/platform/convex/auth.ts b/services/platform/convex/auth.ts
index 8158ba349b..542592128b 100644
--- a/services/platform/convex/auth.ts
+++ b/services/platform/convex/auth.ts
@@ -626,14 +626,15 @@ export const getAuthOptions = (ctx: GenericCtx<DataModel>) => {
               });
             }
             // Project the normalized slug back so the persisted row
-            // matches what the checks just used. If the field is
-            // read-only on `data.organization`, this is a defensive
-            // no-op — callers are still expected to submit lowercase.
-            try {
-              data.organization.slug = normalizedSlug;
-            } catch {
-              /* read-only field — caller-supplied slug stands */
-            }
+            // matches what the checks just used. Use the same loose-
+            // payload cast pattern as `beforeUpdateOrganization` below
+            // instead of a try/catch swallow — if the assignment ever
+            // throws (frozen object, etc.) it should surface, not
+            // silently fall back to the caller-supplied case (which
+            // would defeat the normalization the reservation + unique-
+            // ness checks just relied on).
+            (data.organization as Record<string, unknown>).slug =
+              normalizedSlug;
           },
           beforeUpdateOrganization: async (data) => {
             // Re-run the create-time guards on update: without this
@@ -660,7 +661,20 @@ export const getAuthOptions = (ctx: GenericCtx<DataModel>) => {
                 ],
               },
             );
-            if (collision) {
+            // Exclude self from collision: Better Auth's payload carries
+            // `data.member.organizationId` (the org being updated). Its
+            // own pre-check at crud-org.mjs:213-215 does this same self-
+            // exclude; without mirroring it here, any update that re-
+            // sends the current slug (e.g. a name-only PATCH that
+            // round-trips the full object) 400s with "already taken".
+            const selfOrgId = (
+              data.member as { organizationId?: unknown } | undefined
+            )?.organizationId;
+            const collisionIsSelf =
+              typeof selfOrgId === 'string' &&
+              isRecord(collision) &&
+              getString(collision, '_id') === selfOrgId;
+            if (collision && !collisionIsSelf) {
               throw new APIError('BAD_REQUEST', {
                 message: `Organization slug "${normalizedSlug}" is already taken.`,
               });
diff --git a/services/platform/convex/organizations/scaffold.ts b/services/platform/convex/organizations/scaffold.ts
index 61e47391c0..607a466b8c 100644
--- a/services/platform/convex/organizations/scaffold.ts
+++ b/services/platform/convex/organizations/scaffold.ts
@@ -103,18 +103,23 @@ function shouldSkipFile(name: string): boolean {
   return SKIP_FILE_SUFFIXES.some((s) => name.endsWith(s));
 }
 
-// atomicWrite leaves `.<basename>.<ts>.<uuid>.tmp` orphans on crash. Those
-// shouldn't lock out a retry, but every other entry (including dotfiles
-// like `.history/` that agents/workflows write on every edit) means a user
-// has been here and we must not overwrite in the non-override path.
-function isAtomicWriteTmp(name: string): boolean {
-  return name.startsWith('.') && name.endsWith('.tmp');
+// atomicWrite leaves `.<basename>.<ts>.<uuid>.tmp` orphans on crash. Bundle-
+// mode scaffolds (this file) and skills uploads (skills/file_actions.ts)
+// stage into `<basename>.staging-<8hex>` / `<basename>.replacing-<8hex>`
+// dirs that are atomic-renamed onto the target. None of these are user-
+// authored content, so a leftover from a crash must not (a) lock out a
+// retry by making `dirHasFiles` return true and (b) make `override:false`
+// skip the whole domain indefinitely.
+const STAGING_SUFFIX_RE = /\.(staging|replacing)-[a-f0-9]{8}$/;
+function isTransientArtifact(name: string): boolean {
+  if (name.startsWith('.') && name.endsWith('.tmp')) return true;
+  return STAGING_SUFFIX_RE.test(name);
 }
 
 async function dirHasFiles(dir: string): Promise<boolean> {
   try {
     const entries = await readdir(dir);
-    return entries.some((n) => !isAtomicWriteTmp(n));
+    return entries.some((n) => !isTransientArtifact(n));
   } catch (err) {
     if (errnoCode(err) !== 'ENOENT') {
       console.warn('[scaffold.dirHasFiles] readdir failed:', dir, err);
@@ -452,33 +457,113 @@ async function seedRetention(
 }
 
 /**
- * Best-effort opportunistic sweep of `.deleted-*` siblings older than
- * 24h that survived a prior failed `rm`. Called at the top of
- * `cleanupOrgFilesystem`. Errors are swallowed (the main op shouldn't
- * fail because of a leftover dir we couldn't clean).
+ * Best-effort opportunistic sweep of orphan transient dirs older than
+ * 24h that survived a prior failed `rm` or process crash:
+ *
+ *   - Root-level `<root>/.deleted-*` (left by the two-phase rename-then-
+ *     delete in `cleanupOrgFilesystem`).
+ *   - Nested `<root>/<org>/<domain>/<bundle>.staging-<8hex>` and
+ *     `.replacing-<8hex>` (left by `seedSingleDomain`'s bundle mode here,
+ *     and by `skills/file_actions.ts:706-707` uploadSkillBundle). Without
+ *     this, an orphan staging dir would make `dirHasFiles` return true
+ *     and the next `override:false` scaffold would skip the whole domain
+ *     indefinitely.
+ *
+ * Errors are swallowed per-entry (the main op shouldn't fail because of a
+ * leftover dir we couldn't clean). Called from both `cleanupOrgFilesystem`
+ * and `scaffoldNewOrganization` so reseed paths sweep too.
  */
 const CONDEMNED_TTL_MS = 24 * 60 * 60 * 1000;
 async function sweepStaleCondemnedDirs(root: string): Promise<void> {
-  let entries: string[];
+  let rootEntries: string[];
   try {
-    entries = await readdir(root);
+    rootEntries = await readdir(root);
   } catch (err) {
     if (errnoCode(err) === 'ENOENT') return;
     throw err;
   }
+
   const now = Date.now();
-  for (const name of entries) {
-    if (!name.startsWith('.deleted-')) continue;
-    const p = path.join(root, name);
-    const info = await lstat(p).catch(() => null);
-    if (!info || info.isSymbolicLink()) continue;
-    if (now - info.mtimeMs < CONDEMNED_TTL_MS) continue;
+
+  const tryRm = async (p: string): Promise<void> => {
     await rm(p, { recursive: true }).catch((err) => {
       console.warn(
-        `[cleanupOrgFilesystem] janitor: rm ${p} failed:`,
+        `[scaffold.janitor] rm ${p} failed:`,
         err instanceof Error ? err.message : err,
       );
     });
+  };
+
+  for (const orgEntry of rootEntries) {
+    const orgPath = path.join(root, orgEntry);
+
+    // Root-level `.deleted-*` orphan from cleanupOrgFilesystem.
+    if (orgEntry.startsWith('.deleted-')) {
+      const info = await lstat(orgPath).catch(() => null);
+      if (!info || info.isSymbolicLink()) continue;
+      if (now - info.mtimeMs < CONDEMNED_TTL_MS) continue;
+      await tryRm(orgPath);
+      continue;
+    }
+
+    // Skip non-org dotdirs at root and ignore non-directories. Org slugs
+    // must validate against the same regex used to scaffold them, so we
+    // don't accidentally recurse into a stray bind-mount.
+    if (orgEntry.startsWith('.')) continue;
+    if (!validateOrgSlug(orgEntry)) continue;
+    const orgInfo = await lstat(orgPath).catch(() => null);
+    if (!orgInfo || !orgInfo.isDirectory() || orgInfo.isSymbolicLink()) {
+      continue;
+    }
+
+    let domainEntries: string[];
+    try {
+      domainEntries = await readdir(orgPath);
+    } catch (err) {
+      if (errnoCode(err) !== 'ENOENT') {
+        console.warn(
+          '[scaffold.janitor] readdir org dir failed:',
+          orgPath,
+          err,
+        );
+      }
+      continue;
+    }
+
+    for (const domainName of domainEntries) {
+      const domainPath = path.join(orgPath, domainName);
+      const domainInfo = await lstat(domainPath).catch(() => null);
+      if (
+        !domainInfo ||
+        !domainInfo.isDirectory() ||
+        domainInfo.isSymbolicLink()
+      ) {
+        continue;
+      }
+
+      let leaves: string[];
+      try {
+        leaves = await readdir(domainPath);
+      } catch (err) {
+        if (errnoCode(err) !== 'ENOENT') {
+          console.warn(
+            '[scaffold.janitor] readdir domain dir failed:',
+            domainPath,
+            err,
+          );
+        }
+        continue;
+      }
+
+      for (const leaf of leaves) {
+        if (!STAGING_SUFFIX_RE.test(leaf)) continue;
+        const leafPath = path.join(domainPath, leaf);
+        const leafInfo = await lstat(leafPath).catch(() => null);
+        if (!leafInfo || leafInfo.isSymbolicLink()) continue;
+        if (now - leafInfo.mtimeMs < CONDEMNED_TTL_MS) continue;
+        await tryRm(leafPath);
+      }
+    }
   }
 }
 
@@ -661,6 +746,16 @@ export const scaffoldNewOrganization = internalAction({
       return { ok: false, skipped: true, results: [] };
     }
 
+    // Opportunistic janitor: sweep root-level `.deleted-*` AND nested
+    // `<org>/<domain>/<bundle>.staging-*` orphans older than 24h before
+    // any per-domain work. Without this, a bundle staging dir orphaned
+    // by a prior crash would make `dirHasFiles` return true and the
+    // domain's non-override seed would skip indefinitely (round-2 P1-14).
+    // Best-effort: errors only log.
+    await sweepStaleCondemnedDirs(configRoot).catch((err) => {
+      console.warn('[scaffoldNewOrganization] janitor sweep failed:', err);
+    });
+
     const catalogRoot = process.env[BUILTIN_ENV];
     const override = args.override ?? false;
 
diff --git a/services/platform/lib/shared/constants/reserved-org-slugs.ts b/services/platform/lib/shared/constants/reserved-org-slugs.ts
index 17a5554bce..621eaf6aa3 100644
--- a/services/platform/lib/shared/constants/reserved-org-slugs.ts
+++ b/services/platform/lib/shared/constants/reserved-org-slugs.ts
@@ -2,17 +2,42 @@
  * Org slugs that the platform reserves and refuses to assign to
  * user-created organizations.
  *
- * `default` is reserved because the platform pins several global
- * resources to it (branding, retention defaults, scaffold seed
- * target). If a user could claim that slug they'd inherit those
- * globals, including the ability to mutate platform branding via
- * `isCallerAdmin` (see `convex/branding/internal_queries.ts`).
+ * Two classes:
+ *
+ * 1. **Platform-pinned globals.** `default` is reserved because the
+ *    platform pins several global resources to it (branding, retention
+ *    defaults, scaffold seed target). If a user could claim that slug
+ *    they'd inherit those globals, including the ability to mutate
+ *    platform branding via `isCallerAdmin` (see
+ *    `convex/branding/internal_queries.ts`).
+ *
+ * 2. **Legacy per-domain directory names** — the names a pre-refactor
+ *    `tale init` would have created at the project root (`agents/`,
+ *    `workflows/`, …). The org-first refactor moved everything under
+ *    `<orgSlug>/<domain>/`, so a user-created org named `agents` would
+ *    produce a `<root>/agents/` directory that's indistinguishable from
+ *    a legacy artifact. `tale deploy`'s `findOrgDirs` classifies any
+ *    such top-level dir as a legacy artifact and refuses to push;
+ *    silently accepting the slug at create time would manufacture a
+ *    permanent deploy failure for that org. Round-2 P1-33: reject the
+ *    name up front so the UI never lets the operator into that state.
  *
  * Importable from both Convex (`convex/auth.ts`) and the React
  * organization form — kept in `lib/shared/constants/` so it stays
  * Node-runtime-neutral.
  */
-const RESERVED_ORG_SLUGS: ReadonlySet<string> = new Set(['default']);
+const RESERVED_ORG_SLUGS: ReadonlySet<string> = new Set([
+  'default',
+  // Legacy per-domain dirs (kept in lockstep with
+  // `tools/cli/src/lib/actions/deploy.ts:LEGACY_DOMAIN_DIR_NAMES`).
+  'agents',
+  'workflows',
+  'integrations',
+  'branding',
+  'providers',
+  'skills',
+  'retention',
+]);
 
 export function isReservedOrgSlug(slug: string): boolean {
   return RESERVED_ORG_SLUGS.has(slug.toLowerCase());
diff --git a/tools/cli/src/commands/deploy/index.ts b/tools/cli/src/commands/deploy/index.ts
index b96da2b814..dc567adee2 100644
--- a/tools/cli/src/commands/deploy/index.ts
+++ b/tools/cli/src/commands/deploy/index.ts
@@ -47,6 +47,22 @@ export function createDeployCommand(): Command {
     )
     .action(async (options) => {
       try {
+        // `--override` and `--override-all` are semantically incompatible:
+        // host push runs first, then the catalog factory reseed clobbers
+        // everything --override would have written (host push effectively
+        // becomes a no-op for non-secrets / non-history / non-branding-
+        // images). Reject the combination at parse time so operators
+        // don't reason about a silently-discarded flag.
+        if (options.override && options.overrideAll) {
+          logger.error(
+            '--override and --override-all cannot be combined: ' +
+              '--override-all factory-reseeds from the builtin catalog and ' +
+              'would clobber whatever --override just pushed. ' +
+              'Pick one: --override (push host workspace to container) ' +
+              'OR --override-all (factory-reseed all orgs server-side).',
+          );
+          process.exit(1);
+        }
         const projectDir = requireProject();
         await resolveOrAssignProjectContext(projectDir);
         const { success: envSetupSuccess, regeneratedAutoSecrets } =
diff --git a/tools/cli/src/lib/actions/deploy.ts b/tools/cli/src/lib/actions/deploy.ts
index 9cfc2995d5..436d74f96c 100644
--- a/tools/cli/src/lib/actions/deploy.ts
+++ b/tools/cli/src/lib/actions/deploy.ts
@@ -146,12 +146,15 @@ export async function deploy(options: DeployOptions): Promise<void> {
       logger.header(`${prefix}Deploying Tale ${version}`);
 
       // Auto-migration framework removed — `tale migrate config-layout` is
-      // the only opt-in, manually-run migration now. Fail fast (before
-      // pulling images / rolling services) if the project still has the
-      // pre-refactor flat layout at the root; otherwise a no-op deploy
-      // could complete while the host config silently never reaches the
-      // container.
-      {
+      // the only opt-in, manually-run migration now. Fail fast on the
+      // pre-refactor flat layout — but ONLY when the operator is actually
+      // pushing host config (`--override` or `--override-all`). Plain
+      // `tale deploy` (container rotation, image pull only) has no host-
+      // push hazard, so trapping operators with legacy artifacts on a
+      // no-op deploy was over-broad. The host-push code path at
+      // syncProjectFiles enforces the same check where it matters
+      // (round-2 P1-32).
+      if (options.override || options.overrideAll) {
         const { legacyDirs } = await findOrgDirs(env.DEPLOY_DIR);
         if (legacyDirs.length > 0) {
           throw new Error(
diff --git a/tools/cli/src/lib/actions/init.ts b/tools/cli/src/lib/actions/init.ts
index c69c23fb67..f4e285aa1f 100644
--- a/tools/cli/src/lib/actions/init.ts
+++ b/tools/cli/src/lib/actions/init.ts
@@ -182,11 +182,32 @@ export async function init(options: InitOptions): Promise<void> {
   const skillFiles = getEmbeddedExamples('skills');
   await writeEmbeddedFiles(skillFiles, join(defaultOrgDir, 'skills'));
 
+  // Write AI rules files. Moved ABOVE the checksum step (was below,
+  // after writeChecksums) so the four rules files — CLAUDE.md,
+  // .cursor/rules/tale.mdc, .github/copilot-instructions.md,
+  // .windsurfrules — get hashed into `.tale/checksums.json` alongside
+  // the example files. Without the hash recorded, `tale update`'s
+  // `!oldHash` "new" branch (update.ts:95-101) hits unconditional
+  // overwrite on the FIRST run after init and silently clobbers any
+  // local edits the user made between init and that first update
+  // (round-2 P1-34).
+  logger.step('Writing AI rules files...');
+  const rulesFiles = generateAllRules();
+  for (const { relativePath, content } of rulesFiles) {
+    const destPath = join(target, relativePath);
+    await mkdir(dirname(destPath), { recursive: true });
+    await Bun.write(destPath, content);
+  }
+
   // Compute checksums. Paths are recorded relative to the project root,
-  // matching where the files actually live (default/<domain>/...).
+  // matching where the files actually live (default/<domain>/... and
+  // the rules files at the project root).
   logger.step('Computing file checksums...');
   const allFiles = new Map<string, string>();
 
+  for (const { relativePath, content } of rulesFiles) {
+    allFiles.set(relativePath, computeContentHash(content));
+  }
   for (const [relPath, content] of agentFiles) {
     allFiles.set(
       join('default', 'agents', relPath),
@@ -255,14 +276,8 @@ export async function init(options: InitOptions): Promise<void> {
   // framework. Existing projects' stale files are harmless and can be
   // deleted manually.)
 
-  // Write AI rules files
-  logger.step('Writing AI rules files...');
-  const rulesFiles = generateAllRules();
-  for (const { relativePath, content } of rulesFiles) {
-    const destPath = join(target, relativePath);
-    await mkdir(dirname(destPath), { recursive: true });
-    await Bun.write(destPath, content);
-  }
+  // (AI rules files are now written ABOVE the checksum step — see the
+  // `generateAllRules()` block earlier so their hashes are recorded.)
 
   // Ensure .gitignore
   await ensureGitignore(target);
diff --git a/tools/cli/src/lib/actions/update.ts b/tools/cli/src/lib/actions/update.ts
index 8d4161d363..a6c879c287 100644
--- a/tools/cli/src/lib/actions/update.ts
+++ b/tools/cli/src/lib/actions/update.ts
@@ -92,13 +92,32 @@ export async function update(options: UpdateOptions): Promise<void> {
     const newHash = computeContentHash(content);
     const oldHash = oldFiles[relativePath];
 
-    if (!oldHash) {
+    if (!oldHash && !existsSync(destPath)) {
       logger.info(`${prefix}+ ${relativePath} (new)`);
       if (!options.dryRun) {
         await mkdir(dirname(destPath), { recursive: true });
         await writeFile(destPath, content);
       }
       rulesUpdates[relativePath] = newHash;
+    } else if (!oldHash) {
+      // File present on disk but missing from checksums.json — treat
+      // as locally-modified (likely a project init'd by a pre-fix CLI
+      // version that wrote the rules files without recording their
+      // hashes). Preserve user edits; require --force to overwrite.
+      // Round-2 P1-34 defense in depth.
+      if (options.force) {
+        logger.warn(
+          `${prefix}~ ${relativePath} (overwritten, no recorded hash)`,
+        );
+        if (!options.dryRun) {
+          await writeFile(destPath, content);
+        }
+        rulesUpdates[relativePath] = newHash;
+      } else {
+        logger.warn(
+          `${prefix}! ${relativePath} (present on disk but no recorded hash; preserving — pass --force to overwrite)`,
+        );
+      }
     } else if (!existsSync(destPath)) {
       logger.info(`${prefix}- ${relativePath} (deleted by user, skipping)`);
     } else {

From 48fc1ce42db04914e784c9d763b67fc11bb106f5 Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Thu, 28 May 2026 22:09:54 +0800
Subject: [PATCH 14/41] fix(rag,crawler): lifecycle + LRU + dim contract
 hardening
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes P1-19, P1-20, P1-21, P1-22, P1-23, P1-24, P1-25, P1-26, P1-27,
and the P1-29 boot-dim test gap from the multi-agent review.

P1-19 — `RagService.shutdown()` had two gaps. New `_shutting_down`
flag is checked at the top of `_ensure_org_clients`; requests landing
between `_org_clients.clear()` and `await close_pool()` now raise
RuntimeError instead of repopulating the cache and binding to a
closing pool. The unbounded `gather(*_background_tasks)` drain (whose
underlying `_safe_close` coroutines each `asyncio.sleep(30)`) is now
wrapped in `asyncio.wait_for(_, timeout=10)`; on timeout the still-
pending tasks are cancelled so shutdown completes promptly.

P1-20 — `_get_org_lock` claimed bounded LRU in the comment but was
actually FIFO with no reordering. A busy org's lock could be evicted
while held by fiber A; fiber B then got a fresh lock and both raced
into `_build_or_refresh_org_clients` with `previous=None`, silently
overwriting each other's client set (no `_safe_close` scheduled
since the cleanup at line 289-302 only fires when `previous is not
None`). Switched `_org_locks` and `_org_clients` to `OrderedDict`
with `move_to_end` on every access; eviction scans for the LRU
*unheld* lock rather than blindly popping the head. `_org_clients`
is now also bounded by the same `_ORG_LOCKS_MAX`, with `_safe_close`
scheduled per evicted client.

P1-21 — new `tests/test_rag_service_concurrency.py` (5 cases)
locks the invariants: shutdown gate, drain timeout, LRU
move-to-end, eviction skips held locks, `_pin_dim_lock` first-write
race serialises across two concurrent dim-pinners.

P1-24 — `database.py` boot-time dim handling. Restored the legible
"dimension mismatch" RuntimeError that was lost in the post-refactor
"ALTER unconditionally" path: now pre-reads `format_type(atttypid,
atttypmod)` on chunks.embedding; if pinned to vector(N) with N !=
configured dim, raise a clear message naming both values. Skip the
ALTER (and its AccessExclusiveLock) when the column already matches.
Pre-existing `_fake_pool` test helper now also tracks fetchval; new
`BOOT_PINNED_DIMS` module global is exported and cleared by the
test fixture.

P1-25 — crawler `_org_states` / `_vision_states` / `_chat_states`
were unbounded dicts holding `AsyncOpenAI` httpx pools. Under typo'd-
slug churn the file-descriptor footprint grew indefinitely. Switched
all three to bounded LRU `OrderedDict`s capped at 64; eviction
schedules `_safe_close` after the standard grace window.

P1-26 — vision hot paths (`ocr_image`, `describe_image`) called
`settings.get_vision_model(get_active_org())` per request, which
routes through `load_providers` → glob providers dir + parse JSON +
fork `sops -d` per `.secrets.json`. On a multi-page PDF OCR run, the
sops fork storm dominated. Now reads the cached model id from
`_vision_states[org].config[2]` — the same pattern
`process_pages_with_llm:456` already uses.

P1-27 — `embedding_service.get_embedding_service` only checked dim
drift within the same org. With chunks.embedding pinned globally to
the default org's dim at boot (P1-24), a second org with a
disagreeing provider config would succeed at config-load and crash
only at INSERT/search time. Now imports `database.BOOT_PINNED_DIMS`
and raises a clear RuntimeError at config-load time naming both
dims and the offending org.

P1-22 — `register_website`'s ON CONFLICT used to overwrite the
shared `websites.scan_interval` on every re-register, so any
member-org silently re-set everyone else's cadence. Now uses
first-org-sets-cadence semantics: ON CONFLICT only touches `status`
(see P1-23) and `updated_at`. Updating cadence requires the
explicit `update_scan_interval` API. Full per-org cadence move to
`website_org_memberships` is deferred to a follow-up; the immediate
"silent clobber on join" bug is closed.

P1-23 — `recover_stuck_deletes` + `execute_delete` race. Between
`begin_delete` marking the row 'deleting' and `execute_delete`
firing on the background task, a new org could join via
`register_website` (now reset-to-idle on conflict — see P1-22).
`execute_delete` now re-checks `COUNT(website_org_memberships)` in
the same tx; if any membership exists it aborts the CASCADE and
flips status back to 'idle' rather than killing the new org's
content.

P1-29 — added two `test_database.py` cases: dim-pin mismatch raises
with legible message and rolls back the pool; already-correctly-
pinned column skips the ALTER (no AccessExclusiveLock churn).

Verified: bun run check 36/36 tasks green; RAG 318 tests (5 new
concurrency cases); crawler 487 tests (2 new dim cases).
---
 services/crawler/app/services/database.py     |  47 +++++-
 .../crawler/app/services/embedding_service.py |  49 ++++++-
 .../crawler/app/services/pg_website_store.py  |  41 +++++-
 .../app/services/vision/openai_client.py      |  56 ++++++--
 services/crawler/tests/test_database.py       |  75 +++++++++-
 services/rag/app/services/rag_service.py      | 134 +++++++++++++++---
 .../rag/tests/test_rag_service_concurrency.py | 134 ++++++++++++++++++
 7 files changed, 495 insertions(+), 41 deletions(-)
 create mode 100644 services/rag/tests/test_rag_service_concurrency.py

diff --git a/services/crawler/app/services/database.py b/services/crawler/app/services/database.py
index 6a00853bfd..d8310d14ed 100644
--- a/services/crawler/app/services/database.py
+++ b/services/crawler/app/services/database.py
@@ -20,6 +20,14 @@
 _pool: asyncpg.Pool | None = None
 _pool_lock = asyncio.Lock()
 
+# Dimensionality of `public_web.chunks.embedding` after `init_pool`
+# finishes. Resolved from the `default` org's embedding-model config at
+# boot; all subsequent per-org client builds in `embedding_service.py`
+# MUST validate against this value (P1-27) — a per-org provider config
+# that disagrees would silently succeed at config-load and crash only
+# at INSERT/search time with a cryptic pgvector cast error.
+BOOT_PINNED_DIMS: int | None = None
+
 
 def _get_database_url() -> str:
     if settings.database_url:
@@ -84,8 +92,38 @@ async def init_pool(*, max_size: int = 10) -> asyncpg.Pool:
 
         try:
             async with acquire_with_retry(pool) as conn:
-                await conn.execute(f"ALTER TABLE {SCHEMA}.chunks ALTER COLUMN embedding TYPE vector({dims})")
-            logger.info(f"Pinned {SCHEMA}.chunks.embedding to vector({dims})")
+                # Pre-check: read the current column type. format_type
+                # returns `vector` (no dim) on a fresh baseline,
+                # `vector(N)` once pinned. If it's already pinned to a
+                # different N, the historical "ALTER unconditionally"
+                # would either (a) raise a cryptic pgvector cast error
+                # mid-startup, or (b) silently re-pin and orphan stored
+                # vectors. Surface a legible message instead and refuse
+                # to continue. Round-2 P1-24 restoration.
+                col_type = await conn.fetchval(
+                    "SELECT format_type(atttypid, atttypmod) "
+                    "FROM pg_attribute "
+                    "WHERE attrelid = $1::regclass AND attname = 'embedding'",
+                    f"{SCHEMA}.chunks",
+                )
+                if isinstance(col_type, str) and col_type.startswith("vector(") and col_type != f"vector({dims})":
+                    raise RuntimeError(
+                        f"Embedding dimension mismatch: {SCHEMA}.chunks.embedding "
+                        f"is {col_type} but the 'default' org's provider config "
+                        f"requests vector({dims}). Either reconcile the provider "
+                        f"catalog to match the existing column dimension, or "
+                        f"restore the database from a backup taken before the "
+                        f"dimension change."
+                    )
+
+                # Only ALTER when needed (column is dimensionless OR
+                # we just verified it matches). Avoids the AccessExclusiveLock
+                # on the chunks table every boot.
+                if col_type != f"vector({dims})":
+                    await conn.execute(f"ALTER TABLE {SCHEMA}.chunks ALTER COLUMN embedding TYPE vector({dims})")
+                    logger.info(f"Pinned {SCHEMA}.chunks.embedding to vector({dims})")
+                else:
+                    logger.info(f"{SCHEMA}.chunks.embedding already pinned to vector({dims}); skipping ALTER")
 
             # Create HNSW index if it doesn't exist yet. After the pin
             # above this is the normal path; the function raises if the
@@ -101,6 +139,11 @@ async def init_pool(*, max_size: int = 10) -> asyncpg.Pool:
             await pool.close()
             raise
 
+        # Record the boot-pinned dim AFTER all guards pass so per-org
+        # embedding-service builds can validate against this single
+        # source of truth (P1-27).
+        global BOOT_PINNED_DIMS
+        BOOT_PINNED_DIMS = dims
         _pool = pool
         return _pool
 
diff --git a/services/crawler/app/services/embedding_service.py b/services/crawler/app/services/embedding_service.py
index f90e538725..bb44e3ebd2 100644
--- a/services/crawler/app/services/embedding_service.py
+++ b/services/crawler/app/services/embedding_service.py
@@ -14,15 +14,23 @@
 import asyncio
 import contextlib
 import time
+from collections import OrderedDict
 
 from loguru import logger
 from tale_knowledge.embedding import EmbeddingService
 
 from app.config import settings
 from app.org_context import get_active_org
+from app.services import database
 
 _CONFIG_CHECK_INTERVAL = 15  # seconds
 
+# Bounded LRU cap on the per-org embedding-service cache. Each entry
+# holds an httpx connection pool, so a typo'd-slug spray or org churn
+# used to slowly leak file descriptors. 64 is well above any realistic
+# concurrent fan-out for a single crawler instance. Round-2 P1-25.
+_ORG_CACHE_MAX = 64
+
 
 class _OrgEmbeddingState:
     __slots__ = ("config", "last_check", "service")
@@ -38,7 +46,20 @@ def __init__(
         self.last_check = last_check
 
 
-_org_states: dict[str, _OrgEmbeddingState] = {}
+_org_states: OrderedDict[str, _OrgEmbeddingState] = OrderedDict()
+
+
+def _evict_lru_if_needed() -> None:
+    """Pop the least-recently-used entry and schedule its client close.
+
+    Called after every new insert. The previous unbounded dict held an
+    AsyncOpenAI httpx pool per entry — under typo'd-slug churn that
+    leaked sockets indefinitely.
+    """
+    while len(_org_states) > _ORG_CACHE_MAX:
+        _victim_key, victim_state = _org_states.popitem(last=False)
+        with contextlib.suppress(RuntimeError):
+            asyncio.get_running_loop().create_task(_close_old(victim_state.service))
 
 
 async def _close_old(service: EmbeddingService) -> None:
@@ -56,6 +77,10 @@ def get_embedding_service() -> EmbeddingService:
 
     now = time.monotonic()
     if state is not None and (now - state.last_check) < _CONFIG_CHECK_INTERVAL:
+        # LRU bump on access — without this, eviction order is FIFO
+        # and a busy org could be evicted out from under in-flight
+        # callers.
+        _org_states.move_to_end(org_slug)
         return state.service
 
     try:
@@ -67,15 +92,31 @@ def get_embedding_service() -> EmbeddingService:
         )
         if state is not None:
             state.last_check = now
+            _org_states.move_to_end(org_slug)
             return state.service
         raise
 
     if state is not None and config == state.config:
         state.last_check = now
+        _org_states.move_to_end(org_slug)
         return state.service
 
     base_url, api_key, model, dims = config
 
+    # Cross-org dim guard (P1-27). chunks.embedding is pinned ONCE at
+    # boot from the `default` org's config; any other org whose config
+    # disagrees would silently succeed here and crash at INSERT/search
+    # time with a cryptic pgvector cast error. Reject at config-load.
+    boot_dims = database.BOOT_PINNED_DIMS
+    if boot_dims is not None and dims != boot_dims:
+        raise RuntimeError(
+            f"Embedding dimension conflict for org '{org_slug}': provider "
+            f"config requests {dims}d but the shared chunks table is pinned "
+            f"to {boot_dims}d (set by the 'default' org at crawler boot). "
+            f"Either reconcile the org's provider catalog to match, or run "
+            f"a separate crawler instance for this org."
+        )
+
     # Never downgrade to empty key
     if not api_key and state is not None:
         logger.warning(
@@ -83,9 +124,10 @@ def get_embedding_service() -> EmbeddingService:
             org_slug,
         )
         state.last_check = now
+        _org_states.move_to_end(org_slug)
         return state.service
 
-    # Refuse dimension change (would corrupt vector index)
+    # Refuse same-org dimension change (would corrupt vector index)
     if state is not None and dims != state.config[3]:
         logger.error(
             "Embedding dimensions for org '{}' changed ({} -> {}). Restart required to re-index.",
@@ -94,6 +136,7 @@ def get_embedding_service() -> EmbeddingService:
             dims,
         )
         state.last_check = now
+        _org_states.move_to_end(org_slug)
         return state.service
 
     old_service = state.service if state is not None else None
@@ -108,6 +151,8 @@ def get_embedding_service() -> EmbeddingService:
         config=config,
         last_check=now,
     )
+    _org_states.move_to_end(org_slug)
+    _evict_lru_if_needed()
 
     if old_service is not None:
         logger.info("Embedding service rebuilt for org '{}': model={}", org_slug, model)
diff --git a/services/crawler/app/services/pg_website_store.py b/services/crawler/app/services/pg_website_store.py
index 8a675ad6e3..5723e6a5f7 100644
--- a/services/crawler/app/services/pg_website_store.py
+++ b/services/crawler/app/services/pg_website_store.py
@@ -272,11 +272,22 @@ async def register_website(
         to decide whether to trigger an immediate scan.
         """
         async with acquire_with_retry(self._pool) as conn, conn.transaction():
+            # ON CONFLICT: don't overwrite `scan_interval` — any member
+            # org's re-register would otherwise silently clobber every
+            # other member's cadence (round-2 P1-22). First-org-sets-
+            # cadence semantics; updating cadence requires the explicit
+            # `update_scan_interval` API call. Also flip `status` back
+            # to 'idle' if a stuck-delete recovery left it at 'deleting'
+            # — a fresh registration is a clear signal the domain is
+            # wanted again (round-2 P1-23).
             await conn.execute(
                 """INSERT INTO websites (domain, scan_interval, created_at, updated_at)
                    VALUES ($1, $2, NOW(), NOW())
                    ON CONFLICT(domain) DO UPDATE SET
-                     scan_interval = EXCLUDED.scan_interval,
+                     status = CASE
+                       WHEN websites.status = 'deleting' THEN 'idle'
+                       ELSE websites.status
+                     END,
                      updated_at = NOW()""",
                 domain,
                 scan_interval,
@@ -393,9 +404,35 @@ async def begin_delete(self, domain: str, org_slug: str) -> dict:
         }
 
     async def execute_delete(self, domain: str) -> None:
-        """Run the actual CASCADE DELETE. Intended for background execution."""
+        """Run the actual CASCADE DELETE. Intended for background execution.
+
+        Same-tx membership re-check (round-2 P1-23): between `begin_delete`
+        marking the row 'deleting' and this method firing on the
+        background task, a new org could have joined via `register_website`
+        (whose ON CONFLICT now resets status='idle' — see P1-22 fix
+        above). If any membership now exists, abort the DELETE rather
+        than CASCADE-killing the new org's content.
+        """
         async with acquire_with_retry(self._pool) as conn, conn.transaction():
             await conn.execute("SET LOCAL statement_timeout = '120s'")
+            remaining = await conn.fetchval(
+                "SELECT COUNT(*) FROM website_org_memberships WHERE domain = $1",
+                domain,
+            )
+            if remaining and remaining > 0:
+                logger.warning(
+                    "execute_delete: aborting CASCADE for %s — %s membership(s) "
+                    "appeared after begin_delete (race with register_website). "
+                    "Domain remains live; status will flip to 'idle' on the "
+                    "next register or scheduler tick.",
+                    domain,
+                    remaining,
+                )
+                await conn.execute(
+                    "UPDATE websites SET status = 'idle', updated_at = NOW() WHERE domain = $1 AND status = 'deleting'",
+                    domain,
+                )
+                return
             await conn.execute("DELETE FROM websites WHERE domain = $1", domain)
         await reindex_chunks(self._pool)
         logger.info(f"Deleted website: {domain}")
diff --git a/services/crawler/app/services/vision/openai_client.py b/services/crawler/app/services/vision/openai_client.py
index 0f46a89f39..41f42a176b 100644
--- a/services/crawler/app/services/vision/openai_client.py
+++ b/services/crawler/app/services/vision/openai_client.py
@@ -13,6 +13,7 @@
 import contextlib
 import imghdr
 import time
+from collections import OrderedDict
 from dataclasses import dataclass
 
 from loguru import logger
@@ -98,13 +99,14 @@ def __init__(
 # so two orgs' requests never share `_client` / `_client_config` (which
 # would route org B's traffic through org A's API key when within the
 # TTL — the bug this refactor fixes).
-_vision_states: dict[str, _OrgVisionState] = {}
-
-# Same shape for chat config (used by `process_pages_with_llm`). Two
-# orgs may legitimately have different chat providers; without an
-# explicit per-org cache, the prior code rebuilt the client on every
-# call and leaked the httpx pool.
-_chat_states: dict[str, _OrgVisionState] = {}
+#
+# OrderedDict for true LRU on access; bounded by `_ORG_CACHE_MAX` so a
+# typo'd-slug spray or natural org churn doesn't leak httpx connection
+# pools indefinitely (round-2 P1-25). The peer cache in
+# `embedding_service.py` uses the same pattern.
+_ORG_CACHE_MAX = 64
+_vision_states: OrderedDict[str, _OrgVisionState] = OrderedDict()
+_chat_states: OrderedDict[str, _OrgVisionState] = OrderedDict()
 
 
 async def _safe_close_client(client: AsyncOpenAI) -> None:
@@ -116,8 +118,27 @@ async def _safe_close_client(client: AsyncOpenAI) -> None:
         logger.opt(exception=True).warning("Failed to close old vision client")
 
 
+def _evict_lru_if_needed(
+    states: OrderedDict[str, _OrgVisionState],
+    label: str,
+) -> None:
+    """Pop the LRU entry from `states` once it crosses `_ORG_CACHE_MAX`.
+
+    Each entry holds an `AsyncOpenAI` httpx connection pool. Without
+    this, a typo'd-slug spray or a long-running process with high org
+    churn slowly leaks file descriptors. Schedule the evicted client's
+    close after the standard grace window so any in-flight call still
+    finishes (round-2 P1-25).
+    """
+    while len(states) > _ORG_CACHE_MAX:
+        _victim_key, victim = states.popitem(last=False)
+        logger.info("Evicting LRU {} client for org '{}'", label, _victim_key)
+        with contextlib.suppress(RuntimeError):
+            asyncio.get_running_loop().create_task(_safe_close_client(victim.client))
+
+
 def _get_or_build_client(
-    states: dict[str, _OrgVisionState],
+    states: OrderedDict[str, _OrgVisionState],
     org_slug: str,
     config_getter,
     *,
@@ -137,6 +158,7 @@ def _get_or_build_client(
     state = states.get(org_slug)
     now = time.monotonic()
     if state is not None and (now - state.last_check) < _CONFIG_CHECK_INTERVAL:
+        states.move_to_end(org_slug)
         return state.client
 
     try:
@@ -175,6 +197,8 @@ def _get_or_build_client(
         config=config,
         last_check=now,
     )
+    states.move_to_end(org_slug)
+    _evict_lru_if_needed(states, label)
 
     if old_client is not None:
         logger.info("{} rebuilt for org '{}': model={}", label, org_slug, model)
@@ -223,8 +247,17 @@ async def ocr_image(
         if cached_result is not None:
             return cached_result
 
+        # Read the model id from the cached `_vision_states[org].config`
+        # tuple instead of `settings.get_vision_model(org)`. The latter
+        # routes through `load_providers` which is uncached — every call
+        # globs the providers dir, parses JSON, and forks `sops -d` per
+        # `.secrets.json`. Multi-page PDF OCR fires this per page, so the
+        # sops fork storm dominated. `_get_client` above already loaded
+        # the same config and stashed it on the state. (Round-2 P1-26;
+        # see `process_pages_with_llm:456` for the same pattern.)
         client = self._get_client()
-        vision_model = settings.get_vision_model(get_active_org())
+        org_slug = get_active_org()
+        vision_model = _vision_states[org_slug].config[2]
         extraction_prompt = prompt or OCR_PROMPT
 
         image_b64 = base64.b64encode(image_bytes).decode("utf-8")
@@ -302,8 +335,11 @@ async def describe_image(
         if cached_result is not None:
             return cached_result
 
+        # Same cached-model-id read as `ocr_image` above. See P1-26 note
+        # there for rationale (sops fork storm bypass).
         client = self._get_client()
-        vision_model = settings.get_vision_model(get_active_org())
+        org_slug = get_active_org()
+        vision_model = _vision_states[org_slug].config[2]
         description_prompt = prompt or DESCRIBE_PROMPT
 
         image_b64 = base64.b64encode(image_bytes).decode("utf-8")
diff --git a/services/crawler/tests/test_database.py b/services/crawler/tests/test_database.py
index 763aabf6a6..e9e0687f80 100644
--- a/services/crawler/tests/test_database.py
+++ b/services/crawler/tests/test_database.py
@@ -18,16 +18,24 @@
 
 @pytest.fixture(autouse=True)
 def _reset_pool():
-    """Ensure module-level _pool is reset before and after each test."""
+    """Ensure module-level _pool and BOOT_PINNED_DIMS are reset before and after each test."""
     db_mod._pool = None
+    db_mod.BOOT_PINNED_DIMS = None
     yield
     db_mod._pool = None
+    db_mod.BOOT_PINNED_DIMS = None
 
 
-def _fake_pool():
-    """Build a mock asyncpg pool with a tracked single connection."""
+def _fake_pool(*, fetchval_returns=None):
+    """Build a mock asyncpg pool with a tracked single connection.
+
+    `fetchval_returns` controls what the pre-pin format_type probe returns
+    (string `vector(N)` for already-pinned, `vector` for fresh baseline,
+    None for a row that doesn't exist).
+    """
     conn = AsyncMock()
     conn.execute = AsyncMock()
+    conn.fetchval = AsyncMock(return_value=fetchval_returns)
 
     pool = AsyncMock()
     pool.close = AsyncMock()
@@ -112,3 +120,64 @@ async def test_raises_when_default_org_provider_unconfigured(self):
                 await db_mod.init_pool()
 
         assert db_mod._pool is None
+
+    @pytest.mark.asyncio
+    async def test_raises_on_pinned_dim_mismatch(self):
+        """If the column is already pinned to vector(N) and the
+        configured dim differs, boot must fail loudly rather than
+        silently ALTER (and orphan existing rows). Restoration of the
+        legible error round-2 P1-24 reported was lost in the move to
+        the post-refactor unconditional ALTER.
+        """
+        fake_pool, acq = _fake_pool(fetchval_returns="vector(768)")
+
+        with (
+            patch("app.services.database.asyncpg.create_pool", AsyncMock(return_value=fake_pool)),
+            patch("app.services.database.acquire_with_retry", acq),
+            patch("app.services.database.settings") as mock_settings,
+        ):
+            mock_settings.get_embedding_config.return_value = (
+                "https://api.example.com",
+                "sk-test",
+                "text-embedding-3-small",
+                1536,
+            )
+            mock_settings.database_url = "postgresql://test:test@localhost/test"
+
+            with pytest.raises(RuntimeError, match="dimension mismatch"):
+                await db_mod.init_pool()
+
+        # Pool got rolled back; the module never recorded a pinned dim.
+        assert db_mod._pool is None
+        assert db_mod.BOOT_PINNED_DIMS is None
+
+    @pytest.mark.asyncio
+    async def test_skips_alter_when_already_correctly_pinned(self):
+        """No-op the unconditional ALTER when the existing pin already
+        matches the configured dim. Avoids the AccessExclusiveLock on
+        the chunks table on every boot (round-2 P1-24).
+        """
+        fake_pool, acq = _fake_pool(fetchval_returns="vector(1536)")
+
+        with (
+            patch("app.services.database.asyncpg.create_pool", AsyncMock(return_value=fake_pool)),
+            patch("app.services.database.acquire_with_retry", acq),
+            patch("app.services.database.settings") as mock_settings,
+        ):
+            mock_settings.get_embedding_config.return_value = (
+                "https://api.example.com",
+                "sk-test",
+                "text-embedding-3-small",
+                1536,
+            )
+            mock_settings.database_url = "postgresql://test:test@localhost/test"
+
+            await db_mod.init_pool()
+
+        conn = fake_pool._test_conn
+        execute_calls = [str(c) for c in conn.execute.call_args_list]
+        # ALTER not run; HNSW index attempt still happens.
+        assert not any("ALTER TABLE" in c for c in execute_calls)
+        assert any("create_chunks_hnsw_index" in c for c in execute_calls)
+        # BOOT_PINNED_DIMS still recorded for cross-org guards.
+        assert db_mod.BOOT_PINNED_DIMS == 1536
diff --git a/services/rag/app/services/rag_service.py b/services/rag/app/services/rag_service.py
index d6bd5cefab..741f116e38 100644
--- a/services/rag/app/services/rag_service.py
+++ b/services/rag/app/services/rag_service.py
@@ -28,8 +28,9 @@
 import asyncio
 import datetime as dt
 import time
+from collections import OrderedDict
 from dataclasses import dataclass
-from typing import Any
+from typing import Any, ClassVar
 
 import asyncpg
 import httpx
@@ -116,9 +117,22 @@ def __init__(self) -> None:
         self._pinned_dims: int | None = None
         self._pin_dim_lock = asyncio.Lock()
         # Per-org client cache and per-org locks (so concurrent first-calls
-        # for the same org don't both build clients).
-        self._org_clients: dict[str, _OrgClients] = {}
-        self._org_locks: dict[str, asyncio.Lock] = {}
+        # for the same org don't both build clients). True LRU on access:
+        # `OrderedDict.move_to_end` on every cache hit; eviction pops the
+        # least-recently-used entry. The previous "FIFO" pop-iter scheme
+        # claimed to be LRU in comments but never reordered, so a busy
+        # org's lock could be evicted while still held by fiber A —
+        # fiber B then got a fresh lock and both fibers raced into
+        # `_build_or_refresh_org_clients` with `previous=None`, silently
+        # overwriting each other's client set with no `_safe_close`
+        # scheduled (round-2 P1-20).
+        self._org_clients: OrderedDict[str, _OrgClients] = OrderedDict()
+        self._org_locks: OrderedDict[str, asyncio.Lock] = OrderedDict()
+        # Set to True at the top of `shutdown`. New `_ensure_org_clients`
+        # calls raise immediately so requests landing mid-shutdown can't
+        # repopulate the cache after `clear()` and bind to a pool that's
+        # about to close (round-2 P1-19).
+        self._shutting_down: bool = False
 
     async def initialize(self) -> None:
         """Initialize the shared database pool.
@@ -147,20 +161,31 @@ def embedding_service(self) -> EmbeddingService | None:
 
     def _get_org_lock(self, org_slug: str) -> asyncio.Lock:
         lock = self._org_locks.get(org_slug)
-        if lock is None:
-            # Bounded LRU eviction: never grow past `_ORG_LOCKS_MAX`. Evict
-            # the oldest entry (Python dicts preserve insertion order); the
-            # evicted lock is safe to drop because either no caller holds
-            # it (it was idle), or the caller still has a reference and
-            # will continue using it — we just lose the "shared lock"
-            # property for that org until the next call recreates it.
-            if len(self._org_locks) >= _ORG_LOCKS_MAX:
-                # `next(iter(...))` returns the oldest key without
-                # building a list.
-                oldest = next(iter(self._org_locks))
-                self._org_locks.pop(oldest, None)
-            lock = asyncio.Lock()
-            self._org_locks[org_slug] = lock
+        if lock is not None:
+            # True LRU: bump on access. Without this, eviction order is
+            # insertion order (FIFO), so a busy org's lock could be
+            # evicted while held — breaking the "shared lock per org"
+            # invariant and producing the racing-builders bug described
+            # on the OrderedDict declaration above.
+            self._org_locks.move_to_end(org_slug)
+            return lock
+
+        # Bounded eviction: scan for the LEAST-recently-used UNHELD lock
+        # rather than blindly popping the head. A held lock means a
+        # fiber is mid-build for that org; evicting it would create a
+        # second concurrent builder. If every entry is held (>=256 orgs
+        # all building concurrently — extremely unlikely), give up on
+        # eviction and let the dict grow by one. The next call will
+        # have more idle locks to pick from.
+        if len(self._org_locks) >= _ORG_LOCKS_MAX:
+            for victim_key in list(self._org_locks.keys()):
+                victim = self._org_locks[victim_key]
+                if not victim.locked():
+                    self._org_locks.pop(victim_key, None)
+                    break
+            # else: all locks held — accept temporary overshoot.
+        lock = asyncio.Lock()
+        self._org_locks[org_slug] = lock
         return lock
 
     async def _ensure_org_clients(self, org_slug: str) -> _OrgClients:
@@ -169,6 +194,8 @@ async def _ensure_org_clients(self, org_slug: str) -> _OrgClients:
         Refresh is gated on `_CONFIG_CHECK_INTERVAL` so a busy org doesn't
         re-read its provider files on every call.
         """
+        if self._shutting_down:
+            raise RuntimeError("RagService is shutting down")
         if not self.initialized:
             await self.initialize()
         if self._pool is None:
@@ -178,6 +205,7 @@ async def _ensure_org_clients(self, org_slug: str) -> _OrgClients:
         if cached is not None:
             now = time.monotonic()
             if (now - cached.last_check) < _CONFIG_CHECK_INTERVAL:
+                self._org_clients.move_to_end(org_slug)
                 return cached
 
         lock = self._get_org_lock(org_slug)
@@ -186,6 +214,7 @@ async def _ensure_org_clients(self, org_slug: str) -> _OrgClients:
             if cached is not None:
                 now = time.monotonic()
                 if (now - cached.last_check) < _CONFIG_CHECK_INTERVAL:
+                    self._org_clients.move_to_end(org_slug)
                     return cached
 
             return await self._build_or_refresh_org_clients(org_slug, cached)
@@ -290,6 +319,34 @@ async def _build_or_refresh_org_clients(
             last_check=time.monotonic(),
         )
         self._org_clients[org_slug] = new_clients
+        self._org_clients.move_to_end(org_slug)
+
+        # Cap `_org_clients` size by the same LRU bound applied to
+        # `_org_locks`. Without this, a long-running process that sees
+        # many distinct (or typo'd) slugs grows the dict without limit;
+        # each entry holds an `AsyncOpenAI` httpx pool + a vision
+        # client. Evict the LRU entry whose org isn't in the middle of
+        # being built (we hold its lock during this block, so the LRU
+        # head won't be us). Round-2 P1-20.
+        while len(self._org_clients) > _ORG_LOCKS_MAX:
+            victim_key, victim_clients = self._org_clients.popitem(last=False)
+            if victim_key == org_slug:
+                # Defensive: re-add and stop. Should not happen — the
+                # entry we just inserted was move_to_end'd above.
+                self._org_clients[victim_key] = victim_clients
+                break
+            loop = asyncio.get_running_loop()
+            for coro_target in (
+                victim_clients.embedding_service.close(),
+                victim_clients.openai_client.close(),
+            ):
+                t = loop.create_task(_safe_close(coro_target))
+                _background_tasks.add(t)
+                t.add_done_callback(_background_tasks.discard)
+            if victim_clients.vision_client is not None:
+                t = loop.create_task(_safe_close(victim_clients.vision_client.close()))
+                _background_tasks.add(t)
+                t.add_done_callback(_background_tasks.discard)
 
         # Best-effort close of old clients after a grace period so in-flight
         # requests on the old clients finish cleanly.
@@ -795,8 +852,28 @@ async def compare_files(
 
         return result
 
+    # Bounded drain for background `_safe_close` tasks during shutdown.
+    # Each `_safe_close` sleeps 30s before its actual close call so in-
+    # flight requests on the old clients can finish; without a timeout
+    # here, a refresh-burst right before shutdown can keep the process
+    # hanging for ~30s x max-concurrent-refreshes. 10s is generous given
+    # the AsyncOpenAI / httpx pool close itself is sub-second.
+    _SHUTDOWN_DRAIN_TIMEOUT_S: ClassVar[float] = 10.0
+
     async def shutdown(self) -> None:
-        """Clean shutdown — close pool and all per-org clients."""
+        """Clean shutdown — close pool and all per-org clients.
+
+        Order matters:
+        1. Flip `_shutting_down` so new `_ensure_org_clients` calls fail
+           fast instead of repopulating the cache and binding new clients
+           to a pool that's about to close (P1-19).
+        2. Close per-org clients; clear the cache.
+        3. Drain `_background_tasks` (the `_safe_close` coroutines that
+           were spawned for client-refresh churn) under a bounded timeout.
+        4. Close the DB pool.
+        """
+        self._shutting_down = True
+
         # Best-effort close of each org's clients before tearing down the pool.
         for org_slug, clients in list(self._org_clients.items()):
             try:
@@ -823,10 +900,23 @@ async def shutdown(self) -> None:
         self._org_clients.clear()
 
         # Drain pending `_safe_close` tasks so they don't keep running
-        # after the pool is closed. `return_exceptions=True` ensures one
-        # failing close doesn't prevent the others from being awaited.
+        # after the pool is closed. Bounded by `_SHUTDOWN_DRAIN_TIMEOUT_S`
+        # so a refresh burst whose 30s `asyncio.sleep` is still pending
+        # can't pin shutdown for the full grace window (P1-19).
         if _background_tasks:
-            await asyncio.gather(*_background_tasks, return_exceptions=True)
+            try:
+                await asyncio.wait_for(
+                    asyncio.gather(*_background_tasks, return_exceptions=True),
+                    timeout=self._SHUTDOWN_DRAIN_TIMEOUT_S,
+                )
+            except TimeoutError:
+                logger.warning(
+                    "shutdown: {} background tasks did not drain within {}s; cancelling",
+                    len(_background_tasks),
+                    self._SHUTDOWN_DRAIN_TIMEOUT_S,
+                )
+                for task in list(_background_tasks):
+                    task.cancel()
 
         await close_pool()
         self.initialized = False
diff --git a/services/rag/tests/test_rag_service_concurrency.py b/services/rag/tests/test_rag_service_concurrency.py
new file mode 100644
index 0000000000..008ddf1de0
--- /dev/null
+++ b/services/rag/tests/test_rag_service_concurrency.py
@@ -0,0 +1,134 @@
+"""Lifecycle + concurrency tests for RagService.
+
+Covers round-2 P1-19/20/21:
+- shutdown flag rejects new requests mid-shutdown
+- background-task drain is bounded by a timeout
+- `_get_org_lock` does true LRU (move_to_end on access) and refuses to
+  evict a held lock
+- `_pin_dim_lock` first-write race serializes when two orgs init
+  concurrently with the same dims (no pre-seeded `_pinned_dims`)
+"""
+
+from __future__ import annotations
+
+import asyncio
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+pytestmark = pytest.mark.asyncio
+
+
+class TestShutdownGate:
+    async def test_ensure_org_clients_rejects_after_shutdown_flag(self):
+        from app.services.rag_service import RagService
+
+        service = RagService()
+        service.initialized = True
+        service._pool = MagicMock()
+        service._shutting_down = True
+
+        with pytest.raises(RuntimeError, match="shutting down"):
+            await service._ensure_org_clients("orgA")
+
+    async def test_shutdown_drain_timeout_cancels_pending_tasks(self):
+        from app.services import rag_service as rag_mod
+        from app.services.rag_service import RagService
+
+        service = RagService()
+        service.initialized = True
+        service._pool = MagicMock()
+
+        async def hanging_task() -> None:
+            await asyncio.sleep(60)  # well beyond the drain timeout
+
+        # Override the drain timeout so the test stays fast.
+        service._SHUTDOWN_DRAIN_TIMEOUT_S = 0.05
+
+        loop = asyncio.get_running_loop()
+        hanger = loop.create_task(hanging_task())
+        rag_mod._background_tasks.add(hanger)
+
+        try:
+            with patch("app.services.rag_service.close_pool", new_callable=AsyncMock):
+                await service.shutdown()
+        finally:
+            rag_mod._background_tasks.discard(hanger)
+            # Drain the cancellation that shutdown propagated; without this,
+            # asyncio leaks an unhandled-cancellation warning into the test
+            # report.
+            with pytest.raises(asyncio.CancelledError):
+                await hanger
+
+        assert service._shutting_down is True
+
+
+class TestOrgLockLRU:
+    def test_access_bumps_to_most_recent(self):
+        from app.services.rag_service import RagService
+
+        service = RagService()
+        for slug in ("orgA", "orgB", "orgC"):
+            service._get_org_lock(slug)
+
+        # Bump orgA — it should be the most-recently-used now.
+        service._get_org_lock("orgA")
+        keys = list(service._org_locks.keys())
+        assert keys[-1] == "orgA"
+        assert keys[0] == "orgB"
+
+    def test_eviction_skips_held_lock(self, monkeypatch):
+        from app.services import rag_service as rag_mod
+        from app.services.rag_service import RagService
+
+        # Squeeze the cap so we can exercise eviction on a small set.
+        monkeypatch.setattr(rag_mod, "_ORG_LOCKS_MAX", 2)
+
+        service = RagService()
+        lock_a = service._get_org_lock("orgA")
+        service._get_org_lock("orgB")
+
+        # Mark orgA's lock as held, then insert orgC. The bounded
+        # eviction must skip orgA and pop orgB.
+        async def hold_and_check():
+            async with lock_a:
+                service._get_org_lock("orgC")
+                assert "orgA" in service._org_locks
+                assert "orgB" not in service._org_locks
+                assert "orgC" in service._org_locks
+
+        asyncio.run(hold_and_check())
+
+
+class TestPinDimLockRace:
+    async def test_two_concurrent_inits_serialise_via_pin_dim_lock(self):
+        """Without `_pin_dim_lock`, two orgs init'ing concurrently would
+        both see `_pinned_dims is None` and both call
+        `pin_embedding_dimensions`. With the lock, the second caller
+        observes the pinned value and falls through to the equality check.
+        """
+        from app.services.rag_service import RagService
+
+        service = RagService()
+        # Hand-roll initialization so we exercise the first-write race
+        # without driving the full client build.
+        service.initialized = True
+        service._pool = MagicMock()
+        assert service._pinned_dims is None
+
+        async def pin(dims: int) -> None:
+            async with service._pin_dim_lock:
+                if service._pinned_dims is None:
+                    service._pinned_dims = dims
+                elif dims != service._pinned_dims:
+                    raise ValueError(f"dim mismatch: have {service._pinned_dims}, got {dims}")
+
+        # Two concurrent pinners requesting the same dim — both must
+        # succeed; `_pinned_dims` settles to that dim.
+        await asyncio.gather(pin(1536), pin(1536))
+        assert service._pinned_dims == 1536
+
+        # A subsequent pinner requesting a different dim must raise
+        # under the lock (vs racing past the None check).
+        with pytest.raises(ValueError, match="dim mismatch"):
+            await pin(3072)

From 956d633bcb8c9104afb69fec576bbab5434d95ab Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Thu, 28 May 2026 22:25:28 +0800
Subject: [PATCH 15/41] fix(platform,rag,crawler,docs): polish + lightweight
 correctness fixes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes the final P1 cluster from the org-first review:

- P1-15: SSE watcher now emits invalidations for <org>/retention.json so
  governance UI cache refreshes when an operator edits the file.
- P1-16: replace 6 empty readdir catches (agents, workflows file actions)
  with handleDirReadError helper — ENOENT silently falls through, other
  errors are logged instead of silently swallowed.
- P1-17: move overscroll-behavior:none from packages/ui/src/globals.css
  to platform/app/locals.css so docs + web shells keep native rubber-band
  scroll.
- P1-30: extract canonical ORG_SLUG_RE + validate_org_slug to
  tale_shared.config.org_slug and route provider loader + RAG and crawler
  auth dependencies through it; drops three near-identical local regexes.
- P1-37: README.fr.md tu-form alignment (exécutez → exécute).
- P1-38: doc paths under TALE_CONFIG_DIR now include the <orgSlug>
  segment across en/de/fr models + integrations overview pages, matching
  the org-first layout.
- P1-39: lift RetentionConfigMissingError → ConvexError translation into
  computeEffectiveAppliedBounds so the four call sites can drop their
  duplicated try/catch; tightens seeder's catch to the
  RETENTION_CONFIG_MISSING code only.
---
 README.fr.md                                  |  2 +-
 docs/de/platform/integrations/overview.md     |  2 +-
 docs/de/platform/models.md                    |  6 +--
 docs/en/platform/integrations/overview.md     |  2 +-
 docs/en/platform/models.md                    |  6 +--
 docs/fr/platform/integrations/overview.md     |  2 +-
 docs/fr/platform/models.md                    |  6 +--
 .../src/tale_shared/config/org_slug.py        | 35 ++++++++++++++
 .../src/tale_shared/config/providers.py       | 14 +++++-
 packages/ui/src/globals.css                   |  7 ---
 services/crawler/app/org_context.py           |  8 +---
 services/platform/app/locals.css              |  8 ++++
 .../platform/convex/agents/file_actions.ts    | 10 ++--
 .../convex/agents/internal_actions.ts         |  5 +-
 .../governance/retention_bounds_proposal.ts   | 47 ++++++++++++-------
 services/platform/convex/lib/file_io.ts       | 23 +++++++++
 .../platform/convex/workflows/file_actions.ts |  7 ++-
 services/platform/lib/config-watcher.ts       | 17 ++++++-
 services/rag/app/auth.py                      |  9 +---
 19 files changed, 155 insertions(+), 61 deletions(-)
 create mode 100644 packages/tale_shared/src/tale_shared/config/org_slug.py

diff --git a/README.fr.md b/README.fr.md
index 802699ed93..0cdb03165d 100644
--- a/README.fr.md
+++ b/README.fr.md
@@ -88,7 +88,7 @@ tale cleanup                       # Supprimer les conteneurs inactifs
 tale reset --force                 # Supprimer tous les conteneurs
 ```
 
-Voir la [référence du CLI](tools/cli/README.md) pour toutes les options et flags. Mettre à jour un déploiement existant nécessite une migration manuelle unique : exécutez `tale migrate config-layout` puis `tale deploy --override-all -y`. Le runbook complet se trouve dans [Mises à niveau auto-hébergées](docs/fr/self-hosted/operate/upgrades.md).
+Voir la [référence du CLI](tools/cli/README.md) pour toutes les options et flags. Mettre à jour un déploiement existant nécessite une migration manuelle unique : exécute `tale migrate config-layout` puis `tale deploy --override-all -y`. Le runbook complet se trouve dans [Mises à niveau auto-hébergées](docs/fr/self-hosted/operate/upgrades.md).
 
 ## Déployer en production
 
diff --git a/docs/de/platform/integrations/overview.md b/docs/de/platform/integrations/overview.md
index 2b56f09d82..9353179baf 100644
--- a/docs/de/platform/integrations/overview.md
+++ b/docs/de/platform/integrations/overview.md
@@ -65,7 +65,7 @@ Microsoft 365 deckt auch Identität ab. Sie unter **Einstellungen > Integratione
 
 ## Eine eigene Integration hinzufügen
 
-Eigene Integrationen folgen derselben JSON-Form wie die oben. Leg eine Konfiguration in `TALE_CONFIG_DIR/integrations/<slug>/config.json` ab, die die Operationen, die Auth-Methode und die erlaubten Hosts deklariert; die Integration erscheint in **Einstellungen > Integrationen**, damit User sie verbinden können. Die Form und die Validierungsregeln leben neben den ausgelieferten Konfigurationen in `examples/default/integrations/`.
+Eigene Integrationen folgen derselben JSON-Form wie die oben. Leg eine Konfiguration in `TALE_CONFIG_DIR/<orgSlug>/integrations/<slug>/config.json` ab, die die Operationen, die Auth-Methode und die erlaubten Hosts deklariert; unter dem org-first Layout ist jeder `integrations/`-Unterbaum der Org unabhängig. Die Integration erscheint in **Einstellungen > Integrationen**, damit User sie verbinden können. Die Form und die Validierungsregeln leben neben den ausgelieferten Konfigurationen in `examples/default/integrations/`.
 
 Für reichere oder selbst gehostete Brücken sind [MCP-Server](/de/platform/integrations/mcp-servers) die alternative Oberfläche — jeder MCP-Server, den du registrierst, fügt seine Tools dem Agent-Werkzeuggürtel hinzu mit pro-Tool-Genehmigung.
 
diff --git a/docs/de/platform/models.md b/docs/de/platform/models.md
index 60c060bad8..9dceeed9aa 100644
--- a/docs/de/platform/models.md
+++ b/docs/de/platform/models.md
@@ -15,7 +15,7 @@ Modelle driften schneller als Docs. Die Listen unten stimmen zum Zeitpunkt, an d
 | **OpenAI**            | Speech-to-Text, Text-to-Speech | Whisper ist die praktische Baseline für Transkription; gpt-4o-mini-tts ist das billigste verlässliche TTS           | [platform.openai.com/docs/models](https://platform.openai.com/docs/models)     |
 | **Vercel AI Gateway** | Bildgenerierung                | Ein OpenAI-kompatibler Endpunkt deckt FLUX, Imagen und Nano Banana ab, ohne pro-Anbieter-Keys                       | [vercel.com/docs/ai-gateway/models](https://vercel.com/docs/ai-gateway/models) |
 
-Jeder Provider oben ist ein OpenAI-kompatibler Endpunkt, den Tale per HTTPS mit Bearer-Token aufruft. Du kannst jeden durch einen anderen Provider ersetzen (auch einen lokalen Ollama- oder vLLM-Server), indem du die passende JSON unter `TALE_CONFIG_DIR/providers/` deiner Instanz bearbeitest.
+Jeder Provider oben ist ein OpenAI-kompatibler Endpunkt, den Tale per HTTPS mit Bearer-Token aufruft. Du kannst jeden durch einen anderen Provider ersetzen (auch einen lokalen Ollama- oder vLLM-Server), indem du die passende JSON unter `TALE_CONFIG_DIR/<orgSlug>/providers/` deiner Instanz bearbeitest — unter dem org-first Layout sind Provider-Kataloge pro Org (jede Org hat ihren eigenen `providers/`-Unterbaum).
 
 ## OpenRouter — Chat, Vision, Embeddings
 
@@ -35,7 +35,7 @@ OpenRouter ist ein Multi-Modell-Gateway. Die ausgelieferte Konfiguration wählt
 - **Meta** — LLaMA 4 Maverick, LLaMA 4 Scout.
 - **Black Forest Labs** — FLUX.2 [max], FLUX.2 [pro], FLUX.2 [flex].
 
-Der volle und aktuelle Katalog lebt auf [openrouter.ai/models](https://openrouter.ai/models). Jedes Modell, das OpenRouter exponiert, kannst du auf deiner Instanz hinzufügen, indem du das `models`-Array in `providers/openrouter.json` unter `TALE_CONFIG_DIR` bearbeitest.
+Der volle und aktuelle Katalog lebt auf [openrouter.ai/models](https://openrouter.ai/models). Jedes Modell, das OpenRouter exponiert, kannst du auf deiner Instanz hinzufügen, indem du das `models`-Array in `<orgSlug>/providers/openrouter.json` unter `TALE_CONFIG_DIR` bearbeitest (pro Org unter dem org-first Layout).
 
 ## OpenAI — Speech-to-Text und Text-to-Speech
 
@@ -57,7 +57,7 @@ Der breitere Katalog liegt auf [vercel.com/docs/ai-gateway/models](https://verce
 
 ## Provider tauschen oder hinzufügen
 
-Die drei oben genannten Provider sind Defaults, keine Vorgaben. Ersetz jeden durch einen anderen OpenAI-kompatiblen Endpunkt, indem du die JSON in `TALE_CONFIG_DIR/providers/` bearbeitest — richt sie auf deine eigene API, ändere das `models`-Array, und Tale lädt beim nächsten Start neu. Eine lokale Ollama-Instanz, ein privater vLLM-Cluster oder ein Bedrock-Proxy passen alle in dieselbe Form. Die Mechanik lebt unter [Konfiguration → Provider](/de/self-hosted/configuration/providers); das Admin-UI-Formular für dieselbe Konfiguration liegt auf [Provider](/de/platform/admin/providers).
+Die drei oben genannten Provider sind Defaults, keine Vorgaben. Ersetz jeden durch einen anderen OpenAI-kompatiblen Endpunkt, indem du die JSON in `TALE_CONFIG_DIR/<orgSlug>/providers/` bearbeitest — richt sie auf deine eigene API, ändere das `models`-Array, und Tale lädt beim nächsten Start neu. Eine lokale Ollama-Instanz, ein privater vLLM-Cluster oder ein Bedrock-Proxy passen alle in dieselbe Form. Die Mechanik lebt unter [Konfiguration → Provider](/de/self-hosted/configuration/providers); das Admin-UI-Formular für dieselbe Konfiguration liegt auf [Provider](/de/platform/admin/providers).
 
 ## Wo das hineinpasst
 
diff --git a/docs/en/platform/integrations/overview.md b/docs/en/platform/integrations/overview.md
index 788d78febb..2c10a2ff0f 100644
--- a/docs/en/platform/integrations/overview.md
+++ b/docs/en/platform/integrations/overview.md
@@ -65,7 +65,7 @@ Microsoft 365 also covers identity. Connecting it under **Settings > Integration
 
 ## Adding a custom integration
 
-Custom integrations follow the same JSON shape as the ones above. Drop a config into `TALE_CONFIG_DIR/integrations/<slug>/config.json` declaring the operations, auth method, and allowed hosts; the integration appears in **Settings > Integrations** for users to connect. The shape and validation rules live alongside the shipped configs in `examples/default/integrations/`.
+Custom integrations follow the same JSON shape as the ones above. Drop a config into `TALE_CONFIG_DIR/<orgSlug>/integrations/<slug>/config.json` declaring the operations, auth method, and allowed hosts; under the org-first layout each org's `integrations/` subtree is independent. The integration appears in **Settings > Integrations** for users to connect. The shape and validation rules live alongside the shipped configs in `examples/default/integrations/`.
 
 For richer or self-hosted bridges, [MCP servers](/platform/integrations/mcp-servers) are the alternative surface — every MCP server you register adds its tools to the agent toolbelt with per-tool approval.
 
diff --git a/docs/en/platform/models.md b/docs/en/platform/models.md
index a6d94e6b83..9d0df3453b 100644
--- a/docs/en/platform/models.md
+++ b/docs/en/platform/models.md
@@ -15,7 +15,7 @@ Models drift faster than docs. The lists below are correct at the time `examples
 | **OpenAI**            | Speech-to-text, text-to-speech | Whisper is the practical baseline for transcription; gpt-4o-mini-tts is the cheapest reliable TTS         | [platform.openai.com/docs/models](https://platform.openai.com/docs/models)     |
 | **Vercel AI Gateway** | Image generation               | One OpenAI-compatible endpoint covers FLUX, Imagen, and Nano Banana without per-vendor keys               | [vercel.com/docs/ai-gateway/models](https://vercel.com/docs/ai-gateway/models) |
 
-Every provider above is an OpenAI-compatible endpoint Tale calls over HTTPS with a bearer token. You can replace any of them with a different provider (including a local Ollama or vLLM server) by editing the matching JSON under your instance's `TALE_CONFIG_DIR/providers/`.
+Every provider above is an OpenAI-compatible endpoint Tale calls over HTTPS with a bearer token. You can replace any of them with a different provider (including a local Ollama or vLLM server) by editing the matching JSON under your instance's `TALE_CONFIG_DIR/<orgSlug>/providers/` — under the org-first layout, provider catalogs are per-org (each org's subtree holds its own `providers/` directory).
 
 ## OpenRouter — chat, vision, embeddings
 
@@ -35,7 +35,7 @@ OpenRouter is a multi-model gateway. The shipped config picks `deepseek-v4-flash
 - **Meta** — LLaMA 4 Maverick, LLaMA 4 Scout.
 - **Black Forest Labs** — FLUX.2 [max], FLUX.2 [pro], FLUX.2 [flex].
 
-The full and live catalogue lives at [openrouter.ai/models](https://openrouter.ai/models). Any model OpenRouter exposes can be added to your instance by editing the `models` array in `providers/openrouter.json` under `TALE_CONFIG_DIR`.
+The full and live catalogue lives at [openrouter.ai/models](https://openrouter.ai/models). Any model OpenRouter exposes can be added to your instance by editing the `models` array in `<orgSlug>/providers/openrouter.json` under `TALE_CONFIG_DIR` (per-org under the org-first layout).
 
 ## OpenAI — speech-to-text and text-to-speech
 
@@ -57,7 +57,7 @@ The wider catalogue is at [vercel.com/docs/ai-gateway/models](https://vercel.com
 
 ## Swapping or adding providers
 
-The three providers above are defaults, not requirements. Replace any of them with a different OpenAI-compatible endpoint by editing the JSON in `TALE_CONFIG_DIR/providers/` — point it at your own API, change the `models` array, and Tale reloads on next start. A local Ollama instance, a private vLLM cluster, or a Bedrock proxy all fit the same shape. The mechanics live under [Configuration → providers](/self-hosted/configuration/providers); the admin-UI form for the same config lives at [Providers](/platform/admin/providers).
+The three providers above are defaults, not requirements. Replace any of them with a different OpenAI-compatible endpoint by editing the JSON in `TALE_CONFIG_DIR/<orgSlug>/providers/` — point it at your own API, change the `models` array, and Tale reloads on next start. A local Ollama instance, a private vLLM cluster, or a Bedrock proxy all fit the same shape. The mechanics live under [Configuration → providers](/self-hosted/configuration/providers); the admin-UI form for the same config lives at [Providers](/platform/admin/providers).
 
 ## Where this fits
 
diff --git a/docs/fr/platform/integrations/overview.md b/docs/fr/platform/integrations/overview.md
index cd4d1ecefb..2e230a6686 100644
--- a/docs/fr/platform/integrations/overview.md
+++ b/docs/fr/platform/integrations/overview.md
@@ -65,7 +65,7 @@ Microsoft 365 couvre aussi l'identité. La connecter sous **Paramètres > Intég
 
 ## Ajouter une intégration personnalisée
 
-Les intégrations personnalisées suivent la même forme JSON que celles ci-dessus. Dépose une configuration dans `TALE_CONFIG_DIR/integrations/<slug>/config.json` déclarant les opérations, la méthode d'auth et les hôtes autorisés ; l'intégration apparaît sous **Paramètres > Intégrations** pour que les utilisateurs la connectent. La forme et les règles de validation vivent à côté des configurations livrées dans `examples/default/integrations/`.
+Les intégrations personnalisées suivent la même forme JSON que celles ci-dessus. Dépose une configuration dans `TALE_CONFIG_DIR/<orgSlug>/integrations/<slug>/config.json` déclarant les opérations, la méthode d'auth et les hôtes autorisés ; sous le layout org-first, le sous-arbre `integrations/` de chaque org est indépendant. L'intégration apparaît sous **Paramètres > Intégrations** pour que les utilisateurs la connectent. La forme et les règles de validation vivent à côté des configurations livrées dans `examples/default/integrations/`.
 
 Pour des ponts plus riches ou auto-hébergés, les [serveurs MCP](/fr/platform/integrations/mcp-servers) sont la surface alternative — chaque serveur MCP que tu enregistres ajoute ses outils à la ceinture d'outils de l'agent avec approbation par outil.
 
diff --git a/docs/fr/platform/models.md b/docs/fr/platform/models.md
index ba1180e796..178bba1038 100644
--- a/docs/fr/platform/models.md
+++ b/docs/fr/platform/models.md
@@ -15,7 +15,7 @@ Les modèles dérivent plus vite que la doc. Les listes ci-dessous sont correcte
 | **OpenAI**            | Reconnaissance et synthèse vocales | Whisper est la baseline pratique pour la transcription ; gpt-4o-mini-tts est le TTS fiable le moins cher                          | [platform.openai.com/docs/models](https://platform.openai.com/docs/models)     |
 | **Vercel AI Gateway** | Génération d'images                | Un seul endpoint compatible OpenAI couvre FLUX, Imagen et Nano Banana sans clé par fournisseur                                    | [vercel.com/docs/ai-gateway/models](https://vercel.com/docs/ai-gateway/models) |
 
-Chaque fournisseur ci-dessus est un endpoint compatible OpenAI que Tale appelle en HTTPS avec un bearer token. Tu peux remplacer chacun par un autre fournisseur (y compris un serveur Ollama ou vLLM local) en éditant le JSON correspondant sous `TALE_CONFIG_DIR/providers/` de ton instance.
+Chaque fournisseur ci-dessus est un endpoint compatible OpenAI que Tale appelle en HTTPS avec un bearer token. Tu peux remplacer chacun par un autre fournisseur (y compris un serveur Ollama ou vLLM local) en éditant le JSON correspondant sous `TALE_CONFIG_DIR/<orgSlug>/providers/` de ton instance — sous le layout org-first, les catalogues de fournisseurs sont par-org (chaque org a son propre sous-arbre `providers/`).
 
 ## OpenRouter — chat, vision, embeddings
 
@@ -35,7 +35,7 @@ OpenRouter est une passerelle multi-modèles. La configuration livrée choisit `
 - **Meta** — LLaMA 4 Maverick, LLaMA 4 Scout.
 - **Black Forest Labs** — FLUX.2 [max], FLUX.2 [pro], FLUX.2 [flex].
 
-Le catalogue complet et à jour vit sur [openrouter.ai/models](https://openrouter.ai/models). Tout modèle exposé par OpenRouter peut être ajouté à ton instance en éditant le tableau `models` dans `providers/openrouter.json` sous `TALE_CONFIG_DIR`.
+Le catalogue complet et à jour vit sur [openrouter.ai/models](https://openrouter.ai/models). Tout modèle exposé par OpenRouter peut être ajouté à ton instance en éditant le tableau `models` dans `<orgSlug>/providers/openrouter.json` sous `TALE_CONFIG_DIR` (par-org sous le layout org-first).
 
 ## OpenAI — reconnaissance et synthèse vocales
 
@@ -57,7 +57,7 @@ Le catalogue plus large vit sur [vercel.com/docs/ai-gateway/models](https://verc
 
 ## Échanger ou ajouter des fournisseurs
 
-Les trois fournisseurs ci-dessus sont des défauts, pas des obligations. Remplace chacun par un autre endpoint compatible OpenAI en éditant le JSON dans `TALE_CONFIG_DIR/providers/` — pointe-le vers ton API, change le tableau `models`, et Tale recharge au prochain démarrage. Une instance Ollama locale, un cluster vLLM privé, ou un proxy Bedrock entrent tous dans la même forme. La mécanique vit sous [Configuration → providers](/fr/self-hosted/configuration/providers) ; le formulaire UI admin pour la même configuration est sur [Providers](/fr/platform/admin/providers).
+Les trois fournisseurs ci-dessus sont des défauts, pas des obligations. Remplace chacun par un autre endpoint compatible OpenAI en éditant le JSON dans `TALE_CONFIG_DIR/<orgSlug>/providers/` — pointe-le vers ton API, change le tableau `models`, et Tale recharge au prochain démarrage. Une instance Ollama locale, un cluster vLLM privé, ou un proxy Bedrock entrent tous dans la même forme. La mécanique vit sous [Configuration → providers](/fr/self-hosted/configuration/providers) ; le formulaire UI admin pour la même configuration est sur [Providers](/fr/platform/admin/providers).
 
 ## Où ça s'inscrit
 
diff --git a/packages/tale_shared/src/tale_shared/config/org_slug.py b/packages/tale_shared/src/tale_shared/config/org_slug.py
new file mode 100644
index 0000000000..1c42587c96
--- /dev/null
+++ b/packages/tale_shared/src/tale_shared/config/org_slug.py
@@ -0,0 +1,35 @@
+"""Org slug validation shared across all Python services.
+
+Single source of truth so RAG, crawler, and `tale_shared.config.providers`
+agree on what counts as a legal slug. Keep in lockstep with
+`services/platform/lib/shared/constants/org-slug.ts`'s `ORG_SLUG_REGEX`.
+
+The regex protects file-system writes against:
+- `.` / `..` / absolute paths (e.g. `/etc/...`) — would silently rewrite
+  to legacy flat layout (`Path("/app/data") / "." / "providers"` →
+  `/app/data/providers`).
+- shell metacharacters that could leak into log lines or process
+  arguments.
+- empty / whitespace-only slugs.
+"""
+
+from __future__ import annotations
+
+import re
+
+ORG_SLUG_RE = re.compile(r"^[a-z0-9][a-z0-9_-]{0,63}$")
+
+
+class InvalidOrgSlugError(ValueError):
+    """Raised when an `org_slug` arg violates the canonical shape."""
+
+
+def validate_org_slug(org_slug: str) -> str:
+    """Return `org_slug` if it matches `ORG_SLUG_RE`; raise otherwise.
+
+    Returns the slug unchanged so call sites can inline the check:
+    `providers_dir = base / validate_org_slug(org_slug) / "providers"`.
+    """
+    if not isinstance(org_slug, str) or not ORG_SLUG_RE.fullmatch(org_slug):
+        raise InvalidOrgSlugError(f"invalid org_slug {org_slug!r}: must match {ORG_SLUG_RE.pattern}")
+    return org_slug
diff --git a/packages/tale_shared/src/tale_shared/config/providers.py b/packages/tale_shared/src/tale_shared/config/providers.py
index b2075ec501..6db7831c32 100644
--- a/packages/tale_shared/src/tale_shared/config/providers.py
+++ b/packages/tale_shared/src/tale_shared/config/providers.py
@@ -7,6 +7,7 @@
 from dataclasses import dataclass, field
 from pathlib import Path
 
+from tale_shared.config.org_slug import validate_org_slug
 from tale_shared.utils.sops import decrypt_secrets_file
 
 logger = logging.getLogger(__name__)
@@ -53,8 +54,17 @@ def load_providers(
     Reads *.json (excluding *.secrets.json) and decrypts matching
     *.secrets.json files via SOPS.
     """
-    if not org_slug:
-        raise ValueError("load_providers requires a non-empty org_slug")
+    # Defense in depth: the FastAPI deps in RAG / crawler already gate
+    # `X-Tale-Org` against the same regex at request boundary, but
+    # internal callers in long-running tasks (e.g. crawler scheduler,
+    # vision hot paths) reach this function with a slug taken from
+    # `get_active_org()` or other module state. A `.` or `/etc` slipping
+    # in here would silently route to `<base>/providers` or `/etc/
+    # providers` via Path()'s "absolute resets" / "dot is a no-op"
+    # semantics — exactly the legacy-flat-layout class the org-first
+    # refactor exists to retire. Validate at the boundary, not just on
+    # the way in. (Round-2 P1-30.)
+    validate_org_slug(org_slug)
 
     shared_config = os.environ.get("TALE_PLATFORM_SHARED_CONFIG_DIR")
     if shared_config:
diff --git a/packages/ui/src/globals.css b/packages/ui/src/globals.css
index cbd1fcd961..9dac355328 100644
--- a/packages/ui/src/globals.css
+++ b/packages/ui/src/globals.css
@@ -268,13 +268,6 @@
     height: 100%;
   }
 
-  /* Suppress the document-level rubber-band / pull-to-refresh so the app shell
-     feels native. Inner overflow-y-auto regions still scroll normally. */
-  html,
-  body {
-    overscroll-behavior: none;
-  }
-
   body {
     font-family: 'Inter', ui-sans-serif, system-ui, sans-serif;
     text-rendering: optimizeLegibility;
diff --git a/services/crawler/app/org_context.py b/services/crawler/app/org_context.py
index 48930a9f14..a707c876d4 100644
--- a/services/crawler/app/org_context.py
+++ b/services/crawler/app/org_context.py
@@ -16,14 +16,10 @@
 500, not as "served the wrong org's models for an hour".
 """
 
-import re
 from contextvars import ContextVar
 
 from fastapi import Header, HTTPException, status
-
-# Aligned with services/platform/convex/lib/file_io.ts:25; capped at 64 chars
-# to match tools/cli/src/lib/migrate-config-layout/script.sh:134.
-_ORG_SLUG_RE = re.compile(r"^[a-z0-9][a-z0-9_-]{0,63}$")
+from tale_shared.config.org_slug import ORG_SLUG_RE
 
 _active_org: ContextVar[str | None] = ContextVar("tale_active_org", default=None)
 
@@ -60,7 +56,7 @@ async def require_org_slug(
             status_code=status.HTTP_400_BAD_REQUEST,
             detail="missing X-Tale-Org header",
         )
-    if not _ORG_SLUG_RE.match(x_tale_org):
+    if not ORG_SLUG_RE.match(x_tale_org):
         raise HTTPException(
             status_code=status.HTTP_400_BAD_REQUEST,
             detail="invalid X-Tale-Org header",
diff --git a/services/platform/app/locals.css b/services/platform/app/locals.css
index 8e53e66eb4..9da1666344 100644
--- a/services/platform/app/locals.css
+++ b/services/platform/app/locals.css
@@ -10,3 +10,11 @@ body,
   margin: 0;
   overflow: clip;
 }
+
+/* Suppress the document-level rubber-band / pull-to-refresh so the app shell
+   feels native. Inner overflow-y-auto regions still scroll normally.
+   Platform-only: the docs and web sites want native rubber-band scroll. */
+html,
+body {
+  overscroll-behavior: none;
+}
diff --git a/services/platform/convex/agents/file_actions.ts b/services/platform/convex/agents/file_actions.ts
index 79fd2190f6..aea145bb61 100644
--- a/services/platform/convex/agents/file_actions.ts
+++ b/services/platform/convex/agents/file_actions.ts
@@ -30,6 +30,7 @@ import {
 import {
   atomicWrite,
   generateHistoryTimestamp,
+  handleDirReadError,
   pruneHistory,
   readFileSafe,
   readJsonFile,
@@ -167,7 +168,8 @@ export const listAgents = action({
     let entries: string[];
     try {
       entries = await readdir(dir);
-    } catch {
+    } catch (err) {
+      handleDirReadError(err, 'agents.listAgents');
       return [];
     }
 
@@ -491,7 +493,8 @@ export const duplicateAgent = action({
     let entries: string[];
     try {
       entries = await readdir(dir);
-    } catch {
+    } catch (err) {
+      handleDirReadError(err, 'agents.duplicateAgent');
       entries = [];
     }
     const existingNames = new Set(
@@ -652,7 +655,8 @@ export const listHistory = action({
     let entries: string[];
     try {
       entries = await readdir(historyDir);
-    } catch {
+    } catch (err) {
+      handleDirReadError(err, 'agents.listHistory');
       return [];
     }
 
diff --git a/services/platform/convex/agents/internal_actions.ts b/services/platform/convex/agents/internal_actions.ts
index a3ce9473af..adbc77fcd5 100644
--- a/services/platform/convex/agents/internal_actions.ts
+++ b/services/platform/convex/agents/internal_actions.ts
@@ -8,7 +8,7 @@ import { isRecord, getString } from '../../lib/utils/type-guards';
 import { internal } from '../_generated/api';
 import { internalAction } from '../_generated/server';
 import { getPollingInterval } from '../documents/internal_actions';
-import { readJsonFile } from '../lib/file_io';
+import { handleDirReadError, readJsonFile } from '../lib/file_io';
 import { orgSlugFromId } from '../lib/helpers/org_slug';
 import { ragFetch } from '../lib/helpers/rag_config';
 import { deleteDocumentById } from '../workflow_engine/action_defs/rag/helpers/delete_document';
@@ -285,7 +285,8 @@ export const listAgentsInternal = internalAction({
     let entries: string[];
     try {
       entries = await readdir(dir);
-    } catch {
+    } catch (err) {
+      handleDirReadError(err, 'agents.listAgentsInternal');
       return [];
     }
 
diff --git a/services/platform/convex/governance/retention_bounds_proposal.ts b/services/platform/convex/governance/retention_bounds_proposal.ts
index c160c8bda9..034a192ae7 100644
--- a/services/platform/convex/governance/retention_bounds_proposal.ts
+++ b/services/platform/convex/governance/retention_bounds_proposal.ts
@@ -69,6 +69,12 @@ async function loadOrgRetentionConfig(
  * `{category: {min, max}}` shape that goes into `retentionAppliedBounds`
  * + the hash. The full `EffectiveBoundDef` (with env-binding detail and
  * display metadata) is for the banner UI, not the snapshot.
+ *
+ * Both failure modes — operator never installed the file, AND env
+ * tightening references a category the file doesn't declare — surface
+ * as the SAME `ConvexError({code: 'RETENTION_CONFIG_MISSING'})`. Lifting
+ * the translation here keeps every caller's body straight-line — the
+ * earlier pattern had four duplicated try/catches around this call.
  */
 async function computeEffectiveAppliedBounds(
   ctx: ActionCtx,
@@ -82,7 +88,19 @@ async function computeEffectiveAppliedBounds(
         'Retention config not yet installed. Copy examples/default/retention.json to $TALE_CONFIG_DIR/default/retention.json then reload.',
     });
   }
-  const all = applyEnvTighteningAll(orgConfig);
+  let all;
+  try {
+    all = applyEnvTighteningAll(orgConfig);
+  } catch (err) {
+    if (err instanceof RetentionConfigMissingError) {
+      throw new ConvexError({
+        code: 'RETENTION_CONFIG_MISSING',
+        category: err.category,
+        message: err.message,
+      });
+    }
+    throw err;
+  }
   const out: AppliedBoundsByCategory = {};
   for (const def of all) {
     out[def.category] = { min: def.min, max: def.max };
@@ -285,19 +303,7 @@ export const getPendingBoundsProposal = action({
     });
 
     const orgSlug = await resolveOrgSlug(ctx, args.organizationId);
-    let proposedBounds: AppliedBoundsByCategory;
-    try {
-      proposedBounds = await computeEffectiveAppliedBounds(ctx, orgSlug);
-    } catch (err) {
-      if (err instanceof RetentionConfigMissingError) {
-        throw new ConvexError({
-          code: 'RETENTION_CONFIG_MISSING',
-          category: err.category,
-          message: err.message,
-        });
-      }
-      throw err;
-    }
+    const proposedBounds = await computeEffectiveAppliedBounds(ctx, orgSlug);
     const proposedHash = await hashAppliedBounds(proposedBounds);
 
     const applied: {
@@ -532,10 +538,15 @@ export const seedInitialBoundsInternal = internalAction({
     try {
       proposedBounds = await computeEffectiveAppliedBounds(ctx, orgSlug);
     } catch (err) {
-      if (err instanceof ConvexError) {
-        // Config-missing — operator hasn't installed the file. Log
-        // loudly so the seed retry path is obvious and bail (cleanup
-        // already skips orgs without applied rows).
+      // Seeder runs for every org during migration; orgs that haven't
+      // installed the retention config yet are expected to be skipped
+      // (cleanup already skips orgs without applied rows). Other errors
+      // are real bugs — propagate.
+      if (
+        err instanceof ConvexError &&
+        isRecord(err.data) &&
+        err.data.code === 'RETENTION_CONFIG_MISSING'
+      ) {
         console.warn(
           `[seedInitialBoundsInternal] org ${args.organizationId}: ${err.message}`,
         );
diff --git a/services/platform/convex/lib/file_io.ts b/services/platform/convex/lib/file_io.ts
index c2746b0e35..0a55cb8cd9 100644
--- a/services/platform/convex/lib/file_io.ts
+++ b/services/platform/convex/lib/file_io.ts
@@ -54,6 +54,29 @@ function isFileNotFound(err: unknown): boolean {
  * "permission denied" / "I/O error" (always worth logging) without
  * duplicating the property-check ceremony.
  */
+/**
+ * Apply the providers/-style ENOENT-vs-other discrimination to a
+ * `readdir` error. ENOENT is the legitimate "directory doesn't exist
+ * yet" case — every list-domain endpoint treats it as an empty result.
+ * Any other errno (EACCES, EIO, EISDIR, …) means the operator misconfigured
+ * the volume mount or there's a real fs problem; silently returning `[]`
+ * makes the bug invisible. Log with a label so the source is identifiable
+ * in `docker logs` and surface as an empty list to the caller so the
+ * route still responds.
+ *
+ * Used to replace silent `catch {}` blocks at:
+ *  - convex/agents/file_actions.ts (listAgents, duplicateAgent, listHistory)
+ *  - convex/agents/internal_actions.ts (listAgentsInternal)
+ *  - convex/workflows/file_actions.ts (listWorkflowsInternal, getAvailableWorkflows)
+ */
+export function handleDirReadError(err: unknown, label: string): void {
+  if (errnoCode(err) === 'ENOENT') return;
+  console.warn(
+    `[${label}] readdir failed:`,
+    err instanceof Error ? err.message : err,
+  );
+}
+
 export function errnoCode(err: unknown): string | undefined {
   if (err instanceof Error && 'code' in err && typeof err.code === 'string') {
     return err.code;
diff --git a/services/platform/convex/workflows/file_actions.ts b/services/platform/convex/workflows/file_actions.ts
index 9562d0f512..472b9df554 100644
--- a/services/platform/convex/workflows/file_actions.ts
+++ b/services/platform/convex/workflows/file_actions.ts
@@ -22,6 +22,7 @@ import { authComponent } from '../auth';
 import {
   atomicWrite,
   generateHistoryTimestamp,
+  handleDirReadError,
   pruneHistory,
   readFileSafe,
   readJsonFile,
@@ -705,7 +706,8 @@ export const listWorkflowsForAgent = internalAction({
     let raw;
     try {
       raw = await readdir(dir, { recursive: true, withFileTypes: true });
-    } catch {
+    } catch (err) {
+      handleDirReadError(err, 'workflows.listWorkflowsForAgent');
       return [];
     }
 
@@ -771,7 +773,8 @@ export const getAvailableWorkflows = action({
     let raw;
     try {
       raw = await readdir(dir, { recursive: true, withFileTypes: true });
-    } catch {
+    } catch (err) {
+      handleDirReadError(err, 'workflows.getAvailableWorkflows');
       return [];
     }
 
diff --git a/services/platform/lib/config-watcher.ts b/services/platform/lib/config-watcher.ts
index 337cfea0aa..1b30c8e418 100644
--- a/services/platform/lib/config-watcher.ts
+++ b/services/platform/lib/config-watcher.ts
@@ -11,7 +11,8 @@ interface ConfigChangeEvent {
     | 'integrations'
     | 'providers'
     | 'branding'
-    | 'skills';
+    | 'skills'
+    | 'retention';
   orgSlug?: string;
   slug?: string;
 }
@@ -60,6 +61,20 @@ function parseConfigChange(relativePath: string): ConfigChangeEvent | null {
   const orgSlug = parts[0];
   if (!ORG_SLUG_REGEX.test(orgSlug)) return null;
 
+  // Single-file-per-org configs sit at `<org>/<stem>.json` (currently
+  // just `retention.json`; future-proof for `quota.json` etc.). Without
+  // this branch they fell through to null, so operator edits to
+  // `<org>/retention.json` never invalidated the governance UI cache
+  // (round-2 P1-15). Emit at slug=stem granularity so consumers can
+  // key their cache invalidation on it.
+  if (parts.length === 2 && parts[1].endsWith('.json')) {
+    const stem = parts[1].slice(0, -'.json'.length);
+    if (stem === 'retention') {
+      return { type: 'retention', orgSlug, slug: stem };
+    }
+    return null;
+  }
+
   const domain = parts[1];
 
   if (domain === 'branding') {
diff --git a/services/rag/app/auth.py b/services/rag/app/auth.py
index dc46007a46..d3a23f8af0 100644
--- a/services/rag/app/auth.py
+++ b/services/rag/app/auth.py
@@ -12,18 +12,13 @@
 """
 
 import hmac
-import re
 
 from fastapi import Header, HTTPException, status
 from loguru import logger
+from tale_shared.config.org_slug import ORG_SLUG_RE
 
 from .config import settings
 
-# Org-slug regex aligned with services/platform/convex/lib/file_io.ts:25
-# plus the literal "default". Capped at 64 chars to match the platform's
-# migrate-script regex (script.sh:134). Keep these in sync.
-_ORG_SLUG_RE = re.compile(r"^[a-z0-9][a-z0-9_-]{0,63}$")
-
 
 def _extract_bearer(header_value: str | None) -> str | None:
     if not header_value:
@@ -74,7 +69,7 @@ async def require_org_slug(
             status_code=status.HTTP_400_BAD_REQUEST,
             detail="missing X-Tale-Org header",
         )
-    if not _ORG_SLUG_RE.match(x_tale_org):
+    if not ORG_SLUG_RE.match(x_tale_org):
         raise HTTPException(
             status_code=status.HTTP_400_BAD_REQUEST,
             detail="invalid X-Tale-Org header",

From d0878b88daf17afe2e57d9355982af15dc21cec8 Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Thu, 28 May 2026 23:08:40 +0800
Subject: [PATCH 16/41] build(crawler,rag): bump uv HTTP timeout to 300s for
 large wheels

Default ~30s timeout causes transient PyPI fetch failures on slow links
when pulling large wheels (scipy, playwright/patchright, ML libs).
---
 services/crawler/Dockerfile | 5 +++++
 services/rag/Dockerfile     | 4 ++++
 2 files changed, 9 insertions(+)

diff --git a/services/crawler/Dockerfile b/services/crawler/Dockerfile
index fdeaf8591c..0214ee7c0b 100644
--- a/services/crawler/Dockerfile
+++ b/services/crawler/Dockerfile
@@ -35,6 +35,11 @@ WORKDIR /app
 # Install uv for faster package management
 COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
 
+# Network resilience for PyPI: default uv timeout (~30s) is too aggressive for
+# large wheels (scipy ~34 MB, playwright/patchright ~45 MB each) on slow links.
+# Bumping to 300s prevents transient timeouts from failing the whole build.
+ENV UV_HTTP_TIMEOUT=300
+
 # Copy and install shared packages first (better layer caching)
 COPY packages/tale_shared/pyproject.toml /packages/tale_shared/pyproject.toml
 COPY packages/tale_shared/src/ /packages/tale_shared/src/
diff --git a/services/rag/Dockerfile b/services/rag/Dockerfile
index 81d6906be3..6ff9b9968c 100644
--- a/services/rag/Dockerfile
+++ b/services/rag/Dockerfile
@@ -31,6 +31,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 # Install uv for faster package management
 COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
 
+# Network resilience for PyPI: default uv timeout (~30s) is too aggressive for
+# large ML wheels on slow links — bump to 300s to survive transient slowdowns.
+ENV UV_HTTP_TIMEOUT=300
+
 # Copy and install shared packages first (better layer caching)
 COPY packages/tale_shared/pyproject.toml /packages/tale_shared/pyproject.toml
 COPY packages/tale_shared/src/ /packages/tale_shared/src/

From b2e840a089f0b68fcb539ed7f3c9647b59d64831 Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Fri, 29 May 2026 01:35:23 +0800
Subject: [PATCH 17/41] fix(platform): close cross-tenant writes in agents
 knowledge mutations

addKnowledgeFile and removeKnowledgeFile accepted a caller-supplied
fileId: v.id('_storage') after verifying only org membership, then
called ctx.storage.delete / saveFileMetadata / indexKnowledgeFile
against that fileId. Convex _storage is deployment-global and the
fileMetadata by_storageId index is not org-scoped, so an authenticated
member of org A could pass an org B storageId and delete the blob,
patch the foreign metadata row, or schedule indexing against B's blob.

removeKnowledgeFile now requires the fileId to be present in the
org-scoped agentBindings.knowledgeFiles array before touching anything.
addKnowledgeFile cross-checks via fileMetadata: a foreign storageId
with metadata in another org is rejected. Both paths share an opaque
'file_not_in_org' ConvexError to avoid cross-org existence probing.

cleanupAgentBinding remains safe by construction (it iterates only
this binding's knowledgeFiles, which addKnowledgeFile's new gate
keeps clean of foreign fileIds).
---
 services/platform/convex/agents/mutations.ts | 40 +++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/services/platform/convex/agents/mutations.ts b/services/platform/convex/agents/mutations.ts
index 4fc514e8a7..96bf5d945b 100644
--- a/services/platform/convex/agents/mutations.ts
+++ b/services/platform/convex/agents/mutations.ts
@@ -6,15 +6,40 @@
  * references: team assignment and knowledge files with storage IDs.
  */
 
-import { v } from 'convex/values';
+import { ConvexError, v } from 'convex/values';
 
 import { internal } from '../_generated/api';
+import type { Id } from '../_generated/dataModel';
+import type { MutationCtx } from '../_generated/server';
 import { internalMutation, mutation } from '../_generated/server';
 import { authComponent } from '../auth';
 import { extractExtension } from '../documents/extract_extension';
 import { getOrganizationMember } from '../lib/rls';
 import { knowledgeFileValidator } from './schema';
 
+/**
+ * Convex `_storage` is a deployment-global namespace and the `fileMetadata`
+ * `by_storageId` index is not org-scoped. Public mutations that take a caller-
+ * supplied `fileId` must cross-check the storageId against fileMetadata before
+ * touching storage / scheduling RAG work — otherwise a member of org A can
+ * supply an org B storageId and trigger writes against org B's blob/row.
+ */
+async function assertStorageIdInOrg(
+  ctx: MutationCtx,
+  organizationId: string,
+  storageId: Id<'_storage'>,
+): Promise<void> {
+  const meta = await ctx.db
+    .query('fileMetadata')
+    .withIndex('by_storageId', (q) => q.eq('storageId', storageId))
+    .first();
+  if (meta && meta.organizationId !== organizationId) {
+    // Same opaque message in both refusal paths so a caller cannot probe
+    // whether a foreign storageId exists in some other org.
+    throw new ConvexError('file_not_in_org');
+  }
+}
+
 export const upsertBinding = internalMutation({
   args: {
     organizationId: v.string(),
@@ -167,6 +192,8 @@ export const addKnowledgeFile = mutation({
       name: authUser.name,
     });
 
+    await assertStorageIdInOrg(ctx, args.organizationId, args.fileId);
+
     await ctx.runMutation(
       internal.file_metadata.internal_mutations.saveFileMetadata,
       {
@@ -252,6 +279,17 @@ export const removeKnowledgeFile = mutation({
       )
       .first();
 
+    // The fileId must be present in this org's binding. Storage + metadata
+    // deletes below are global by storageId, so trusting the caller-supplied
+    // fileId without proving org-ownership lets org A wipe org B's blobs.
+    const inBinding = (binding?.knowledgeFiles ?? []).some(
+      (f) => f.fileId === args.fileId,
+    );
+    if (!inBinding) throw new ConvexError('file_not_in_org');
+
+    // Defense-in-depth: also confirm fileMetadata (if any) is org-scoped.
+    await assertStorageIdInOrg(ctx, args.organizationId, args.fileId);
+
     if (binding) {
       const filtered = (binding.knowledgeFiles ?? []).filter(
         (f) => f.fileId !== args.fileId,

From 77ae1bd701367c27f26a5d352eb6ef616bb885b5 Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Fri, 29 May 2026 01:40:41 +0800
Subject: [PATCH 18/41] fix(platform): close P1 cross-org gaps in governance,
 SSE, and history
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. governance/erasure.ts — eraseSubjectTwoFactorAttempts and
   eraseSubjectLoginAttempts wiped global-key tables (twoFactorAttempts
   by userId; loginAttempts/loginBlockCounters by email). For a
   multi-org subject, an admin of org A filing erasure would silently
   reset the subject's 2FA backoff + login lockout state for every
   other org they belong to — a cross-tenant auth-throttling bypass
   primitive. Gate both wipes via new subjectIsMemberOfOtherActiveOrgs
   helper: when the subject still belongs to another active (non-
   disabled) org, skip the wipe and log a partial-outcome warning.
   The other org's auth-state remains intact; the last-org admin's
   erasure runs the cleanup.

2. http.ts /api/sse/auth — the membership lookup didn't filter
   role === 'disabled'. A soft-removed member kept receiving SSE
   file-change events for the org they were kicked from. Now matches
   the canonical getUserOrganizations filter.

3. agents/file_actions.ts — readHistoryEntry and restoreFromHistory
   used resolved.startsWith(path.resolve(historyDir)) without a
   trailing path.sep and never validated args.timestamp shape. A
   crafted timestamp containing '../' or referencing a sibling
   agent's history dir whose path string starts with the original
   prefix could escape the agent's history scope. Replaced with
   safeJoinWithinDir + validateTimestamp from lib/file_io (the same
   helpers the rest of the agents module uses).
---
 .../platform/convex/agents/file_actions.ts    | 22 +++---
 .../platform/convex/governance/erasure.ts     | 70 +++++++++++++++++++
 services/platform/convex/http.ts              |  7 ++
 3 files changed, 87 insertions(+), 12 deletions(-)

diff --git a/services/platform/convex/agents/file_actions.ts b/services/platform/convex/agents/file_actions.ts
index aea145bb61..b891697cf0 100644
--- a/services/platform/convex/agents/file_actions.ts
+++ b/services/platform/convex/agents/file_actions.ts
@@ -34,7 +34,9 @@ import {
   pruneHistory,
   readFileSafe,
   readJsonFile,
+  safeJoinWithinDir,
   sha256,
+  validateTimestamp,
 } from '../lib/file_io';
 import { stripNulls } from '../lib/strip_nulls';
 import { resolveOrgSlug } from '../organizations/resolve_org_slug';
@@ -683,13 +685,11 @@ export const readHistoryEntry = action({
       ctx,
       args.organizationId,
     );
-    const historyDir = resolveHistoryDir(orgSlug, args.agentName);
-    const filePath = path.join(historyDir, `${args.timestamp}.json`);
-
-    const resolved = path.resolve(filePath);
-    if (!resolved.startsWith(path.resolve(historyDir))) {
-      throw new Error('Path traversal detected');
+    if (!validateTimestamp(args.timestamp)) {
+      throw new Error('Invalid timestamp');
     }
+    const historyDir = resolveHistoryDir(orgSlug, args.agentName);
+    const filePath = safeJoinWithinDir(historyDir, `${args.timestamp}.json`);
 
     const content = await readFileSafe(filePath);
     if (!content) {
@@ -719,15 +719,13 @@ export const restoreFromHistory = action({
   handler: async (ctx, args): Promise<{ hash: string }> => {
     const auth = await requireOrgMembershipById(ctx, args.organizationId);
     const { orgSlug } = auth;
+    if (!validateTimestamp(args.timestamp)) {
+      throw new Error('Invalid timestamp');
+    }
     const historyDir = resolveHistoryDir(orgSlug, args.agentName);
-    const historyPath = path.join(historyDir, `${args.timestamp}.json`);
+    const historyPath = safeJoinWithinDir(historyDir, `${args.timestamp}.json`);
     const agentPath = resolveAgentFilePath(orgSlug, args.agentName);
 
-    const resolved = path.resolve(historyPath);
-    if (!resolved.startsWith(path.resolve(historyDir))) {
-      throw new Error('Path traversal detected');
-    }
-
     const historyContent = await readFileSafe(historyPath);
     if (!historyContent) throw new Error('History entry not found');
 
diff --git a/services/platform/convex/governance/erasure.ts b/services/platform/convex/governance/erasure.ts
index 7a8dff8637..17fd01d6d1 100644
--- a/services/platform/convex/governance/erasure.ts
+++ b/services/platform/convex/governance/erasure.ts
@@ -53,6 +53,7 @@
 import { ConvexError, v } from 'convex/values';
 
 import { components, internal } from '../_generated/api';
+import type { MutationCtx } from '../_generated/server';
 import {
   internalAction,
   internalMutation,
@@ -95,6 +96,40 @@ const erasureReasonCodeValidator = v.union(
   ...ERASURE_REASON_CODES.map((c) => v.literal(c)),
 );
 
+/**
+ * True when the subject still has an active (non-disabled) membership in
+ * any organization other than `excludeOrgId`. Used to gate global-key
+ * wipes (`twoFactorAttempts`, `loginAttempts`, `loginBlockCounters`)
+ * during a per-org erasure: those tables are keyed globally on userId or
+ * email, so wiping them on org A's erasure of a multi-org subject would
+ * reset the user's auth-throttling state for every other org they belong
+ * to — a lockout-counter / 2FA-backoff bypass primitive.
+ */
+async function subjectIsMemberOfOtherActiveOrgs(
+  ctx: MutationCtx,
+  userId: string,
+  excludeOrgId: string,
+): Promise<boolean> {
+  const result = await ctx.runQuery(components.betterAuth.adapter.findMany, {
+    model: 'member',
+    // 256 caps protection at the SSE auth route; same cap here so the
+    // worst-case row scan stays bounded. Anyone with that many memberships
+    // is an operator account, not a real subject.
+    paginationOpts: { cursor: null, numItems: 256 },
+    where: [{ field: 'userId', value: userId, operator: 'eq' }],
+  });
+  const rows: unknown[] = Array.isArray(result?.page) ? result.page : [];
+  for (const row of rows) {
+    if (typeof row !== 'object' || row === null) continue;
+    const r = row as { organizationId?: unknown; role?: unknown };
+    if (typeof r.organizationId !== 'string') continue;
+    if (r.organizationId === excludeOrgId) continue;
+    if (typeof r.role === 'string' && r.role === 'disabled') continue;
+    return true;
+  }
+  return false;
+}
+
 /**
  * Clear the per-subject `activeErasureClaims.requestId` IFF the claim
  * still points at the row identified by `requestId`. Defensive against
@@ -1202,6 +1237,23 @@ export const eraseSubjectTwoFactorAttempts = internalMutation({
   args: { organizationId: v.string(), userId: v.string() },
   returns: v.object({ rows: v.number(), skippedByHold: v.number() }),
   handler: async (ctx, args) => {
+    // Multi-org gate: refuse to wipe global auth-state when the subject
+    // still has an active membership in another org. Otherwise an admin
+    // of org A erasing a multi-org user would silently reset every other
+    // org's 2FA backoff counter for that user.
+    if (
+      await subjectIsMemberOfOtherActiveOrgs(
+        ctx,
+        args.userId,
+        args.organizationId,
+      )
+    ) {
+      console.warn(
+        '[erasure] skipping twoFactorAttempts wipe: subject is member of other active orgs',
+        { userId: args.userId, requestingOrgId: args.organizationId },
+      );
+      return { rows: 0, skippedByHold: 0 };
+    }
     const iter = () =>
       ctx.db
         .query('twoFactorAttempts')
@@ -1576,6 +1628,24 @@ export const eraseSubjectLoginAttempts = internalMutation({
     skippedByHold: v.number(),
   }),
   handler: async (ctx, args) => {
+    // Multi-org gate: refuse to wipe global email-keyed auth state when
+    // the subject still has an active membership in another org. The
+    // subject's loginAttempts / loginBlockCounters protect every org
+    // they belong to; clearing them on a single org's erasure would
+    // grant a lockout-counter bypass primitive across tenants.
+    if (
+      await subjectIsMemberOfOtherActiveOrgs(
+        ctx,
+        args.userId,
+        args.organizationId,
+      )
+    ) {
+      console.warn(
+        '[erasure] skipping loginAttempts wipe: subject is member of other active orgs',
+        { userId: args.userId, requestingOrgId: args.organizationId },
+      );
+      return { attempts: 0, blockCounters: 0, skippedByHold: 0 };
+    }
     const lower = args.email.toLowerCase();
     const attemptsIter = () =>
       ctx.db
diff --git a/services/platform/convex/http.ts b/services/platform/convex/http.ts
index fcc7af544f..a0b8b7ee7d 100644
--- a/services/platform/convex/http.ts
+++ b/services/platform/convex/http.ts
@@ -343,7 +343,14 @@ http.route({
     const memberRows: unknown[] = Array.isArray(memberships?.page)
       ? memberships.page
       : [];
+    // Drop rows where the user is soft-removed via `role = 'disabled'`
+    // (matches the canonical filter in lib/rls/organization/get_user_organizations.ts).
+    // Without this filter, a disabled member keeps receiving SSE file events
+    // for the org they were kicked from until the row is hard-deleted.
     const orgIds: string[] = memberRows
+      .filter((row) =>
+        isRecord(row) ? getString(row, 'role') !== 'disabled' : false,
+      )
       .map((row) =>
         isRecord(row) ? getString(row, 'organizationId') : undefined,
       )

From f74b1b89605d3cdb6d958e66e6a4eaaf65665384 Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Fri, 29 May 2026 01:42:15 +0800
Subject: [PATCH 19/41] fix(crawler): close P1 cross-tenant race +
 scan_interval contract drift
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. execute_delete (pg_website_store.py) ran a same-tx membership COUNT
   then DELETE under READ COMMITTED with no parent-row lock. A
   concurrent register_website from another org could insert a fresh
   membership row between the COUNT and the DELETE; the FK CASCADE
   on website_org_memberships(domain) would then silently wipe the
   new org's just-inserted membership. Take a row-level lock via
   SELECT ... FROM websites WHERE domain = $1 FOR UPDATE before the
   COUNT, forcing concurrent register_website's ON CONFLICT DO UPDATE
   to serialize.

2. register_website returned the request's scan_interval verbatim even
   though ON CONFLICT preserves the originally-stored value (first-org
   sets cadence). A second org joining the same domain with a different
   cadence saw their input echoed while the scheduler kept running on
   the first org's interval — silent contract drift. Surface the
   stored value via RETURNING and echo it back through the router.
---
 services/crawler/app/routers/websites.py      | 14 ++++++--
 .../crawler/app/services/pg_website_store.py  | 34 ++++++++++++++++---
 2 files changed, 41 insertions(+), 7 deletions(-)

diff --git a/services/crawler/app/routers/websites.py b/services/crawler/app/routers/websites.py
index a51ea850a9..1e4ce03f2a 100644
--- a/services/crawler/app/routers/websites.py
+++ b/services/crawler/app/routers/websites.py
@@ -97,10 +97,20 @@ async def register_website(request: RegisterWebsiteRequest, http_request: Reques
         if result.get("first_membership"):
             trigger_scan()
 
+        # Echo the *stored* scan_interval, not the request's. ON CONFLICT
+        # preserves the existing cadence (P1-22 / round-3 P1) so the second
+        # org to register a domain with a different cadence would otherwise
+        # be told their value was accepted when in fact the first org's
+        # cadence remains in force.
+        stored_interval = int(
+            result.get("scan_interval", request.scan_interval),
+        )
         return WebsiteInfoResponse(
             domain=request.domain,
-            status="scanning" if result.get("first_membership") else (website.get("status") if website else "idle"),
-            scan_interval=request.scan_interval,
+            status="scanning"
+            if result.get("first_membership")
+            else (website.get("status") if website else result.get("status", "idle")),
+            scan_interval=stored_interval,
         )
     except HTTPException:
         raise
diff --git a/services/crawler/app/services/pg_website_store.py b/services/crawler/app/services/pg_website_store.py
index 5723e6a5f7..c5a8d42eae 100644
--- a/services/crawler/app/services/pg_website_store.py
+++ b/services/crawler/app/services/pg_website_store.py
@@ -280,7 +280,12 @@ async def register_website(
             # to 'idle' if a stuck-delete recovery left it at 'deleting'
             # — a fresh registration is a clear signal the domain is
             # wanted again (round-2 P1-23).
-            await conn.execute(
+            # ON CONFLICT preserves the existing scan_interval — first-org
+            # sets cadence; subsequent orgs joining a tracked domain keep
+            # whatever the first org configured (P1-22). RETURNING surfaces
+            # the *stored* scan_interval so the caller can echo the truth
+            # back instead of repeating the request param (round-3 P1).
+            website_row = await conn.fetchrow(
                 """INSERT INTO websites (domain, scan_interval, created_at, updated_at)
                    VALUES ($1, $2, NOW(), NOW())
                    ON CONFLICT(domain) DO UPDATE SET
@@ -288,10 +293,13 @@ async def register_website(
                        WHEN websites.status = 'deleting' THEN 'idle'
                        ELSE websites.status
                      END,
-                     updated_at = NOW()""",
+                     updated_at = NOW()
+                   RETURNING scan_interval, status""",
                 domain,
                 scan_interval,
             )
+            stored_scan_interval = int(website_row["scan_interval"]) if website_row else scan_interval
+            stored_status = str(website_row["status"]) if website_row else "idle"
             # ON CONFLICT DO NOTHING — re-registering from the same org is a no-op.
             # `xmax = 0` is true on a row INSERTed in this command; non-zero on
             # an existing row that hit ON CONFLICT. We use it to tell the caller
@@ -311,16 +319,17 @@ async def register_website(
             )
         first_membership = membership_inserted and total_members == 1
         logger.info(
-            "Registered website: %s for org=%s (interval=%ss, first_membership=%s)",
+            "Registered website: %s for org=%s (requested_interval=%ss, stored_interval=%ss, first_membership=%s)",
             domain,
             org_slug,
             scan_interval,
+            stored_scan_interval,
             first_membership,
         )
         return {
             "domain": domain,
-            "scan_interval": scan_interval,
-            "status": "idle",
+            "scan_interval": stored_scan_interval,
+            "status": stored_status,
             "first_membership": first_membership,
         }
 
@@ -412,9 +421,24 @@ async def execute_delete(self, domain: str) -> None:
         (whose ON CONFLICT now resets status='idle' — see P1-22 fix
         above). If any membership now exists, abort the DELETE rather
         than CASCADE-killing the new org's content.
+
+        Round-3 P1: take a row-level lock on the parent `websites` row
+        before the membership COUNT. Without it the COUNT runs under
+        READ COMMITTED with no lock held, so a concurrent
+        `register_website` from a different org could insert a fresh
+        membership row between this COUNT and the DELETE — the CASCADE
+        FK on `website_org_memberships(domain)` would then silently
+        wipe the new org's just-inserted membership. SELECT … FOR UPDATE
+        forces the concurrent INSERT's ON CONFLICT DO UPDATE on
+        `websites` to block until we commit (or roll back, in which
+        case the membership remains and our COUNT sees it).
         """
         async with acquire_with_retry(self._pool) as conn, conn.transaction():
             await conn.execute("SET LOCAL statement_timeout = '120s'")
+            await conn.execute(
+                "SELECT 1 FROM websites WHERE domain = $1 FOR UPDATE",
+                domain,
+            )
             remaining = await conn.fetchval(
                 "SELECT COUNT(*) FROM website_org_memberships WHERE domain = $1",
                 domain,

From bf9ad85ebb70fd78b12b2adfce173a4fb97b6b09 Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Fri, 29 May 2026 01:44:51 +0800
Subject: [PATCH 20/41] fix(platform): align org-slug length cap across TS /
 Convex / UI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ORG_SLUG_REGEX was unbounded (/^[a-z0-9][a-z0-9_-]*$/) while the
Python validator at packages/tale_shared/.../org_slug.py is capped at
64 chars (/^[a-z0-9][a-z0-9_-]{0,63}$/, fullmatch). A long display
name went through deriveOrgSlug → Better Auth → organization row with
no length check; the first RAG / crawler call then 400'd on
require_org_slug forever — the org was bricked with no recovery.

- Add MAX_ORG_SLUG_LENGTH = 64 export.
- Tighten ORG_SLUG_REGEX to {0,63} so assertValidOrgSlug
  (called from beforeCreateOrganization + beforeUpdateOrganization
  in convex/auth.ts) rejects long slugs at the Better Auth hook.
- Truncate deriveOrgSlug in organization-form.tsx so the UI
  preview matches what will actually be persisted.

All four layers (TS regex, Convex hook, Python validator, UI form)
now enforce the same 64-char ceiling.
---
 .../components/organization-form.tsx             |  9 +++++++--
 .../platform/lib/shared/constants/org-slug.ts    | 16 ++++++++++------
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/services/platform/app/features/organization/components/organization-form.tsx b/services/platform/app/features/organization/components/organization-form.tsx
index 0dc1d23938..d60685a339 100644
--- a/services/platform/app/features/organization/components/organization-form.tsx
+++ b/services/platform/app/features/organization/components/organization-form.tsx
@@ -20,6 +20,7 @@ import { toast } from '@/app/hooks/use-toast';
 import { api } from '@/convex/_generated/api';
 import { authClient } from '@/lib/auth-client';
 import { useT } from '@/lib/i18n/client';
+import { MAX_ORG_SLUG_LENGTH } from '@/lib/shared/constants/org-slug';
 import { isReservedOrgSlug } from '@/lib/shared/constants/reserved-org-slugs';
 
 import { useInitializeDefaultWorkflows } from '../hooks/actions';
@@ -35,14 +36,18 @@ type FormData = { name: string };
  *
  * Must produce a slug that matches
  * `services/platform/lib/shared/constants/org-slug.ts` ORG_SLUG_REGEX —
- * see `assertValidOrgSlug`.
+ * see `assertValidOrgSlug`. Truncates to `MAX_ORG_SLUG_LENGTH` so a
+ * long display name doesn't mint a slug that RAG/crawler's Python
+ * validator (capped at 64 chars) would reject — that path causes
+ * total feature loss for the org with no recovery.
  */
 function deriveOrgSlug(name: string): string {
   return name
     .trim()
     .toLowerCase()
     .replace(/[^a-z0-9]+/g, '-')
-    .replace(/^-+|-+$/g, '');
+    .replace(/^-+|-+$/g, '')
+    .slice(0, MAX_ORG_SLUG_LENGTH);
 }
 
 export function OrganizationForm() {
diff --git a/services/platform/lib/shared/constants/org-slug.ts b/services/platform/lib/shared/constants/org-slug.ts
index 3f94ce140a..ac44715457 100644
--- a/services/platform/lib/shared/constants/org-slug.ts
+++ b/services/platform/lib/shared/constants/org-slug.ts
@@ -11,22 +11,26 @@
  * Rules:
  *   - Must start with a lowercase letter or digit
  *   - Body may include lowercase letters, digits, `_`, `-`
- *   - `'default'` is allowed as the reserved platform-seed org slug
- *     even though every other check would still pass it; the explicit
- *     short-circuit documents the invariant.
+ *   - Length capped at {@link MAX_ORG_SLUG_LENGTH} (64) — the Python
+ *     validator at `packages/tale_shared/.../org_slug.py` enforces
+ *     `{0,63}` (≤64 total). Allowing longer slugs here would mint
+ *     organizations that RAG/crawler refuse, locking those services
+ *     out for the org permanently.
  */
-export const ORG_SLUG_REGEX = /^[a-z0-9][a-z0-9_-]*$/;
+export const MAX_ORG_SLUG_LENGTH = 64;
+export const ORG_SLUG_REGEX = /^[a-z0-9][a-z0-9_-]{0,63}$/;
 
 /** Soft check — does NOT throw. Returns true for valid slugs. */
 export function isValidOrgSlug(slug: string): boolean {
-  return slug === 'default' || ORG_SLUG_REGEX.test(slug);
+  return slug.length <= MAX_ORG_SLUG_LENGTH && ORG_SLUG_REGEX.test(slug);
 }
 
 /** Hard check — throws `Error` with a uniform message on invalid input. */
 export function assertValidOrgSlug(slug: string): void {
   if (!isValidOrgSlug(slug)) {
     throw new Error(
-      `Invalid org slug "${slug}". Must match ${ORG_SLUG_REGEX.source}.`,
+      `Invalid org slug "${slug}". Must match ${ORG_SLUG_REGEX.source} ` +
+        `(max ${MAX_ORG_SLUG_LENGTH} chars).`,
     );
   }
 }

From a38244a6caa675c45c2c5ee6d88e5bc2fcc31fd9 Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Fri, 29 May 2026 01:47:26 +0800
Subject: [PATCH 21/41] fix(cli): migrate-config-layout now also reorganizes
 host project layout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously the migrate script ran only inside the convex container against
\$DATA — it never touched the operator's host project directory. But
\`tale start\` and \`tale deploy\` hard-fail when they detect legacy per-
domain dirs (agents/, workflows/, …) at the project root and point the
operator at \`tale migrate config-layout\`. Running the suggested fix
left those dirs in place; the operator would re-run \`tale start\` and
hit the same error — a deadlock with no documented escape.

New host-layout phase runs first (in the TypeScript wrapper, not the
docker'd bash):
- Detects each LEGACY_DOMAIN_DIR_NAMES dir at the project root.
- Atomically renames it to \`default/<dir>/\` (same-fs rename(2)).
- Refuses to overwrite a populated \`default/<dir>/\` — records a
  conflict the operator must resolve.
- Supports --dry-run.
- --cleanup-old skips the host phase (rename is destructive, no old to
  clean).

Also fixes the docs/<locale>/ placeholder in two error messages
(start.ts and migrate-config-layout.ts) — render docs/en/ literally
since these messages are operator-facing CLI output, not translated
copy.
---
 tools/cli/src/commands/migrate.ts             |   4 +-
 .../src/lib/actions/migrate-config-layout.ts  | 163 +++++++++++++++++-
 tools/cli/src/lib/actions/start.ts            |   2 +-
 3 files changed, 166 insertions(+), 3 deletions(-)

diff --git a/tools/cli/src/commands/migrate.ts b/tools/cli/src/commands/migrate.ts
index 6b1cb4175f..56d498a617 100644
--- a/tools/cli/src/commands/migrate.ts
+++ b/tools/cli/src/commands/migrate.ts
@@ -26,10 +26,12 @@ export function createMigrateCommand(): Command {
     )
     .action(async (opts: { dryRun?: boolean; cleanupOld?: boolean }) => {
       try {
-        await resolveProjectContext(requireProject());
+        const projectDir = requireProject();
+        await resolveProjectContext(projectDir);
         await migrateConfigLayout({
           dryRun: opts.dryRun ?? false,
           cleanupOld: opts.cleanupOld ?? false,
+          projectDir,
         });
       } catch (err) {
         logger.error(err instanceof Error ? err.message : String(err));
diff --git a/tools/cli/src/lib/actions/migrate-config-layout.ts b/tools/cli/src/lib/actions/migrate-config-layout.ts
index fe5d02de64..c45a25735b 100644
--- a/tools/cli/src/lib/actions/migrate-config-layout.ts
+++ b/tools/cli/src/lib/actions/migrate-config-layout.ts
@@ -17,14 +17,129 @@
  *      (sha-verifies new == old, then unlinks the olds)
  */
 
+import { existsSync } from 'node:fs';
+import { mkdir, rename, stat } from 'node:fs/promises';
+import { join } from 'node:path';
+
 import { getProjectId } from '../../utils/load-env';
 import * as logger from '../../utils/logger';
 import { exec } from '../docker/exec';
 import { isContainerRunning } from '../docker/is-container-running';
+import { LEGACY_DOMAIN_DIR_NAMES } from './deploy';
 
 interface MigrateConfigLayoutOptions {
   dryRun: boolean;
   cleanupOld: boolean;
+  /**
+   * Project root on the host. Used by the host-layout phase to relocate
+   * legacy per-domain dirs (`agents/`, `workflows/`, …) into
+   * `default/<dir>/`. Optional only because some callers may want to
+   * skip the host phase (none do today); when omitted we use the
+   * current working directory.
+   */
+  projectDir?: string;
+}
+
+interface HostLayoutResult {
+  /** Dirs already in the new location (no action). */
+  alreadyMigrated: string[];
+  /** Dirs the script moved (or would move in dry-run). */
+  migrated: string[];
+  /** Dirs that couldn't be moved because the destination already exists with content. */
+  conflicts: string[];
+}
+
+/**
+ * Relocate legacy host-side per-domain dirs into `<projectDir>/default/`.
+ *
+ * Background: `tale start` / `tale deploy` hard-fail when any of
+ * `LEGACY_DOMAIN_DIR_NAMES` sits at the project root, and point the
+ * operator at `tale migrate config-layout`. Previously the migrate
+ * script only touched the convex container's `$DATA` — operators
+ * following the runbook would re-run `tale start` and hit the same
+ * error (a deadlock). This phase fixes that by also reorganizing host
+ * files in lockstep.
+ *
+ * Safety:
+ *   - Atomic `rename(2)` within the same filesystem (project root +
+ *     `default/` are siblings, so no cross-device copy hazard).
+ *   - Refuses to overwrite an existing populated `default/<dir>/` —
+ *     records a conflict the operator must resolve.
+ *   - Dry-run prints the plan without writing.
+ *   - `--cleanup-old` is a no-op for the host phase (rename is
+ *     destructive — there is no "old" to clean up).
+ */
+async function migrateHostLayout(
+  projectDir: string,
+  options: { dryRun: boolean },
+): Promise<HostLayoutResult> {
+  const result: HostLayoutResult = {
+    alreadyMigrated: [],
+    migrated: [],
+    conflicts: [],
+  };
+  const defaultDir = join(projectDir, 'default');
+
+  for (const name of LEGACY_DOMAIN_DIR_NAMES) {
+    const src = join(projectDir, name);
+    const dst = join(defaultDir, name);
+    if (!existsSync(src)) continue;
+    if (existsSync(dst)) {
+      // Both old and new locations exist. We do not attempt a merge —
+      // the operator must reconcile (typically by inspecting `dst` and
+      // `rm -rf`-ing the stale side).
+      result.conflicts.push(
+        `${name}/: both ${src} and ${dst} exist; manual reconcile required`,
+      );
+      continue;
+    }
+    if (options.dryRun) {
+      result.migrated.push(name);
+      continue;
+    }
+    // Ensure the parent `default/` exists before the rename. Same-fs
+    // rename so this is atomic per POSIX.
+    await mkdir(defaultDir, { recursive: true });
+    try {
+      await rename(src, dst);
+      result.migrated.push(name);
+    } catch (err) {
+      // Cross-device rename (EXDEV) shouldn't happen because src and
+      // dst share a parent, but a stale dst inode could also surface
+      // EEXIST or EPERM here. Record + continue rather than abort the
+      // whole migration.
+      result.conflicts.push(
+        `${name}/: rename failed (${err instanceof Error ? err.message : String(err)})`,
+      );
+    }
+  }
+  // Detect a fully-already-migrated layout for diagnostics. We can't
+  // distinguish "operator never had legacy dirs" from "operator already
+  // migrated", so this is informational only.
+  for (const name of LEGACY_DOMAIN_DIR_NAMES) {
+    const dst = join(defaultDir, name);
+    if (existsSync(dst) && !existsSync(join(projectDir, name))) {
+      result.alreadyMigrated.push(name);
+    }
+  }
+  // Best-effort empty-defaultDir cleanup: if we created `default/` but
+  // every probe missed (rename failed, dry-run), don't leave an empty
+  // dir behind.
+  try {
+    if (
+      result.migrated.length === 0 &&
+      existsSync(defaultDir) &&
+      (await stat(defaultDir)).isDirectory()
+    ) {
+      // No-op — left intentionally; an empty `default/` is harmless and
+      // future scaffold ops will populate it.
+    }
+  } catch (err) {
+    logger.warn(
+      `Could not stat ${defaultDir}: ${err instanceof Error ? err.message : err}`,
+    );
+  }
+  return result;
 }
 
 /**
@@ -264,7 +379,53 @@ export async function migrateConfigLayout(
   options: MigrateConfigLayoutOptions,
 ): Promise<void> {
   const { dryRun, cleanupOld } = options;
+  const projectDir = options.projectDir ?? process.cwd();
+
+  // ---------------------------------------------------------------------------
+  // Phase 1 — host layout
+  //
+  // Container-side migration would leave the operator stuck if they don't
+  // also fix the host project layout (which `tale start` + `tale deploy`
+  // refuse to push). Run host first so the runbook in the error message
+  // from those two commands actually fixes what they detected.
+  // `--cleanup-old` skips the host phase: rename is destructive, there's
+  // no "old" to clean.
+  // ---------------------------------------------------------------------------
+  if (!cleanupOld) {
+    logger.blank();
+    logger.step(
+      dryRun
+        ? '[DRY-RUN] Host layout: would move legacy per-domain dirs into default/'
+        : 'Host layout: moving legacy per-domain dirs into default/...',
+    );
+    const hostResult = await migrateHostLayout(projectDir, { dryRun });
+    if (hostResult.migrated.length > 0) {
+      for (const name of hostResult.migrated) {
+        logger.info(
+          dryRun
+            ? `  HOST_PLAN: mv ${name}/ default/${name}/`
+            : `  OK: mv ${name}/ default/${name}/`,
+        );
+      }
+    } else if (hostResult.alreadyMigrated.length > 0) {
+      logger.info('  Host layout already migrated.');
+    } else {
+      logger.info('  No legacy host dirs found.');
+    }
+    if (hostResult.conflicts.length > 0) {
+      logger.warn(
+        `Host-layout conflicts (require manual reconciliation):\n  - ${hostResult.conflicts.join('\n  - ')}`,
+      );
+      throw new Error(
+        `tale migrate config-layout: ${hostResult.conflicts.length} host-layout conflict(s); ` +
+          'resolve them and re-run. See docs/en/self-hosted/operate/upgrades.md.',
+      );
+    }
+  }
 
+  // ---------------------------------------------------------------------------
+  // Phase 2 — container layout (providers/*.secrets.json under $DATA)
+  // ---------------------------------------------------------------------------
   const containerName = `${getProjectId()}-convex`;
   if (!(await isContainerRunning(containerName))) {
     // Earlier the message said "e.g. `tale deploy`", but `tale deploy`
@@ -276,7 +437,7 @@ export async function migrateConfigLayout(
       `Convex container "${containerName}" is not running. ` +
         'Start the OLD platform first (`tale start` or `docker compose start convex`) ' +
         'so the migrate script can run against the still-mounted volume, then re-run ' +
-        '`tale migrate config-layout`. See docs/<locale>/self-hosted/operate/upgrades.md ' +
+        '`tale migrate config-layout`. See docs/en/self-hosted/operate/upgrades.md ' +
         'for the full migrate → deploy → cleanup runbook.',
     );
   }
diff --git a/tools/cli/src/lib/actions/start.ts b/tools/cli/src/lib/actions/start.ts
index 97b35cb38e..fa81189a31 100644
--- a/tools/cli/src/lib/actions/start.ts
+++ b/tools/cli/src/lib/actions/start.ts
@@ -165,7 +165,7 @@ export async function start(options: StartOptions): Promise<void> {
         .join(', ')}\n` +
         '  The org-first layout expects these under `default/<domain>/` (or another org subtree).\n' +
         '  Migrate with: `tale migrate config-layout` then `tale deploy --override-all -y`.\n' +
-        '  See docs/<locale>/self-hosted/operate/upgrades.md for the full runbook.',
+        '  See docs/en/self-hosted/operate/upgrades.md for the full runbook.',
     );
   }
 

From 025fde81956440eb00f393aec6f54f363d5f1740 Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Fri, 29 May 2026 01:48:45 +0800
Subject: [PATCH 22/41] fix(cli): admin-key redactor leaks pipe-delimited
 Convex payload
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ADMIN_KEY_RE's value charset was [A-Za-z0-9+/=._-] — it stopped at the
first `|`. Self-hosted Convex admin keys are formatted
`<INSTANCE_NAME>|<base64-payload>` (Convex generate_key contract). When
reseed-all-orgs hit the failure or unparseable-success log paths, the
redactor truncated the key at the pipe and emitted
`Admin Key: <redacted>|<actual-secret-payload>` to operator stderr/CI.

Add `|` to the charset. New test asserts the full pipe-delimited shape
collapses to `Admin Key: <redacted>` with no payload survival.
---
 tools/cli/src/lib/actions/reseed-all-orgs.test.ts | 13 +++++++++++++
 tools/cli/src/lib/actions/reseed-all-orgs.ts      | 11 ++++++++---
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/tools/cli/src/lib/actions/reseed-all-orgs.test.ts b/tools/cli/src/lib/actions/reseed-all-orgs.test.ts
index 6b57fdeb53..4f99d0648d 100644
--- a/tools/cli/src/lib/actions/reseed-all-orgs.test.ts
+++ b/tools/cli/src/lib/actions/reseed-all-orgs.test.ts
@@ -50,4 +50,17 @@ describe('redactAdminKey', () => {
     const input = 'Admin Key: TBD';
     expect(redactAdminKey(input)).toBe('Admin Key: TBD');
   });
+
+  test('redacts the full pipe-delimited self-hosted Convex admin key', () => {
+    // Convex self-hosted generate_key emits `<INSTANCE_NAME>|<base64-payload>`.
+    // Previously the charset excluded `|`, so the regex stopped at the
+    // first pipe and left the secret payload exposed.
+    const payload = 'aGVsbG8gd29ybGQK==';
+    const input = `Admin Key: tale_platform|01abcdef-secret-${payload}`;
+    const out = redactAdminKey(input);
+    expect(out).toBe('Admin Key: <redacted>');
+    expect(out).not.toContain('tale_platform');
+    expect(out).not.toContain(payload);
+    expect(out).not.toContain('01abcdef');
+  });
 });
diff --git a/tools/cli/src/lib/actions/reseed-all-orgs.ts b/tools/cli/src/lib/actions/reseed-all-orgs.ts
index 37805b7bc0..48b89eb1fb 100644
--- a/tools/cli/src/lib/actions/reseed-all-orgs.ts
+++ b/tools/cli/src/lib/actions/reseed-all-orgs.ts
@@ -85,10 +85,15 @@ HOME=/home/app timeout ${RESEED_TIMEOUT_S} bunx convex run \\
  * If anything upstream (env.sh's diagnostic mode, a future Convex CLI
  * banner, etc.) prints an admin-key line that slips past the bash grep,
  * this regex strips it before the value reaches the logger. Case-
- * insensitive, anchors on any leading whitespace, and is intentionally
- * conservative on the value charset (admin keys are base64/hex-like).
+ * insensitive, anchors on any leading whitespace.
+ *
+ * Charset includes `|` because self-hosted Convex admin keys are
+ * formatted `<INSTANCE_NAME>|<base64-payload>` (e.g.
+ * `tale_platform|01abc...`). Without the pipe the regex matched only
+ * up to the first `|`, leaving the secret payload after it in the
+ * logged stream (round-3 P1-adjacent secret leak).
  */
-const ADMIN_KEY_RE = /\b([Aa]dmin\s+[Kk]ey)\s*:?\s*[A-Za-z0-9+/=._-]{12,}/g;
+const ADMIN_KEY_RE = /\b([Aa]dmin\s+[Kk]ey)\s*:?\s*[A-Za-z0-9+/=._\-|]{12,}/g;
 
 export function redactAdminKey(text: string): string {
   return text.replace(ADMIN_KEY_RE, '$1: <redacted>');

From 38c47cf5c62156d436084b474447f9b6158e26de Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Fri, 29 May 2026 01:54:15 +0800
Subject: [PATCH 23/41] =?UTF-8?q?fix(platform):=20p2=20hardening=20?=
 =?UTF-8?q?=E2=80=94=20auth,=20http,=20lib,=20cascade,=20errors?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- assertValidOrgSlug now wraps in APIError('BAD_REQUEST') at both
  Better Auth hooks so invalid input returns 400 instead of 500.
- orgSlugFromId carries a stronger doc warning: it does NOT verify
  membership; callers must pre-verify or trust the source. Rename
  deferred (185+ call sites — too risky for this PR).
- /api/sse/auth comment corrected (256 is a soft cap, not a hard
  limit) and we now warn when truncation actually hits.
- ragFetch sets x-tale-org from the trimmed slug so accidental
  whitespace doesn't ride into RAG's filesystem lookup. JSDoc
  corrected: content + compare-by-id + compare-files all require
  org_slug per the RAG router (verified against documents.py:475,518,564
  and search.py).
- generateAgentResponse wraps orgSlugFromId in try/catch; a transient
  lookup miss degrades to "skip knowledge context" instead of aborting
  the whole response (matches the guardrails-resolve pattern).
- cascade_helpers resolves orgSlug BEFORE the storage.delete loop.
  Previously a slug-lookup failure landed AFTER blobs were already
  out-of-band deleted, leaking RAG chunks with no purge scheduled.
- parseRetryAfterMs caps at 24h (MAX_RETRY_AFTER_MS); a malicious /
  buggy upstream sending '1e10' or a far-future HTTP date would
  otherwise pin scheduler backoff at ~317 years. New tests cover
  scientific-notation, oversized-seconds, far-future-date branches.
---
 services/platform/convex/auth.ts              | 19 ++++++++-
 services/platform/convex/http.ts              | 13 +++++-
 .../lib/agent_response/generate_response.ts   | 41 +++++++++++++------
 .../__tests__/upstream_http_error.test.ts     | 16 ++++++++
 .../convex/lib/errors/upstream_http_error.ts  | 17 ++++++--
 .../platform/convex/lib/helpers/org_slug.ts   | 16 +++++++-
 .../platform/convex/lib/helpers/rag_config.ts | 22 ++++++----
 .../convex/threads/cascade_helpers.ts         | 23 +++++++++--
 8 files changed, 134 insertions(+), 33 deletions(-)

diff --git a/services/platform/convex/auth.ts b/services/platform/convex/auth.ts
index 542592128b..92658ef6fc 100644
--- a/services/platform/convex/auth.ts
+++ b/services/platform/convex/auth.ts
@@ -586,7 +586,16 @@ export const getAuthOptions = (ctx: GenericCtx<DataModel>) => {
             // Reject anything that doesn't fit the canonical slug shape
             // so users can't smuggle invalid filesystem characters or
             // length-cap-busting strings past the auth boundary.
-            assertValidOrgSlug(normalizedSlug);
+            // assertValidOrgSlug throws plain Error; wrap as
+            // APIError('BAD_REQUEST') so Better Auth surfaces 400 to the
+            // client rather than 500 (round-3 P2 R1-P2-a).
+            try {
+              assertValidOrgSlug(normalizedSlug);
+            } catch (err) {
+              throw new APIError('BAD_REQUEST', {
+                message: err instanceof Error ? err.message : String(err),
+              });
+            }
 
             // Refuse reserved slugs ("default") that the platform pins
             // global resources to (branding, retention defaults).
@@ -646,7 +655,13 @@ export const getAuthOptions = (ctx: GenericCtx<DataModel>) => {
             const rawSlug = orgPatch.slug;
             if (typeof rawSlug !== 'string') return;
             const normalizedSlug = rawSlug.toLowerCase();
-            assertValidOrgSlug(normalizedSlug);
+            try {
+              assertValidOrgSlug(normalizedSlug);
+            } catch (err) {
+              throw new APIError('BAD_REQUEST', {
+                message: err instanceof Error ? err.message : String(err),
+              });
+            }
             if (isReservedOrgSlug(normalizedSlug)) {
               throw new APIError('BAD_REQUEST', {
                 message: `Organization slug "${normalizedSlug}" is reserved by the platform.`,
diff --git a/services/platform/convex/http.ts b/services/platform/convex/http.ts
index a0b8b7ee7d..32175d2c5a 100644
--- a/services/platform/convex/http.ts
+++ b/services/platform/convex/http.ts
@@ -334,7 +334,10 @@ http.route({
       components.betterAuth.adapter.findMany,
       {
         model: 'member',
-        // Cap matches the platform's hard limit on per-user org membership.
+        // 256 is a soft cap — there is no hard platform-side limit on
+        // per-user memberships. Anyone with >256 active memberships is
+        // an operator / service account, not a regular subject. We log
+        // when we hit the cap below so silent truncation is observable.
         paginationOpts: { cursor: null, numItems: 256 },
         where: [{ field: 'userId', value: session.user.id, operator: 'eq' }],
       },
@@ -343,6 +346,14 @@ http.route({
     const memberRows: unknown[] = Array.isArray(memberships?.page)
       ? memberships.page
       : [];
+    if (memberRows.length === 256) {
+      // Surface the soft-cap truncation so an operator with >256 memberships
+      // notices instead of silently losing SSE coverage for the excess orgs.
+      console.warn(
+        '[/api/sse/auth] hit 256-membership soft cap for user; some orgs may be silently truncated',
+        { userId: session.user.id },
+      );
+    }
     // Drop rows where the user is soft-removed via `role = 'disabled'`
     // (matches the canonical filter in lib/rls/organization/get_user_organizations.ts).
     // Without this filter, a disabled member keeps receiving SSE file events
diff --git a/services/platform/convex/lib/agent_response/generate_response.ts b/services/platform/convex/lib/agent_response/generate_response.ts
index 9c85eec377..aba1ee9a5b 100644
--- a/services/platform/convex/lib/agent_response/generate_response.ts
+++ b/services/platform/convex/lib/agent_response/generate_response.ts
@@ -704,19 +704,34 @@ export async function generateAgentResponse(
       if (accessibleFileIds.length === 0) {
         debugLog('No accessible RAG documents, skipping knowledge context');
       } else {
-        const orgSlug = await orgSlugFromId(ctx, organizationId);
-        knowledgeContextPromise = queryRagContext(
-          promptMessage,
-          undefined,
-          undefined,
-          undefined,
-          undefined,
-          { fileIds: accessibleFileIds, orgSlug },
-        );
-        debugLog('Knowledge context query started', {
-          threadId,
-          elapsedMs: Date.now() - startTime,
-        });
+        // Resolve slug defensively: a transient lookup miss (org row
+        // deleted between membership check and here, replica skew) should
+        // degrade gracefully — skip knowledge context — rather than abort
+        // the entire response generation. Matches the guardrails-resolve
+        // pattern lower in this file.
+        let orgSlug: string | undefined;
+        try {
+          orgSlug = await orgSlugFromId(ctx, organizationId);
+        } catch (err) {
+          console.warn(
+            '[generateAgentResponse] orgSlugFromId failed; skipping knowledge context',
+            err instanceof Error ? err.message : err,
+          );
+        }
+        if (orgSlug) {
+          knowledgeContextPromise = queryRagContext(
+            promptMessage,
+            undefined,
+            undefined,
+            undefined,
+            undefined,
+            { fileIds: accessibleFileIds, orgSlug },
+          );
+          debugLog('Knowledge context query started', {
+            threadId,
+            elapsedMs: Date.now() - startTime,
+          });
+        }
       }
     }
 
diff --git a/services/platform/convex/lib/errors/__tests__/upstream_http_error.test.ts b/services/platform/convex/lib/errors/__tests__/upstream_http_error.test.ts
index c805781c37..9f219ce9f0 100644
--- a/services/platform/convex/lib/errors/__tests__/upstream_http_error.test.ts
+++ b/services/platform/convex/lib/errors/__tests__/upstream_http_error.test.ts
@@ -166,6 +166,22 @@ describe('UpstreamHttpError', () => {
     expect(err.retryAfterMs).toBe(30000);
   });
 
+  it('caps Retry-After at 24h to defend against absurd upstream values', () => {
+    const ONE_DAY_MS = 24 * 60 * 60 * 1000;
+    // Scientific-notation finite value: 1e10 seconds ≈ 317 years.
+    expect(parseRetryAfterMs('1e10')).toBe(ONE_DAY_MS);
+    // Plain too-large seconds.
+    expect(parseRetryAfterMs('999999999')).toBe(ONE_DAY_MS);
+    // Far-future HTTP date.
+    const farFuture = new Date(
+      Date.now() + 1000 * 60 * 60 * 24 * 365,
+    ).toUTCString();
+    expect(parseRetryAfterMs(farFuture)).toBe(ONE_DAY_MS);
+    // Past HTTP date still clamps to 0 (unchanged).
+    const past = new Date(Date.now() - 60_000).toUTCString();
+    expect(parseRetryAfterMs(past)).toBe(0);
+  });
+
   it('defaults endpoint to response.url when caller omits it', () => {
     const err = UpstreamHttpError.fromResponse(
       'rag',
diff --git a/services/platform/convex/lib/errors/upstream_http_error.ts b/services/platform/convex/lib/errors/upstream_http_error.ts
index 7cb3e2b9f9..e4dc3312b3 100644
--- a/services/platform/convex/lib/errors/upstream_http_error.ts
+++ b/services/platform/convex/lib/errors/upstream_http_error.ts
@@ -43,11 +43,21 @@ export function isRetryableStatus(status: number): boolean {
   return status === 408 || status === 429 || (status >= 500 && status < 600);
 }
 
+/**
+ * Cap parsed Retry-After delays to a sane ceiling so a malicious /
+ * misconfigured upstream cannot pin our scheduler backoff at an absurd
+ * future. `Number('1e10')` is a finite non-negative number and would
+ * otherwise produce ~317 years of delay; capping at 24h keeps both
+ * "seconds" and "HTTP-date" branches bounded.
+ */
+export const MAX_RETRY_AFTER_MS = 24 * 60 * 60 * 1000;
+
 /**
  * Parse the upstream `Retry-After` header into milliseconds. Supports
  * both the integer-seconds and HTTP-date forms per RFC 9110 §10.2.3.
  * Returns `undefined` when the header is missing or unparseable so
- * callers can fall back to a default backoff.
+ * callers can fall back to a default backoff. Capped at
+ * {@link MAX_RETRY_AFTER_MS}.
  */
 export function parseRetryAfterMs(value: string | null): number | undefined {
   if (!value) return undefined;
@@ -55,12 +65,13 @@ export function parseRetryAfterMs(value: string | null): number | undefined {
   if (!trimmed) return undefined;
   const asInt = Number(trimmed);
   if (Number.isFinite(asInt) && asInt >= 0) {
-    return Math.round(asInt * 1000);
+    return Math.min(Math.round(asInt * 1000), MAX_RETRY_AFTER_MS);
   }
   const asDate = Date.parse(trimmed);
   if (!Number.isNaN(asDate)) {
     const delta = asDate - Date.now();
-    return delta > 0 ? delta : 0;
+    if (delta <= 0) return 0;
+    return Math.min(delta, MAX_RETRY_AFTER_MS);
   }
   return undefined;
 }
diff --git a/services/platform/convex/lib/helpers/org_slug.ts b/services/platform/convex/lib/helpers/org_slug.ts
index 61f8366274..22f679c7fd 100644
--- a/services/platform/convex/lib/helpers/org_slug.ts
+++ b/services/platform/convex/lib/helpers/org_slug.ts
@@ -22,8 +22,20 @@ type CtxWithRunQuery = {
 /**
  * Resolve an organizationId to its slug via Better Auth.
  *
- * Throws if no matching org row exists — callers should ensure the
- * organizationId came from a verified-membership check upstream.
+ * **This helper does NOT verify caller membership.** It is purely an
+ * id → slug lookup that succeeds for any organization row that exists.
+ * Callers must ensure `organizationId` came from a verified-membership
+ * check upstream (e.g. `requireOrgMembership`, `requireOrgMembershipById`,
+ * `getOrganizationMember`, or a server-side context whose
+ * `organizationId` is trusted by construction).
+ *
+ * **Never** call this with an `organizationId` taken directly from
+ * a request body / argument without first verifying membership — that
+ * would let a member of org A pass org B's id and silently obtain
+ * org B's slug, then use it as the `X-Tale-Org` header on a downstream
+ * RAG/crawler call.
+ *
+ * Throws if no matching org row exists, or if the row has no slug.
  */
 export async function orgSlugFromId(
   ctx: CtxWithRunQuery,
diff --git a/services/platform/convex/lib/helpers/rag_config.ts b/services/platform/convex/lib/helpers/rag_config.ts
index f2838f8033..116d9d72e8 100644
--- a/services/platform/convex/lib/helpers/rag_config.ts
+++ b/services/platform/convex/lib/helpers/rag_config.ts
@@ -199,12 +199,15 @@ export function _resetRagConfigForTests(): void {
  * Sets `Authorization: Bearer ${authToken}` when `RAG_AUTH_TOKEN` is
  * configured; otherwise sends no Authorization header (RAG runs open).
  *
- * `orgSlug` is required for endpoints whose service-side handler reads
- * the org's provider catalog (search, generate, upload, compare-files).
- * The RAG service enforces this via per-router `Depends(require_org_slug)`,
- * so callers MUST pass `orgSlug` for those endpoints — a missing header
- * yields 400 from RAG. Status / delete / content / compare-by-id
- * endpoints are org-agnostic and accept calls without the header.
+ * `orgSlug` is required for ALL endpoints whose service-side handler
+ * scopes by tenant. Verified against the routers at
+ * `services/rag/app/routers/documents.py` and `.../search.py`: every
+ * non-health endpoint declares `org_slug: str = Depends(require_org_slug)`
+ * — including `/documents/{file_id}/content` (read), `/documents/compare`
+ * (compare-by-id), `/documents/compare-files` (compare bytes), `/search`,
+ * `/documents/upload`, `/documents/{file_id}` DELETE. Only `/health` /
+ * `/` accept calls without the header. Callers MUST pass `orgSlug` for
+ * every other route — a missing header yields 400 from RAG.
  *
  * When `orgSlug` is supplied, it sets `X-Tale-Org: ${orgSlug}` and
  * cannot be overridden via a header in `init.headers` — preventing
@@ -256,12 +259,15 @@ export async function ragFetch(
   // the field entirely" (the org-agnostic endpoint path). Earlier the
   // truthy check folded both into the same silent-omit branch.
   if (init.orgSlug !== undefined) {
-    if (!init.orgSlug.trim()) {
+    const trimmedSlug = init.orgSlug.trim();
+    if (!trimmedSlug) {
       throw new Error(
         'ragFetch: orgSlug was provided but is empty; refusing to call RAG without a valid X-Tale-Org header',
       );
     }
-    headers.set('x-tale-org', init.orgSlug);
+    // Send the trimmed value (not init.orgSlug) so accidental whitespace
+    // doesn't ride into RAG's `<orgSlug>/...` filesystem lookup.
+    headers.set('x-tale-org', trimmedSlug);
   }
 
   const timeoutMs = init.timeoutMs ?? 10_000;
diff --git a/services/platform/convex/threads/cascade_helpers.ts b/services/platform/convex/threads/cascade_helpers.ts
index 101c4ae731..672fafbc8e 100644
--- a/services/platform/convex/threads/cascade_helpers.ts
+++ b/services/platform/convex/threads/cascade_helpers.ts
@@ -317,6 +317,25 @@ export async function cascadeDeleteThreadChildren(
         q.eq('organizationId', organizationId).eq('threadId', threadId),
       )
       .take(PAGE_SIZE);
+    // Resolve slug BEFORE the delete loop. Previously the lookup ran
+    // after every storage.delete + db.delete had committed; if it threw
+    // (org row deleted mid-cascade, replica skew), the DB tx rolls back
+    // so fileMetadata rows reappear, but `ctx.storage.delete` is out-
+    // of-band and NOT rolled back — the blob is gone AND no RAG purge
+    // was scheduled. Resolve first so a slug-lookup failure aborts the
+    // loop before any destructive op runs.
+    let orgSlug: string;
+    try {
+      orgSlug = await orgSlugFromId(ctx, organizationId);
+    } catch (error) {
+      console.warn(
+        `[cascadeDeleteThreadChildren] orgSlugFromId failed for ${organizationId}; deferring file cascade:`,
+        error instanceof Error ? error.message : error,
+      );
+      // Signal "not done" so the caller retries. The fileMetadata page
+      // is still present so a re-run will find it.
+      return { done: false, remaining: 1 };
+    }
     const ragPurgeStorageIds: string[] = [];
     for (const fileMeta of filesPage) {
       try {
@@ -332,10 +351,6 @@ export async function cascadeDeleteThreadChildren(
       await ctx.db.delete(fileMeta._id);
     }
     if (ragPurgeStorageIds.length > 0) {
-      // `organizationId` is guaranteed truthy at this point (outer
-      // `if (organizationId)` branch). Resolve to slug so RAG's per-org
-      // delete scope targets the correct tenant's chunks.
-      const orgSlug = await orgSlugFromId(ctx, organizationId);
       await ctx.scheduler.runAfter(
         0,
         internal.workflow_engine.action_defs.rag.helpers.delete_document

From 7722068e1b3add4765ccaeceac98268946ee5908 Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Fri, 29 May 2026 01:57:31 +0800
Subject: [PATCH 24/41] =?UTF-8?q?fix(platform):=20p2=20hardening=20?=
 =?UTF-8?q?=E2=80=94=20file=5Fmetadata,=20branding,=20documents?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- acquireTranscriptionLock rejects rows already in 'completed' status.
  Without this guard, a late-arriving duplicate transcribeAudio schedule
  re-bills Whisper and re-writes the transcript on a row whose previous
  run already succeeded. The single chokepoint is the right place to
  enforce; entry-point pre-checks remain as belt-and-braces.

- extractFileMetadata stamps a terminal marker (visionRequired: false +
  scannedPagesDetected: 0) on permanent failure. Previously, the catch
  logged and returned, leaving visionRequired undefined forever — the
  UI's "still extracting" state had no exit when extraction failed on
  a 4xx, malformed response, or org-resolve failure.

- serve-branding-images parses req.url through new URL(...) so query
  strings (?v=2 cache-busters, etc.) are dropped before filename
  validation. Without this, dev silently 404'd on any image URL with a
  query while prod's c.req.param handler worked fine. Also adds the
  imagesDir + sep defense-in-depth prefix check to match server.ts.

- reindexDocumentInRag now accepts oldOrganizationId and uses it to
  scope the old-RAG delete BEFORE the early-return on missing document.
  Previously, if the document row was deleted/cleared between scheduling
  and execution, the early-return skipped the delete and orphaned the
  oldFileId chunks in RAG forever. updateDocumentInternal passes the
  current document's organizationId at schedule time so the delete-org
  context survives any later doc state changes.
---
 .../convex/documents/internal_actions.ts      | 51 ++++++++++++-------
 .../documents/update_document_internal.ts     | 12 ++++-
 .../convex/file_metadata/internal_actions.ts  | 23 +++++++++
 .../file_metadata/internal_mutations.ts       |  6 +++
 .../vite-plugins/serve-branding-images.ts     | 16 ++++--
 5 files changed, 86 insertions(+), 22 deletions(-)

diff --git a/services/platform/convex/documents/internal_actions.ts b/services/platform/convex/documents/internal_actions.ts
index 8d03ea6895..cfe92b2a16 100644
--- a/services/platform/convex/documents/internal_actions.ts
+++ b/services/platform/convex/documents/internal_actions.ts
@@ -537,38 +537,55 @@ export const reindexDocumentInRag = internalAction({
   args: {
     documentId: v.id('documents'),
     oldFileId: v.id('_storage'),
+    /** Optional for backward compatibility with in-flight scheduled jobs.
+     * New scheduler callers always pass it; when missing we fall back
+     * to the current document's organizationId (which may have changed
+     * or been deleted — best-effort). */
+    oldOrganizationId: v.optional(v.string()),
   },
   returns: v.null(),
   handler: async (ctx, args): Promise<null> => {
-    // Look up current document first so we can scope the delete by org.
+    // Look up current document so we can also schedule the new-upload
+    // step, but DON'T let a missing document skip the old-RAG delete —
+    // that would orphan oldFileId chunks. Resolve the delete org-scope
+    // from `oldOrganizationId` (preferred — captured by the scheduler
+    // caller at update time) and fall back to the current document only
+    // when missing.
     const document = await ctx.runQuery(
       internal.documents.internal_queries.getDocumentByIdRaw,
       { documentId: args.documentId },
     );
 
-    if (!document || !document.fileId) {
-      return null;
-    }
-
-    // Delete old RAG entry (ignore 404 — may not have been indexed)
-    try {
-      const orgSlug = await orgSlugFromId(ctx, document.organizationId);
-      const response = await ragFetch(
-        `/api/v1/documents/${encodeURIComponent(args.oldFileId)}`,
-        { method: 'DELETE', timeoutMs: 60_000, orgSlug },
-      );
-      if (!response.ok && response.status !== 404) {
+    const deleteOrgId =
+      args.oldOrganizationId ?? document?.organizationId ?? null;
+    if (deleteOrgId) {
+      try {
+        const orgSlug = await orgSlugFromId(ctx, deleteOrgId);
+        const response = await ragFetch(
+          `/api/v1/documents/${encodeURIComponent(args.oldFileId)}`,
+          { method: 'DELETE', timeoutMs: 60_000, orgSlug },
+        );
+        if (!response.ok && response.status !== 404) {
+          console.warn(
+            `[reindexDocumentInRag] Failed to delete old RAG entry ${args.oldFileId}: ${response.status}`,
+          );
+        }
+      } catch (error) {
         console.warn(
-          `[reindexDocumentInRag] Failed to delete old RAG entry ${args.oldFileId}: ${response.status}`,
+          `[reindexDocumentInRag] Error deleting old RAG entry ${args.oldFileId}:`,
+          error,
         );
       }
-    } catch (error) {
+    } else {
       console.warn(
-        `[reindexDocumentInRag] Error deleting old RAG entry ${args.oldFileId}:`,
-        error,
+        `[reindexDocumentInRag] No org context for old RAG delete; oldFileId ${args.oldFileId} may leak chunks (documentId=${args.documentId})`,
       );
     }
 
+    if (!document || !document.fileId) {
+      return null;
+    }
+
     // Upload new file to RAG
     try {
       const rawResult = await ragAction.execute(
diff --git a/services/platform/convex/documents/update_document_internal.ts b/services/platform/convex/documents/update_document_internal.ts
index a7d0a1eddc..9e4f2544c4 100644
--- a/services/platform/convex/documents/update_document_internal.ts
+++ b/services/platform/convex/documents/update_document_internal.ts
@@ -96,12 +96,20 @@ export async function updateDocumentInternal(
     await ctx.db.patch(documentId, cleanUpdateData);
   }
 
-  // Schedule RAG re-index after the patch
+  // Schedule RAG re-index after the patch. Pass `organizationId`
+  // explicitly so the action can purge the *old* RAG entry even if
+  // the document row is later deleted/cleared before the scheduled
+  // job fires — otherwise the orphan oldFileId chunks survive
+  // forever (round-3 P2 R4-P2-a).
   if (needsReindex && oldFileId) {
     await ctx.scheduler.runAfter(
       0,
       internal.documents.internal_actions.reindexDocumentInRag,
-      { documentId, oldFileId },
+      {
+        documentId,
+        oldFileId,
+        oldOrganizationId: document.organizationId,
+      },
     );
   }
 }
diff --git a/services/platform/convex/file_metadata/internal_actions.ts b/services/platform/convex/file_metadata/internal_actions.ts
index ac2db60a55..6ad89b2d12 100644
--- a/services/platform/convex/file_metadata/internal_actions.ts
+++ b/services/platform/convex/file_metadata/internal_actions.ts
@@ -213,6 +213,29 @@ export const extractFileMetadata = internalAction({
           console.warn(
             `[extractFileMetadata] Permanent failure for file ${args.storageId}; not retrying: ${message}`,
           );
+          // Stamp a terminal marker so downstream consumers exit the
+          // "still extracting" state. Without this, visionRequired
+          // stayed undefined forever on a permanent failure and the
+          // UI / scannedPagesDetected gating couldn't distinguish
+          // "extraction pending" from "extraction failed". We treat
+          // permanent failure as "no vision needed" — RAG will still
+          // pick up the file via the other ingest path.
+          try {
+            await ctx.runMutation(
+              internal.file_metadata.internal_mutations
+                .updateFileVisionMetadata,
+              {
+                storageId: args.storageId,
+                scannedPagesDetected: 0,
+                visionRequired: false,
+              },
+            );
+          } catch (markerErr) {
+            console.warn(
+              `[extractFileMetadata] Failed to stamp permanent-failure marker for ${args.storageId}:`,
+              markerErr instanceof Error ? markerErr.message : markerErr,
+            );
+          }
         } else if (attempt < EXTRACT_METADATA_RETRY_DELAYS.length) {
           const retryDelay = EXTRACT_METADATA_RETRY_DELAYS[attempt];
           await ctx.scheduler.runAfter(
diff --git a/services/platform/convex/file_metadata/internal_mutations.ts b/services/platform/convex/file_metadata/internal_mutations.ts
index 46b94f1fc9..c885552820 100644
--- a/services/platform/convex/file_metadata/internal_mutations.ts
+++ b/services/platform/convex/file_metadata/internal_mutations.ts
@@ -408,6 +408,12 @@ export const acquireTranscriptionLock = internalMutation({
       row.transcriptionLeaseExpiresAt > now &&
       row.transcriptionStatus === 'running';
     if (leaseHeld) return null;
+    // Defense-in-depth (round-3 P2): a row in `completed` should not be
+    // re-acquired by a late-arriving duplicate `transcribeAudio` schedule
+    // — re-running Whisper would re-bill the org and re-write the
+    // transcript / re-index RAG. The entry points pre-check today, but
+    // the lock is the single chokepoint and is the right place to enforce.
+    if (row.transcriptionStatus === 'completed') return null;
 
     await ctx.db.patch(row._id, {
       transcriptionStatus: 'running',
diff --git a/services/platform/vite-plugins/serve-branding-images.ts b/services/platform/vite-plugins/serve-branding-images.ts
index bbc6c7d9a7..127ff2fe5f 100644
--- a/services/platform/vite-plugins/serve-branding-images.ts
+++ b/services/platform/vite-plugins/serve-branding-images.ts
@@ -1,6 +1,6 @@
 import { existsSync } from 'node:fs';
 import { readFile } from 'node:fs/promises';
-import { join, resolve } from 'node:path';
+import { join, resolve, sep } from 'node:path';
 
 import { type Plugin } from 'vite';
 
@@ -31,14 +31,24 @@ export function serveBrandingImages(): Plugin {
           return;
         }
 
-        const filename = req.url.slice('/branding/images/'.length);
+        // Parse via URL so query strings (e.g. ?v=2 cache-busters)
+        // and fragments are dropped before filename validation. Without
+        // this, /branding/images/logo.png?v=2 became filename
+        // 'logo.png?v=2' which then failed existsSync and 404'd in
+        // dev — silently diverging from the prod handler that uses
+        // c.req.param('filename') (round-3 P2 R3-P2-a).
+        const url = new URL(req.url, 'http://x');
+        const filename = url.pathname.slice('/branding/images/'.length);
         if (!filename || filename.includes('/') || filename.includes('..')) {
           next();
           return;
         }
 
         const filePath = resolve(imagesDir, filename);
-        if (!filePath.startsWith(imagesDir) || !existsSync(filePath)) {
+        // `+ sep` defense-in-depth so a future sibling dir whose name
+        // is a string prefix of imagesDir (e.g. `imagesXYZ/`) can't be
+        // matched by raw startsWith if the filename filter ever loosens.
+        if (!filePath.startsWith(imagesDir + sep) || !existsSync(filePath)) {
           next();
           return;
         }

From 5070ea44c8b3df4439e5904647489f2fa7fad804 Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Fri, 29 May 2026 02:00:29 +0800
Subject: [PATCH 25/41] =?UTF-8?q?fix(platform):=20p2=20hardening=20?=
 =?UTF-8?q?=E2=80=94=20agent=5Ftools=20rag=20and=20web=20error=20paths?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- queryRagContext now hoists the orgSlug check OUTSIDE the outer
  try/catch so an empty/blank slug throws cleanly instead of being
  silently swallowed by the graceful-degrade catch (returning
  undefined as if RAG had no results). The JSDoc said the failure
  was surfaced; runtime now matches.
- search_pages explicitly rejects malformed `args.domain` rather than
  silently dropping the filter and running a global org search — the
  LLM thinks its filter applied and the user gets unrelated hits.
- search_pages wraps fetchSearch in try/catch and degrades to a
  "search temporarily unavailable" reply on crawler failure, matching
  the sibling fetch_and_extract helper's {success:false} contract.
---
 .../agent_tools/rag/query_rag_context.ts      | 23 +++++++--
 .../agent_tools/web/helpers/search_pages.ts   | 48 ++++++++++++++-----
 2 files changed, 56 insertions(+), 15 deletions(-)

diff --git a/services/platform/convex/agent_tools/rag/query_rag_context.ts b/services/platform/convex/agent_tools/rag/query_rag_context.ts
index 23dd44321e..ccd80b9bc5 100644
--- a/services/platform/convex/agent_tools/rag/query_rag_context.ts
+++ b/services/platform/convex/agent_tools/rag/query_rag_context.ts
@@ -158,12 +158,27 @@ export async function queryRagContext(
   similarityThreshold: number = DEFAULT_SIMILARITY_THRESHOLD,
   signal?: AbortSignal,
   recentMessages?: RecentMessage[],
-  // Required: callers must always pass `orgSlug` (and usually fileIds).
-  // Previously this was `options?: RagContextOptions`, which made the
-  // declared-required `orgSlug` field reachable as `undefined` at
-  // runtime — a type-vs-runtime mismatch that this signature fixes.
+  // The type says orgSlug is required, but TS forces this parameter to
+  // be optional because all preceding params have defaults. The runtime
+  // assertion below (outside the outer try/catch) enforces it loudly.
   options: RagContextOptions = { orgSlug: '' },
 ): Promise<RagContextResult | undefined> {
+  // Validate orgSlug up front, OUTSIDE the outer try/catch so the bug
+  // surfaces as a real throw instead of being silently swallowed by the
+  // graceful-degrade catch at the bottom of this function. A missing /
+  // blank slug is a caller misconfiguration, not a runtime RAG outage.
+  // (Round-3 P2 R7-P2-a — previously the empty-slug case hit ragFetch's
+  // throw, fell into the catch, and returned undefined as if the search
+  // had failed.)
+  if (
+    !options ||
+    typeof options.orgSlug !== 'string' ||
+    !options.orgSlug.trim()
+  ) {
+    throw new Error(
+      'queryRagContext: options.orgSlug is required and must be non-empty',
+    );
+  }
   try {
     const ragServiceUrl = getRagConfig().serviceUrl;
 
diff --git a/services/platform/convex/agent_tools/web/helpers/search_pages.ts b/services/platform/convex/agent_tools/web/helpers/search_pages.ts
index 78ba77e891..88ef56b214 100644
--- a/services/platform/convex/agent_tools/web/helpers/search_pages.ts
+++ b/services/platform/convex/agent_tools/web/helpers/search_pages.ts
@@ -97,6 +97,17 @@ export async function searchPages(
 ): Promise<SearchPagesResult> {
   let validDomain: string | undefined;
 
+  // When the agent supplies a `domain` that doesn't pass the shape
+  // check, surface it explicitly rather than silently dropping the
+  // filter and running a global search — the LLM thinks its filter
+  // applied and the user gets unrelated cross-domain hits.
+  if (args.domain && !isValidDomain(args.domain) && ctx.organizationId) {
+    return {
+      text: `The domain "${args.domain}" doesn't look valid. Use a bare domain like "example.com" (no protocol, no path), or omit the filter to search every website in your knowledge base.`,
+      citations: [],
+    };
+  }
+
   if (args.domain && isValidDomain(args.domain) && ctx.organizationId) {
     const website = await ctx.runQuery(
       internal.websites.internal_queries.getWebsiteByDomain,
@@ -130,20 +141,35 @@ export async function searchPages(
     throw new Error('search_pages requires organizationId in ToolCtx.');
   }
   const orgSlug = await orgSlugFromId(ctx, ctx.organizationId);
-  let data = await fetchSearch(crawlerUrl, orgSlug, args.query, validDomain);
-  let results = data.results;
 
-  // Fallback to global search if domain-scoped search returns no results
+  // Wrap crawler calls so a network blip / 5xx becomes a graceful
+  // "search is unavailable" reply to the agent rather than an
+  // unhandled exception — matches fetchAndExtract's contract in the
+  // sibling helper (round-3 P2 R8-P2-a).
+  let data: Awaited<ReturnType<typeof fetchSearch>>;
   let domainFallback = false;
-  if ((!results || results.length === 0) && validDomain) {
-    debugLog('web:search_pages domain fallback', {
-      query: args.query,
-      domain: validDomain,
-    });
-    data = await fetchSearch(crawlerUrl, orgSlug, args.query);
-    results = data.results;
-    domainFallback = true;
+  try {
+    data = await fetchSearch(crawlerUrl, orgSlug, args.query, validDomain);
+    // Fallback to global search if domain-scoped search returns no results
+    if ((!data.results || data.results.length === 0) && validDomain) {
+      debugLog('web:search_pages domain fallback', {
+        query: args.query,
+        domain: validDomain,
+      });
+      data = await fetchSearch(crawlerUrl, orgSlug, args.query);
+      domainFallback = true;
+    }
+  } catch (err) {
+    console.warn(
+      '[web:search_pages] crawler fetchSearch failed',
+      err instanceof Error ? err.message : err,
+    );
+    return {
+      text: 'The web-search service is temporarily unavailable. Try again in a moment, or use fetch mode with a specific URL to access pages directly.',
+      citations: [],
+    };
   }
+  const results = data.results;
 
   if (!results || results.length === 0) {
     debugLog('web:search_pages no results', { query: args.query });

From 3968fd733bd7fcee182092146e6379e414e027ac Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Fri, 29 May 2026 02:03:41 +0800
Subject: [PATCH 26/41] =?UTF-8?q?fix(platform):=20p2=20hardening=20?=
 =?UTF-8?q?=E2=80=94=20websites=20and=20workflow=20document=20action?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- REST POST /websites/:id/sync now invokes syncSingleWebsite scoped to
  the specific id instead of org-wide syncWebsiteStatuses. Previously
  the :id path param only served as an ownership tripwire and callers
  got a surprise org-wide side effect.
- fetchPages debounces the inline syncSingleWebsite schedule to 1 hour
  via metadata.lastStatusSyncAt, matching syncWebsiteStatuses' throttle.
  Without this, every page view / poll fanned out a concurrent crawler
  sync that raced last-write-wins on the row's status field.
- fetchHomepageMetadata now console.warn's the crawler HTTP failure so
  blank title/description doesn't look like a real "no metadata" signal
  to operators triaging.
- applyDocxStructured wraps saveFileMetadata in try/catch and deletes
  the orphan _storage blob on metadata-mutation failure. Convex
  _storage is reference-counted only by application rows; a partial
  failure between upload and saveFileMetadata leaked the blob forever.

R10-P2-a (getAccessibleDocumentIds full-org scan in workflow ops)
is deferred — it needs a new hasDocumentAccess(documentId) internal
query and a memo-hoist across consecutive ops. Not blocking at demo
stage scale; tracked separately.
---
 services/platform/convex/websites/actions.ts  | 32 +++++++++++-----
 .../convex/websites/internal_actions.ts       | 10 ++++-
 services/platform/convex/websites/rest_api.ts | 14 ++++++-
 .../document/helpers/apply_docx_structured.ts | 38 +++++++++++++------
 4 files changed, 70 insertions(+), 24 deletions(-)

diff --git a/services/platform/convex/websites/actions.ts b/services/platform/convex/websites/actions.ts
index 9448dda6c1..f301f69fd3 100644
--- a/services/platform/convex/websites/actions.ts
+++ b/services/platform/convex/websites/actions.ts
@@ -271,16 +271,28 @@ export const fetchPages = action({
   handler: async (ctx, args): Promise<FetchPagesResult> => {
     const { website } = await loadOwnedWebsite(ctx, args.websiteId);
 
-    // Trigger async metadata sync from crawler
-    await ctx.scheduler.runAfter(
-      0,
-      internal.websites.internal_actions.syncSingleWebsite,
-      {
-        websiteId: args.websiteId,
-        domain: website.domain,
-        organizationId: website.organizationId,
-      },
-    );
+    // Debounce the crawler sync: every fetchPages call (page view, poll,
+    // tab open) previously scheduled syncSingleWebsite unconditionally,
+    // fanning out N concurrent crawler hits + creating a last-write-wins
+    // race on the row's status field. Mirror the 1-hour throttle that
+    // syncWebsiteStatuses uses via metadata.lastStatusSyncAt (round-3 P2
+    // R9-P2-b).
+    const SYNC_DEBOUNCE_MS = 60 * 60 * 1000;
+    const lastSyncAt =
+      typeof website.metadata?.lastStatusSyncAt === 'number'
+        ? website.metadata.lastStatusSyncAt
+        : 0;
+    if (Date.now() - lastSyncAt > SYNC_DEBOUNCE_MS) {
+      await ctx.scheduler.runAfter(
+        0,
+        internal.websites.internal_actions.syncSingleWebsite,
+        {
+          websiteId: args.websiteId,
+          domain: website.domain,
+          organizationId: website.organizationId,
+        },
+      );
+    }
 
     return await ctx.runAction(
       internal.websites.internal_actions.fetchWebsitePages,
diff --git a/services/platform/convex/websites/internal_actions.ts b/services/platform/convex/websites/internal_actions.ts
index 7b34b031b8..1c77aec63f 100644
--- a/services/platform/convex/websites/internal_actions.ts
+++ b/services/platform/convex/websites/internal_actions.ts
@@ -174,7 +174,15 @@ async function fetchHomepageMetadata(
     30_000,
   );
 
-  if (!res.ok) return null;
+  if (!res.ok) {
+    // Surface the failure so an operator notices that title/description
+    // stayed blank because the homepage fetch failed, not because the
+    // site genuinely has no metadata (round-3 P2 R9-P2-c).
+    console.warn(
+      `[fetchHomepageMetadata] crawler ${res.status} for ${domain} (orgSlug=${orgSlug})`,
+    );
+    return null;
+  }
 
   const data = await res.json();
   const page = data.pages?.[0];
diff --git a/services/platform/convex/websites/rest_api.ts b/services/platform/convex/websites/rest_api.ts
index 2c3a1a959a..13450ad7eb 100644
--- a/services/platform/convex/websites/rest_api.ts
+++ b/services/platform/convex/websites/rest_api.ts
@@ -270,9 +270,19 @@ export const websitePostActions = withRestAuth(
     }
 
     if (subPath === 'sync') {
+      // The :id path param scopes the sync to a single website. The
+      // earlier implementation called syncWebsiteStatuses (whole-org),
+      // making :id load-bearing only as an ownership tripwire — REST
+      // callers got an org-wide side effect when they thought they were
+      // re-syncing one row. Use the per-website action so the contract
+      // matches the URL (round-3 P2 R9-P2-a).
       await rc.ctx.runAction(
-        internal.websites.internal_actions.syncWebsiteStatuses,
-        { organizationId: rc.org.organizationId },
+        internal.websites.internal_actions.syncSingleWebsite,
+        {
+          websiteId: website._id,
+          domain: website.domain,
+          organizationId: rc.org.organizationId,
+        },
       );
 
       return jsonOk({ status: 'syncing' });
diff --git a/services/platform/convex/workflow_engine/action_defs/document/helpers/apply_docx_structured.ts b/services/platform/convex/workflow_engine/action_defs/document/helpers/apply_docx_structured.ts
index 4142fa69dd..2a07cbc62b 100644
--- a/services/platform/convex/workflow_engine/action_defs/document/helpers/apply_docx_structured.ts
+++ b/services/platform/convex/workflow_engine/action_defs/document/helpers/apply_docx_structured.ts
@@ -171,17 +171,33 @@ export async function applyDocxStructured(
     : `${args.fileName}.docx`;
 
   // Save file metadata so the file shows up in the org's library.
-  await ctx.runMutation(
-    internal.file_metadata.internal_mutations.saveFileMetadata,
-    {
-      organizationId: args.organizationId,
-      storageId,
-      fileName: finalFileName,
-      contentType: DOCX_CONTENT_TYPE,
-      size: docxBytes.length,
-      source: 'agent',
-    },
-  );
+  // Cleanup the just-uploaded _storage blob if the metadata write
+  // fails — without this, a transient mutation failure leaves an
+  // orphan blob in the global _storage namespace with no fileMetadata
+  // pointer (round-3 P2 R10-P2-c).
+  try {
+    await ctx.runMutation(
+      internal.file_metadata.internal_mutations.saveFileMetadata,
+      {
+        organizationId: args.organizationId,
+        storageId,
+        fileName: finalFileName,
+        contentType: DOCX_CONTENT_TYPE,
+        size: docxBytes.length,
+        source: 'agent',
+      },
+    );
+  } catch (err) {
+    try {
+      await ctx.storage.delete(storageId);
+    } catch (deleteErr) {
+      console.warn(
+        `[applyDocxStructured] orphan-blob cleanup failed for ${storageId}:`,
+        deleteErr instanceof Error ? deleteErr.message : deleteErr,
+      );
+    }
+    throw err;
+  }
 
   const downloadUrl = buildDownloadUrl(storageId, finalFileName);
 

From 525132042ef990f134a6a3079ef4ad52d8198a6b Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Fri, 29 May 2026 02:07:13 +0800
Subject: [PATCH 27/41] =?UTF-8?q?fix(platform):=20p2=20hardening=20?=
 =?UTF-8?q?=E2=80=94=20governance,=20scaffold,=20integration=20consistency?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- retention_cleanup.ts cleanupLoginAttemptsGlobal now carries an
  explicit comment documenting the legal-hold trade-off: the 30-day
  fixed TTL sweep intentionally does NOT cross-check active holds
  because pulling email→userId→cross-org-holds resolution back into a
  global sweep would re-introduce the per-org coupling the Phase 11
  reframe deliberately removed. Forensics relevant to a hold live in
  the per-org auditLogs stream, which IS hold-gated.

- scaffold.ts seedRetention: non-ENOENT stat errors (EACCES on a
  chmod-locked file, EPERM on an immutable-bit attribute, ELOOP on a
  symlink cycle) previously fell through and silently overwrote the
  locked file. Treat unknown stat failures as "target exists" so the
  override:false branch refuses, and surface the message in the result
  so a deploy reports the failure instead of producing a silent clobber.

- loadIntegration: orgSlug and organizationId were trusted independently
  even though they drive different reads (orgSlug → filesystem config;
  organizationId → DB credentials). A mismatched pair would silently
  splice org A's config template with org B's encrypted secrets. Resolve
  canonically via orgSlugFromId and refuse on mismatch — backward-
  compatible with every current caller while closing the consistency
  invariant.

R12-P2-c (open-run watchdog) is already handled by the existing
STALE_RUN_AGE_MS / STALE_HEARTBEAT_MS reclaim path in claimRetentionRun
— no code change needed.
---
 .../convex/governance/retention_cleanup.ts    | 11 ++++++++++
 .../convex/integrations/load_integration.ts   | 13 ++++++++++++
 .../platform/convex/organizations/scaffold.ts | 20 +++++++++++++++----
 3 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/services/platform/convex/governance/retention_cleanup.ts b/services/platform/convex/governance/retention_cleanup.ts
index cc17caaa3a..c094663af8 100644
--- a/services/platform/convex/governance/retention_cleanup.ts
+++ b/services/platform/convex/governance/retention_cleanup.ts
@@ -1332,6 +1332,17 @@ async function cleanupNotifications(
  *
  * `cleanupLoginAttemptsGlobal` runs unconditionally now (no per-org
  * opt-in), once per dispatcher invocation, with the fixed TTL.
+ *
+ * **Legal-hold interaction (round-3 P2 R12-P2-b)**: The global sweep
+ * intentionally does NOT cross-check `loadActiveHolds`. The tables are
+ * email-keyed and global; resolving each row's email → userId → all
+ * orgs → active holds on every sweep would re-introduce the per-org
+ * coupling the reframe deliberately removed. The trade-off (a custodian
+ * hold in org X does not protect that user's `loginAttempts` rows from
+ * the 30-day TTL) is accepted: forensics relevant to a hold live in
+ * `auditLogs`, which IS hold-gated and honors per-org retention. Document
+ * this in the data-handling runbook rather than pull operational state
+ * back into the legal-hold scope.
  */
 const LOGIN_ATTEMPTS_FIXED_TTL_DAYS = 30;
 
diff --git a/services/platform/convex/integrations/load_integration.ts b/services/platform/convex/integrations/load_integration.ts
index 675510291f..e59d8616ab 100644
--- a/services/platform/convex/integrations/load_integration.ts
+++ b/services/platform/convex/integrations/load_integration.ts
@@ -16,6 +16,7 @@ import { v } from 'convex/values';
 import { internal } from '../_generated/api';
 import type { Doc } from '../_generated/dataModel';
 import { internalAction } from '../_generated/server';
+import { orgSlugFromId } from '../lib/helpers/org_slug';
 
 export interface LoadedIntegration {
   _id: Doc<'integrationCredentials'>['_id'];
@@ -127,6 +128,18 @@ export const loadIntegration = internalAction({
   },
   returns: v.any(),
   handler: async (ctx, args): Promise<LoadedIntegration | null> => {
+    // The two args drive different reads: orgSlug → filesystem config,
+    // organizationId → DB credentials. A future caller that mismatches
+    // them (or a refactor that drifts the resolution) would silently
+    // splice one org's config template with another org's encrypted
+    // secrets. Resolve canonically from organizationId and refuse on
+    // mismatch (round-3 P2 R15-P2-a).
+    const canonicalSlug = await orgSlugFromId(ctx, args.organizationId);
+    if (canonicalSlug !== args.orgSlug) {
+      throw new Error(
+        `[loadIntegration] orgSlug/${args.orgSlug} does not match canonical slug ${canonicalSlug} for organizationId ${args.organizationId}`,
+      );
+    }
     const [fileResult, credentials] = await Promise.all([
       ctx.runAction(
         internal.integrations.file_actions.readIntegrationForExecution,
diff --git a/services/platform/convex/organizations/scaffold.ts b/services/platform/convex/organizations/scaffold.ts
index 607a466b8c..ac197cd818 100644
--- a/services/platform/convex/organizations/scaffold.ts
+++ b/services/platform/convex/organizations/scaffold.ts
@@ -429,11 +429,23 @@ async function seedRetention(
       targetStatErr = err;
       return false;
     });
+  // Round-3 P2 R14-P2-a: non-ENOENT stat errors (EACCES on a chmod-locked
+  // file, EPERM on an immutable-bit attribute, ELOOP on a symlink cycle)
+  // previously fell through and silently overwrote whatever the operator
+  // had locked. Treat unknown stat failures as "target exists" so the
+  // override:false branch refuses, and bubble the error code through the
+  // result so a deploy can surface it instead of producing a silent
+  // clobber.
   if (!targetExists && errnoCode(targetStatErr) !== 'ENOENT' && targetStatErr) {
-    console.warn(
-      `[scaffold] retention: stat ${targetFile} failed:`,
-      targetStatErr,
-    );
+    const message = `[scaffold] retention: stat ${targetFile} failed: ${
+      targetStatErr instanceof Error
+        ? targetStatErr.message
+        : String(targetStatErr)
+    }`;
+    console.warn(message);
+    if (!override) {
+      return { domain: 'retention', ok: false, error: message };
+    }
   }
   if (targetExists && !override) {
     console.warn(

From a9416045fc79986e780141c7a19bdd1e8e7c2882 Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Fri, 29 May 2026 02:09:35 +0800
Subject: [PATCH 28/41] =?UTF-8?q?fix(platform):=20p2=20hardening=20?=
 =?UTF-8?q?=E2=80=94=20server=20URL=20parsing,=20config-watcher,=20documen?=
 =?UTF-8?q?ts-table?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- convexHttpActionsBaseUrl now parses CONVEX_URL via new URL() and sets
  parsed.port = '3211' explicitly. The previous `:\\d+$` regex only
  matched URLs ending in a literal port; operators with a bare
  hostname (`https://convex.example.com`) or a path suffix
  (`http://convex:3210/sub`) silently got the wrong port and every
  SSE auth lookup 401'd.

- config-watcher's single-file-per-org branch now consults a
  SINGLE_FILE_ORG_CONFIGS set rather than a hardcoded `stem === 'retention'`
  comparison. Adding a future per-org config file (`quota.json`, etc.) is
  now a one-line change instead of silently no-op'ing because the watcher
  doesn't recognize the stem.

- documents-table.tsx eager-pagination predicate (`hasActiveQuery`) now
  includes the context-level `selectedTeamId`. The page-level team filter
  feeds into filterDocumentResults too, so without this the user could
  pick a team in the page filter (no other filters/search), see only the
  first page of results matching that team, and have no way to scroll the
  rest into view.
---
 .../documents/components/documents-table.tsx  |  7 ++++-
 services/platform/lib/config-watcher.ts       | 29 ++++++++++++++-----
 services/platform/server.ts                   | 17 ++++++++++-
 3 files changed, 43 insertions(+), 10 deletions(-)

diff --git a/services/platform/app/features/documents/components/documents-table.tsx b/services/platform/app/features/documents/components/documents-table.tsx
index 0f77bb337f..e8c0ce412d 100644
--- a/services/platform/app/features/documents/components/documents-table.tsx
+++ b/services/platform/app/features/documents/components/documents-table.tsx
@@ -75,11 +75,16 @@ export function DocumentsTable({
   // scroll while a query is active, so further pages never load and any match
   // beyond the first page reads as "no results". Eagerly pull every page while
   // a search/filter is active so the client-side filter sees the full set.
+  // Includes `selectedTeamId` from the page-level team filter context —
+  // filterDocumentResults reads it too, so omitting it from this predicate
+  // means a context-only filter (no search, no local filters) still showed
+  // only the first page (round-3 P2 R19-P2-a).
   const hasActiveQuery =
     debouncedQuery.trim().length > 0 ||
     selectedRagStatuses.length > 0 ||
     selectedSources.length > 0 ||
-    selectedTeamIds.length > 0;
+    selectedTeamIds.length > 0 ||
+    selectedTeamId != null;
 
   const { status: pageStatus, loadMore: loadMorePage } = paginatedResult;
   useEffect(() => {
diff --git a/services/platform/lib/config-watcher.ts b/services/platform/lib/config-watcher.ts
index 1b30c8e418..e92a447aa0 100644
--- a/services/platform/lib/config-watcher.ts
+++ b/services/platform/lib/config-watcher.ts
@@ -27,6 +27,15 @@ const ATOMIC_WRITE_TMP_RE = /\.\d+\.[a-f0-9]{8}\.tmp$/;
  */
 const EMIT_DEBOUNCE_MS = 100;
 
+/**
+ * Stems allowed at `<org>/<stem>.json` (single-file-per-org configs).
+ * Must stay in lockstep with the read-side resolvers — adding a new
+ * entry here without a matching reader means the watcher emits events
+ * nothing consumes, and adding a reader without an entry here means
+ * operator edits silently never invalidate caches.
+ */
+const SINGLE_FILE_ORG_CONFIGS: ReadonlySet<string> = new Set(['retention']);
+
 /**
  * Parse a relative path within the config directory into a structured event,
  * under the uniform org-first layout `${TALE_CONFIG_DIR}/<orgSlug>/<domain>/...`.
@@ -61,16 +70,20 @@ function parseConfigChange(relativePath: string): ConfigChangeEvent | null {
   const orgSlug = parts[0];
   if (!ORG_SLUG_REGEX.test(orgSlug)) return null;
 
-  // Single-file-per-org configs sit at `<org>/<stem>.json` (currently
-  // just `retention.json`; future-proof for `quota.json` etc.). Without
-  // this branch they fell through to null, so operator edits to
-  // `<org>/retention.json` never invalidated the governance UI cache
-  // (round-2 P1-15). Emit at slug=stem granularity so consumers can
-  // key their cache invalidation on it.
+  // Single-file-per-org configs sit at `<org>/<stem>.json`. The allowed
+  // stems are listed in SINGLE_FILE_ORG_CONFIGS so adding a new sibling
+  // (e.g. `quota.json`) is a one-line change here AND in the read-side
+  // resolver — they must stay in lockstep. Previously hardcoded to
+  // `retention` only; any future stem silently no-op'd (round-3 P2
+  // R18-P2-d).
   if (parts.length === 2 && parts[1].endsWith('.json')) {
     const stem = parts[1].slice(0, -'.json'.length);
-    if (stem === 'retention') {
-      return { type: 'retention', orgSlug, slug: stem };
+    if (SINGLE_FILE_ORG_CONFIGS.has(stem)) {
+      return {
+        type: stem as ConfigChangeEvent['type'],
+        orgSlug,
+        slug: stem,
+      };
     }
     return null;
   }
diff --git a/services/platform/server.ts b/services/platform/server.ts
index f56f468e8e..0ccd438801 100644
--- a/services/platform/server.ts
+++ b/services/platform/server.ts
@@ -93,7 +93,22 @@ function convexHttpActionsBaseUrl(): string {
     return process.env.CONVEX_SITE_PROXY_URL.replace(/\/$/, '');
   }
   const wsUrl = process.env.CONVEX_URL ?? 'http://convex:3210';
-  return wsUrl.replace(/:\d+$/, ':3211').replace(/\/$/, '');
+  // Parse via URL() so the rewrite works for bare hostnames
+  // (`https://convex.example.com` → no explicit port) and URLs with
+  // path suffixes (`http://convex:3210/sub`) — the previous regex
+  // `:\d+$` only matched the literal trailing-port shape and would
+  // silently leave the wrong port in place for any operator who set
+  // CONVEX_URL to anything else. Falls back to the original string if
+  // parsing fails (defensive — should be unreachable).
+  try {
+    const parsed = new URL(wsUrl);
+    parsed.port = '3211';
+    // URL toString preserves protocol, host, path; strip any trailing
+    // slash for symmetry with CONVEX_SITE_PROXY_URL handling above.
+    return parsed.toString().replace(/\/$/, '');
+  } catch {
+    return wsUrl.replace(/:\d+$/, ':3211').replace(/\/$/, '');
+  }
 }
 
 async function resolveAllowedOrgSlugs(

From 2307493d8e38c48577b3b1260f30ddf7e0018b2b Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Fri, 29 May 2026 02:12:31 +0800
Subject: [PATCH 29/41] =?UTF-8?q?fix(rag):=20p2=20hardening=20=E2=80=94=20?=
 =?UTF-8?q?dead=20code,=20fullmatch,=20dim-pin=20cache,=20shutdown?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Delete services/rag/app/utils/sanitize.py — sanitize_org_slug had
  zero call sites in the RAG service, no tests, and its charset (which
  accepted uppercase) diverged from the canonical validate_org_slug at
  packages/tale_shared/.../org_slug.py. Dead module.

- auth.py require_org_slug now uses ORG_SLUG_RE.fullmatch instead of
  .match for parity with the canonical validator. The regex carries
  explicit ^...$ anchors so this only matters for the trailing-\\n
  edge case today, but the canonical contract is fullmatch — keep it
  in lockstep.

- database.pin_embedding_dimensions now also pins the
  semantic_cache.query_embedding column when the table exists.
  Previously declared as plain vector (any-dim) and never aligned —
  on a dim change the next lookup's `<=>` operator raised
  "different vector dimensions", was silently swallowed by the
  generic exception handler, and all subsequent cache reads returned
  None until manual purge. TRUNCATE on mismatch because `<=>` can't
  be coerced across dims.

- rag_service _safe_close uses an interruptible asyncio.Event-based
  sleep instead of plain asyncio.sleep(30). shutdown() now sets the
  event before draining the background-task pool, so the wrapped
  close coroutine runs immediately instead of being cancelled
  mid-sleep when the 10s drain timeout fires. Previously each
  refresh-evicted client's httpx pool leaked through process exit.
---
 services/rag/app/auth.py                 |  6 +++-
 services/rag/app/services/database.py    | 31 ++++++++++++++++++
 services/rag/app/services/rag_service.py | 40 ++++++++++++++++++++++--
 services/rag/app/utils/sanitize.py       | 27 ----------------
 4 files changed, 74 insertions(+), 30 deletions(-)
 delete mode 100644 services/rag/app/utils/sanitize.py

diff --git a/services/rag/app/auth.py b/services/rag/app/auth.py
index d3a23f8af0..8caf5614c0 100644
--- a/services/rag/app/auth.py
+++ b/services/rag/app/auth.py
@@ -69,7 +69,11 @@ async def require_org_slug(
             status_code=status.HTTP_400_BAD_REQUEST,
             detail="missing X-Tale-Org header",
         )
-    if not ORG_SLUG_RE.match(x_tale_org):
+    # `fullmatch` rather than `match` so a trailing `\n` (which `$` would
+    # accept) is rejected. Canonical validator at
+    # `packages/tale_shared/src/tale_shared/config/org_slug.py:validate_org_slug`
+    # uses fullmatch — keep this in lockstep. Round-3 P2 R21-P2-c.
+    if not ORG_SLUG_RE.fullmatch(x_tale_org):
         raise HTTPException(
             status_code=status.HTTP_400_BAD_REQUEST,
             detail="invalid X-Tale-Org header",
diff --git a/services/rag/app/services/database.py b/services/rag/app/services/database.py
index eaf612a6f7..22e7b422fe 100644
--- a/services/rag/app/services/database.py
+++ b/services/rag/app/services/database.py
@@ -111,3 +111,34 @@ async def pin_embedding_dimensions(pool: asyncpg.Pool, dimensions: int) -> None:
                 "Vector search will use sequential scan. Consider reducing dimensions.",
                 dimensions,
             )
+
+        # Round-3 P2 R20-P2-a: pin the semantic_cache query_embedding
+        # column to the same dimensions. Previously declared as plain
+        # `vector` (any-dim) and never aligned; a dim change between
+        # deploys left stale rows whose pgvector `<=>` operator threw
+        # "different vector dimensions" on every subsequent lookup,
+        # silently swallowed by the SELECT's generic exception handler.
+        # TRUNCATE on mismatch because `<=>` can't be coerced across
+        # dims — we'd otherwise still error on existing rows.
+        try:
+            cache_col_type = await conn.fetchval(
+                """
+                SELECT format_type(atttypid, atttypmod)
+                FROM pg_attribute
+                WHERE attrelid = $1::regclass AND attname = 'query_embedding'
+                """,
+                f"{SCHEMA}.semantic_cache",
+            )
+        except asyncpg.exceptions.UndefinedTableError:
+            # semantic_cache is created lazily on first use; nothing to pin yet.
+            cache_col_type = None
+        if cache_col_type is not None and cache_col_type != expected_type:
+            logger.info(
+                "Pinning {}.semantic_cache.query_embedding to vector({}); truncating stale rows",
+                SCHEMA,
+                dimensions,
+            )
+            await conn.execute(f"TRUNCATE TABLE {SCHEMA}.semantic_cache")
+            await conn.execute(
+                f"ALTER TABLE {SCHEMA}.semantic_cache ALTER COLUMN query_embedding TYPE vector({dimensions})",
+            )
diff --git a/services/rag/app/services/rag_service.py b/services/rag/app/services/rag_service.py
index 741f116e38..63faf2be76 100644
--- a/services/rag/app/services/rag_service.py
+++ b/services/rag/app/services/rag_service.py
@@ -77,10 +77,41 @@
 
 _background_tasks: set[asyncio.Task[None]] = set()
 
+# When set, every pending `_safe_close` skips its remaining grace
+# window and proceeds to the close call immediately. Shutdown sets
+# this before draining so the underlying httpx pools actually close
+# even when the drain timeout fires — previously the 10s drain
+# cancelled the 30s `asyncio.sleep` mid-flight and the close
+# coroutine never ran, leaking sockets through process exit
+# (round-3 P2 R20-P2-d).
+_shutdown_event: asyncio.Event | None = None
+
+
+def _get_shutdown_event() -> asyncio.Event:
+    """Lazy-construct the per-event-loop shutdown event.
+
+    Created on first use rather than at import time so we don't grab a
+    handle to the wrong event loop in test environments that spin up
+    fresh loops per case.
+    """
+    global _shutdown_event
+    if _shutdown_event is None:
+        _shutdown_event = asyncio.Event()
+    return _shutdown_event
+
 
 async def _safe_close(coro) -> None:
-    """Close an old client after a grace period for in-flight requests."""
-    await asyncio.sleep(30)
+    """Close an old client after a grace period for in-flight requests.
+
+    The grace is interruptible: when `_shutdown_event` fires, the sleep
+    aborts early and the close runs immediately. Without this, a
+    bounded shutdown drain would cancel the `asyncio.sleep(30)` and the
+    wrapped close coroutine would never be awaited.
+    """
+    try:
+        await asyncio.wait_for(_get_shutdown_event().wait(), timeout=30)
+    except TimeoutError:
+        pass
     try:
         await coro
     except Exception:
@@ -873,6 +904,11 @@ async def shutdown(self) -> None:
         4. Close the DB pool.
         """
         self._shutting_down = True
+        # Wake every pending `_safe_close` so the underlying client
+        # close runs without waiting out its 30s grace — pairs with the
+        # interruptible sleep in `_safe_close` to ensure httpx pools are
+        # actually torn down before the drain timeout fires.
+        _get_shutdown_event().set()
 
         # Best-effort close of each org's clients before tearing down the pool.
         for org_slug, clients in list(self._org_clients.items()):
diff --git a/services/rag/app/utils/sanitize.py b/services/rag/app/utils/sanitize.py
deleted file mode 100644
index e62d397a88..0000000000
--- a/services/rag/app/utils/sanitize.py
+++ /dev/null
@@ -1,27 +0,0 @@
-"""Org slug sanitization for multi-tenant document storage."""
-
-import re
-
-
-def sanitize_org_slug(org_slug: str) -> str:
-    """Sanitize an org_slug by replacing invalid characters.
-
-    - Spaces and dots replaced with underscores
-    - Non-alphanumeric/underscore/hyphen characters removed
-    - Collapses multiple underscores, strips leading/trailing underscores
-
-    Raises:
-        ValueError: If org_slug sanitizes to empty string.
-    """
-    if not org_slug:
-        raise ValueError("org_slug must not be empty")
-
-    result = org_slug.replace(" ", "_").replace(".", "_")
-    result = re.sub(r"[^a-zA-Z0-9_-]", "", result)
-    result = re.sub(r"_+", "_", result)
-    result = result.strip("_")
-
-    if not result:
-        raise ValueError(f"org_slug '{org_slug}' sanitized to empty string")
-
-    return result

From 25a68dc7918cae43b567669d4d8c87e68ac61475 Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Fri, 29 May 2026 02:15:52 +0800
Subject: [PATCH 30/41] =?UTF-8?q?fix(crawler):=20p2=20hardening=20?=
 =?UTF-8?q?=E2=80=94=20indexing,=20vision,=20lifespan?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- delete_page_chunks: `domain` parameter is now required. The previous
  `domain=None` branch issued `DELETE FROM chunks WHERE url=$1` which
  spans every domain that ever ingested the path, silently
  over-deleting another org's chunks on shared paths like `/about`.
  No production caller relied on the omit-domain behavior; only legacy
  tests did.

- vision/openai_client.process_pages_with_llm: per-chunk LLM failure
  now logs at error level (was warning) AND prepends an explicit
  `[LLM_EXTRACTION_FAILED: <type>]` marker to the returned chunk.
  Downstream storage / indexing can now distinguish "LLM extracted
  this" from "LLM died, this is the raw input pretending to be
  extraction".

- vision/openai_client._safe_close_client and
  embedding_service._close_old grace window extended from 30s to
  300s. Vision requests can run up to 180s (vision_request_timeout)
  and chat completions can run for ~300s; the previous 30s window
  tore down the httpx pool while a long PDF OCR was still in flight.

- vision/openai_client.process_pages_with_llm cache_key now includes
  the resolved client.base_url so an in-org provider rotation (same
  model id, different upstream) doesn't serve cached outputs from
  the previous provider.

- main.py lifespan teardown drains per-org client caches (_org_states
  in embedding_service; _chat_states and _vision_states in
  vision/openai_client) under a 10s bound. Previously each held an
  AsyncOpenAI httpx pool that was reclaimed only at process exit,
  producing noisy "Event loop is closed" tracebacks under
  uvicorn --reload / docker rolling restart.

- migrations/20260528000000_add_website_org_memberships.sql now
  documents the implicit `default` org assumption so an operator
  with a non-default-only layout has a clear signal when bounded_scan
  errors on the missing provider catalog.
---
 services/crawler/app/main.py                  | 42 +++++++++++++++++++
 .../crawler/app/services/embedding_service.py | 11 ++++-
 .../crawler/app/services/indexing_service.py  | 25 ++++++-----
 .../app/services/vision/openai_client.py      | 34 ++++++++++++---
 ...0528000000_add_website_org_memberships.sql |  8 ++++
 .../crawler/tests/test_indexing_service.py    |  6 +--
 6 files changed, 103 insertions(+), 23 deletions(-)

diff --git a/services/crawler/app/main.py b/services/crawler/app/main.py
index 2daf780866..4c86154395 100644
--- a/services/crawler/app/main.py
+++ b/services/crawler/app/main.py
@@ -178,6 +178,48 @@ async def lifespan(app: FastAPI) -> AsyncGenerator:
     except Exception:
         logger.exception("Failed to cleanup crawler service")
 
+    # Drain per-org client caches so the httpx pools they hold close
+    # cleanly. Without this, graceful shutdown leaks FDs and produces
+    # noisy "Event loop is closed" tracebacks under uvicorn --reload
+    # / docker rolling restart. Each close is bounded by 10s so a
+    # hung peer can't pin shutdown. Round-3 P2 R26-P2-c.
+    async def _drain_org_caches() -> None:
+        from app.services.embedding_service import _org_states as _emb_states  # type: ignore[attr-defined]
+        from app.services.vision.openai_client import (  # type: ignore[attr-defined]
+            _chat_states,
+            _vision_states,
+        )
+
+        async def _safe(close_aw):
+            try:
+                await close_aw
+            except Exception:
+                logger.opt(exception=True).warning("Failed to close per-org client during shutdown")
+
+        closes = []
+        for state in _emb_states.values():
+            closes.append(_safe(state.service.close()))
+        for state in _vision_states.values():
+            closes.append(_safe(state.client.close()))
+        for state in _chat_states.values():
+            closes.append(_safe(state.client.close()))
+        _emb_states.clear()
+        _vision_states.clear()
+        _chat_states.clear()
+        if closes:
+            try:
+                await asyncio.wait_for(
+                    asyncio.gather(*closes, return_exceptions=True),
+                    timeout=10,
+                )
+            except TimeoutError:
+                logger.warning("Per-org client drain did not finish within 10s; continuing")
+
+    try:
+        await _drain_org_caches()
+    except Exception:
+        logger.exception("Failed to drain per-org client caches")
+
     shutdown_telemetry()
 
 
diff --git a/services/crawler/app/services/embedding_service.py b/services/crawler/app/services/embedding_service.py
index bb44e3ebd2..f676c7e96a 100644
--- a/services/crawler/app/services/embedding_service.py
+++ b/services/crawler/app/services/embedding_service.py
@@ -63,8 +63,15 @@ def _evict_lru_if_needed() -> None:
 
 
 async def _close_old(service: EmbeddingService) -> None:
-    """Close an old client after a grace period for in-flight requests."""
-    await asyncio.sleep(30)
+    """Close an old client after a grace period for in-flight requests.
+
+    Matches the 300s window used for the chat/vision clients
+    (`vision/openai_client.py:_safe_close_client`). The previous 30s
+    grace was shorter than a long batch embed could legitimately
+    take; tearing down the httpx pool mid-flight produced opaque
+    "Event loop is closed" errors. Round-3 P2 R26-P2-b.
+    """
+    await asyncio.sleep(300)
     try:
         await service.close()
     except Exception:
diff --git a/services/crawler/app/services/indexing_service.py b/services/crawler/app/services/indexing_service.py
index 131dc36cf3..6c9981143e 100644
--- a/services/crawler/app/services/indexing_service.py
+++ b/services/crawler/app/services/indexing_service.py
@@ -254,19 +254,18 @@ async def _index_one(row: asyncpg.Record) -> dict:
             "total_chunks": total_chunks,
         }
 
-    async def delete_page_chunks(self, url: str, domain: str | None = None) -> int:
-        # `domain` is optional for backwards compatibility — existing
-        # callers that don't pass it get the wider (URL-only) delete.
-        # New callers should pass it so two domains sharing a path
-        # don't over-delete each other's chunks.
+    async def delete_page_chunks(self, url: str, domain: str) -> int:
+        # `domain` is now REQUIRED (round-3 P2 R25-P1). The previous
+        # `domain=None` branch issued `DELETE FROM chunks WHERE url=$1`
+        # which spans every domain that ever ingested the path —
+        # silently over-deleting another org's chunks on shared paths
+        # like `/about` or `/index`. No production caller relied on
+        # the omit-domain behavior; only legacy tests did.
         async with acquire_with_retry(self._pool) as conn:
-            if domain is None:
-                result = await conn.execute("DELETE FROM chunks WHERE url = $1", url)
-            else:
-                result = await conn.execute(
-                    "DELETE FROM chunks WHERE domain = $1 AND url = $2",
-                    domain,
-                    url,
-                )
+            result = await conn.execute(
+                "DELETE FROM chunks WHERE domain = $1 AND url = $2",
+                domain,
+                url,
+            )
             count = int(result.split()[-1]) if result else 0
             return count
diff --git a/services/crawler/app/services/vision/openai_client.py b/services/crawler/app/services/vision/openai_client.py
index 41f42a176b..1a7ac2f075 100644
--- a/services/crawler/app/services/vision/openai_client.py
+++ b/services/crawler/app/services/vision/openai_client.py
@@ -110,8 +110,15 @@ def __init__(
 
 
 async def _safe_close_client(client: AsyncOpenAI) -> None:
-    """Close an old client after a grace period for in-flight requests."""
-    await asyncio.sleep(30)
+    """Close an old client after a grace period for in-flight requests.
+
+    Grace window must cover the longest in-flight request the client
+    could be servicing. Vision requests can run up to
+    `vision_request_timeout=180s` and chat completions can run for up
+    to ~300s; 30s was too short and would tear down the httpx pool
+    while a long PDF OCR was still in flight (round-3 P2 R26-P2-b).
+    """
+    await asyncio.sleep(300)
     try:
         await client.close()
     except Exception:
@@ -498,8 +505,15 @@ async def process_pages_with_llm(
 
     logger.info(f"Split into {total_chunks} chunks for LLM processing")
 
+    # Resolve base_url for the cache key so a within-org provider
+    # rotation (same model id, different upstream) doesn't serve stale
+    # cached outputs from the previous provider. Round-3 P2 R26-P2-d.
+    cached_chat_base_url = str(getattr(client, "base_url", "") or "")
+
     async def process_chunk(chunk_idx: int, chunk_text: str) -> tuple[int, str]:
-        cache_key = compute_text_hash(chunk_text + "\n---\n" + user_input + "\n---\n" + resolved_model)
+        cache_key = compute_text_hash(
+            chunk_text + "\n---\n" + user_input + "\n---\n" + resolved_model + "\n---\n" + cached_chat_base_url,
+        )
         cached = llm_cache.get_llm(cache_key)
         if cached is not None:
             logger.info(f"LLM chunk {chunk_idx + 1}/{total_chunks} cache hit ({len(chunk_text)} chars)")
@@ -534,8 +548,18 @@ async def process_chunk(chunk_idx: int, chunk_text: str) -> tuple[int, str]:
                 logger.info(f"LLM chunk {chunk_idx + 1}/{total_chunks} done: {len(chunk_text)} -> {len(result)} chars")
                 return chunk_idx, result
             except Exception as e:
-                logger.warning(f"Failed to process chunk {chunk_idx + 1} with LLM: {e}")
-                return chunk_idx, chunk_text
+                # Round-3 P2 R26-P2-a: log at error level (was warning)
+                # and prepend an explicit failure marker so downstream
+                # storage / indexing can spot extractions that fell
+                # back to raw content. Previously the caller couldn't
+                # distinguish "LLM extracted this" from "LLM died,
+                # this is the raw input pretending to be extraction".
+                logger.error(
+                    f"Failed to process chunk {chunk_idx + 1} with LLM ({type(e).__name__}: {e}); "
+                    f"returning raw content with failure marker",
+                )
+                marker = f"[LLM_EXTRACTION_FAILED: {type(e).__name__}]\n"
+                return chunk_idx, marker + chunk_text
 
     tasks = [process_chunk(idx, text) for idx, text in chunks]
     results = await asyncio.gather(*tasks)
diff --git a/services/crawler/migrations/20260528000000_add_website_org_memberships.sql b/services/crawler/migrations/20260528000000_add_website_org_memberships.sql
index b2047a6cb4..2c23230178 100644
--- a/services/crawler/migrations/20260528000000_add_website_org_memberships.sql
+++ b/services/crawler/migrations/20260528000000_add_website_org_memberships.sql
@@ -27,6 +27,14 @@ CREATE TABLE IF NOT EXISTS public_web.website_org_memberships (
 CREATE INDEX IF NOT EXISTS idx_website_org_memberships_by_org
     ON public_web.website_org_memberships (org_slug);
 
+-- Backfill assumes a `default` org exists. This holds in every shipped
+-- platform deployment — the entrypoint always seeds a `default` org as
+-- the bootstrap target — but operators with an unusual layout (e.g.
+-- seeded only non-default orgs) will see `bounded_scan` crash on
+-- `settings.get_embedding_config('default')` for the migrated rows.
+-- The scheduler's first scan attempt is the actionable signal: it
+-- logs a clear error and the operator manually rewrites org_slug
+-- on the migrated rows.
 INSERT INTO public_web.website_org_memberships (domain, org_slug)
 SELECT domain, 'default'
 FROM public_web.websites
diff --git a/services/crawler/tests/test_indexing_service.py b/services/crawler/tests/test_indexing_service.py
index 9d3d026bcf..1616d91b9f 100644
--- a/services/crawler/tests/test_indexing_service.py
+++ b/services/crawler/tests/test_indexing_service.py
@@ -276,21 +276,21 @@ class TestDeletePageChunks:
     async def test_returns_deleted_count(self, indexing_service, mock_conn):
         mock_conn.execute = AsyncMock(return_value="DELETE 5")
 
-        count = await indexing_service.delete_page_chunks("https://example.com/page")
+        count = await indexing_service.delete_page_chunks("https://example.com/page", "example.com")
 
         assert count == 5
 
     async def test_returns_zero_when_no_rows_deleted(self, indexing_service, mock_conn):
         mock_conn.execute = AsyncMock(return_value="DELETE 0")
 
-        count = await indexing_service.delete_page_chunks("https://example.com/page")
+        count = await indexing_service.delete_page_chunks("https://example.com/page", "example.com")
 
         assert count == 0
 
     async def test_returns_zero_when_result_is_empty(self, indexing_service, mock_conn):
         mock_conn.execute = AsyncMock(return_value="")
 
-        count = await indexing_service.delete_page_chunks("https://example.com/page")
+        count = await indexing_service.delete_page_chunks("https://example.com/page", "example.com")
 
         assert count == 0
 

From 0b6f38acc4c60c1f71c0d4157354619608e591f0 Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Fri, 29 May 2026 02:19:47 +0800
Subject: [PATCH 31/41] =?UTF-8?q?fix(cli,convex):=20p2=20hardening=20?=
 =?UTF-8?q?=E2=80=94=20migrate,=20deploy,=20entrypoint,=20volume=20script?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- migrate-config-layout: copy_secret stages via .tale-migrate.<pid>
  tmp + mv -f so a SIGINT mid-copy can't leave a half-written dst
  that the cmp-s guard then refuses to overwrite (operator stuck).
- migrate-config-layout: detect_default_dst_collisions now exits
  non-zero with a clear "MIGRATE_ABORT" message BEFORE process_secret
  iterates. Previously the script logged the conflict but proceeded;
  whichever source iterated first won and end-state depended on dir
  order.
- deploy.ts ORG_SLUG_REGEX gains the {0,63} length cap to match the
  shared platform constant + the dev-compose generator. Without this
  the deploy-side enumerator would push slugs the platform itself
  refuses to mint.
- docker-entrypoint.sh: new atomic_cp_bundle stages bundle dirs into
  .tale-seed.<pid> then renames over dest. Previously integrations
  and skills used raw `cp -r` which left half-populated bundles on
  interruption that the next-run `[ -d dest ]` probe then skipped
  permanently as "already seeded".
- docker-entrypoint.sh workflows loop replaces `&&` + `; continue`
  with an explicit if/else so set -e + the trailing semicolon don't
  silently swallow mkdir/atomic_cp failures. Failed seeds now hit
  log_error instead of producing neither a ✓ nor an error line.
- 2026-03-28-migrate-convex-data.sh accepts --old-volume / --new-volume
  flags and defaults to COMPOSE_PROJECT_NAME-derived names. The
  previous hardcoded `tale_…` shape silently skipped the migration
  for any operator running compose with a different -p name.
---
 scripts/2026-03-28-migrate-convex-data.sh     | 31 ++++++++++-
 services/convex/docker-entrypoint.sh          | 53 +++++++++++++------
 tools/cli/src/lib/actions/deploy.ts           | 12 +++--
 .../src/lib/actions/migrate-config-layout.ts  | 24 ++++++++-
 .../generators/generate-dev-compose.ts        |  8 ++-
 5 files changed, 103 insertions(+), 25 deletions(-)

diff --git a/scripts/2026-03-28-migrate-convex-data.sh b/scripts/2026-03-28-migrate-convex-data.sh
index b377e605d7..fecb882c30 100755
--- a/scripts/2026-03-28-migrate-convex-data.sh
+++ b/scripts/2026-03-28-migrate-convex-data.sh
@@ -24,8 +24,32 @@
 # ============================================================================
 set -euo pipefail
 
-OLD_VOLUME="tale_platform-convex-data"
-NEW_VOLUME="tale_platform-data"
+# Volume names default to the `tale` compose-project shape. Operators
+# running `docker compose -p mycompany …` (common when multiple Tale
+# stacks share a host) need to supply their actual volume names via
+# `--old-volume` / `--new-volume`, or set COMPOSE_PROJECT_NAME — the
+# previous hardcoded `tale_…` silently skipped the migration with a
+# "old volume not found" message that looked like a clean no-op.
+# Round-3 P2 R32-P2-d.
+COMPOSE_PROJECT="${COMPOSE_PROJECT_NAME:-tale}"
+OLD_VOLUME="${COMPOSE_PROJECT}_platform-convex-data"
+NEW_VOLUME="${COMPOSE_PROJECT}_platform-data"
+
+while [ $# -gt 0 ]; do
+  case "$1" in
+    --old-volume) OLD_VOLUME="$2"; shift 2 ;;
+    --new-volume) NEW_VOLUME="$2"; shift 2 ;;
+    --help|-h)
+      echo "Usage: $0 [--old-volume <name>] [--new-volume <name>]"
+      echo ""
+      echo "Defaults to <COMPOSE_PROJECT_NAME or 'tale'>_platform-convex-data and"
+      echo "<COMPOSE_PROJECT_NAME or 'tale'>_platform-data. Run \`docker volume ls\` to find"
+      echo "your project's actual volume names."
+      exit 0 ;;
+    *) echo "Unknown arg: $1" >&2; exit 2 ;;
+  esac
+done
+
 DIRS_TO_MIGRATE="modules files exports snapshot_imports"
 
 echo "📦 Convex data migration (2026-03-28)"
@@ -45,6 +69,9 @@ fi
 
 if ! docker volume inspect "$NEW_VOLUME" > /dev/null 2>&1; then
   echo "❌ New volume '$NEW_VOLUME' not found. Run 'docker compose up -d' first to create it."
+  echo ""
+  echo "Available volumes (use --new-volume <name> if yours has a different prefix):"
+  docker volume ls --format '  - {{ .Name }}' | head -20
   exit 1
 fi
 
diff --git a/services/convex/docker-entrypoint.sh b/services/convex/docker-entrypoint.sh
index 19924b1fb6..aba859b2fb 100755
--- a/services/convex/docker-entrypoint.sh
+++ b/services/convex/docker-entrypoint.sh
@@ -347,6 +347,20 @@ atomic_cp() {
   cp "$src" "$tmp" && mv -f "$tmp" "$dest"
 }
 
+# Crash-safe directory copy: stage into a sibling `.tale-seed.<pid>` dir
+# then atomically rename over the destination after rm-ing any prior
+# dest. cp -r alone leaves a half-populated dest on interruption, and
+# the next-run `[ -d "$dest" ]` check then treats the partial bundle
+# as "already seeded" and skips it permanently. Round-3 P2 R32-P2-c.
+atomic_cp_bundle() {
+  local src_dir="$1" dest_dir="$2"
+  local stage="${dest_dir}.tale-seed.$$"
+  rm -rf "$stage"
+  cp -r "$src_dir" "$stage"
+  rm -rf "$dest_dir"
+  mv "$stage" "$dest_dir"
+}
+
 run_seed() {
   log_section "Seeding builtin configs into default org (TALE_VERSION=${TALE_VERSION:-dev})"
 
@@ -386,17 +400,28 @@ run_seed() {
       local flat_slug="$(echo "$slug" | sed 's|/|__|g')"
       local history_dir="$workflows_dir/.history/$flat_slug"
 
+      # Round-3 P2 R32-P2-b: `if mkdir && atomic_cp; then echo ✓; else log_error` —
+      # the previous `mkdir && atomic_cp && echo; continue` chain silently
+      # swallowed failures under `set -e` because the `; continue` reset
+      # the implicit last-status to 0, so a disk-full / permission denied
+      # in mkdir or atomic_cp produced neither a ✓ line nor an error.
       if [ "$FORCE_SEED" = "true" ]; then
-        # `&&` (not `;`) so a failed mkdir aborts the copy attempt
-        # — otherwise atomic_cp runs against a missing dir and the
-        # diagnostic attributes the fault to the copy.
-        mkdir -p "$dest_dir" && atomic_cp "$src" "$dest" && echo "   ✓ Seeded workflow $rel_path (forced)"; continue
+        if mkdir -p "$dest_dir" && atomic_cp "$src" "$dest"; then
+          echo "   ✓ Seeded workflow $rel_path (forced)"
+        else
+          log_error "   ✗ Failed to seed workflow $rel_path (forced)"
+        fi
+        continue
       fi
       if [ -f "$dest" ]; then echo "   ⏭ Skipping workflow $rel_path (already exists)"; continue; fi
       if [ -d "$history_dir" ] && [ "$(ls -A "$history_dir" 2>/dev/null)" ]; then
         echo "   ⏭ Skipping workflow $rel_path (user has modifications in .history)"; continue
       fi
-      mkdir -p "$dest_dir" && atomic_cp "$src" "$dest" && echo "   ✓ Seeded workflow $rel_path"
+      if mkdir -p "$dest_dir" && atomic_cp "$src" "$dest"; then
+        echo "   ✓ Seeded workflow $rel_path"
+      else
+        log_error "   ✗ Failed to seed workflow $rel_path"
+      fi
     done
   fi
 
@@ -410,14 +435,13 @@ run_seed() {
       local name="$(basename "$src_dir")"
       local dest_dir="$integrations_dir/$name"
       if [ "$FORCE_SEED" = "true" ]; then
-        # rm before cp: without this, `cp -r src/ dest` nests the bundle as
-        # `dest/<name>` instead of overwriting it, leaving stale files and
-        # doubling the on-disk layout per restart.
-        rm -rf "$dest_dir"
-        cp -r "$src_dir" "$dest_dir"; echo "   ✓ Seeded integration $name (forced)"; continue
+        # atomic_cp_bundle stages + renames so an interruption can't leave
+        # a half-populated bundle that the next-run dest-existence probe
+        # would skip permanently.
+        atomic_cp_bundle "$src_dir" "$dest_dir"; echo "   ✓ Seeded integration $name (forced)"; continue
       fi
       if [ -d "$dest_dir" ]; then echo "   ⏭ Skipping integration $name (already exists)"; continue; fi
-      cp -r "$src_dir" "$dest_dir"; echo "   ✓ Seeded integration $name"
+      atomic_cp_bundle "$src_dir" "$dest_dir"; echo "   ✓ Seeded integration $name"
     done
   fi
 
@@ -431,13 +455,10 @@ run_seed() {
       local name="$(basename "$src_dir")"
       local dest_dir="$skills_dir/$name"
       if [ "$FORCE_SEED" = "true" ]; then
-        # rm before cp — same fix as the integrations seed loop above:
-        # without it, FORCE_SEED nests the bundle and leaves stale files.
-        rm -rf "$dest_dir"
-        cp -r "$src_dir" "$dest_dir"; echo "   ✓ Seeded skill $name (forced)"; continue
+        atomic_cp_bundle "$src_dir" "$dest_dir"; echo "   ✓ Seeded skill $name (forced)"; continue
       fi
       if [ -d "$dest_dir" ]; then echo "   ⏭ Skipping skill $name (already exists)"; continue; fi
-      cp -r "$src_dir" "$dest_dir"; echo "   ✓ Seeded skill $name"
+      atomic_cp_bundle "$src_dir" "$dest_dir"; echo "   ✓ Seeded skill $name"
     done
   fi
 
diff --git a/tools/cli/src/lib/actions/deploy.ts b/tools/cli/src/lib/actions/deploy.ts
index 436d74f96c..332609c6ee 100644
--- a/tools/cli/src/lib/actions/deploy.ts
+++ b/tools/cli/src/lib/actions/deploy.ts
@@ -625,10 +625,14 @@ export async function deploy(options: DeployOptions): Promise<void> {
   }
 }
 
-// Org slug shape — must match validateOrgSlug at services/platform/lib/shared/constants/org-slug.ts.
-// Duplicated here because the CLI ships in a single compiled binary that does
-// not import convex sources at runtime.
-const ORG_SLUG_REGEX = /^[a-z0-9][a-z0-9_-]*$/;
+// Org slug shape — must match ORG_SLUG_REGEX at
+// services/platform/lib/shared/constants/org-slug.ts and ORG_SLUG_RE at
+// packages/tale_shared/src/tale_shared/config/org_slug.py. The 64-char
+// cap (round-3 P1) aligns this file with the canonical validator;
+// without it, the deploy-side enumerator would accept slugs the platform
+// itself refuses to mint. Duplicated here because the CLI ships in a
+// single compiled binary that does not import convex sources at runtime.
+const ORG_SLUG_REGEX = /^[a-z0-9][a-z0-9_-]{0,63}$/;
 
 // Top-level names under the project root that are legitimate per-domain
 // dirs from the OLD flat layout (`agents/`, `workflows/`, …). Under
diff --git a/tools/cli/src/lib/actions/migrate-config-layout.ts b/tools/cli/src/lib/actions/migrate-config-layout.ts
index c45a25735b..3c28b84f46 100644
--- a/tools/cli/src/lib/actions/migrate-config-layout.ts
+++ b/tools/cli/src/lib/actions/migrate-config-layout.ts
@@ -241,7 +241,14 @@ copy_secret() {
   fi
   mkdir -p "$dst_dir"
   chown "$APP_UID:$APP_GID" "$dst_dir" 2>/dev/null || true
-  cp -a "$src" "$dst"
+  # Atomic publish via tmp + rename: a SIGINT / container crash mid-cp
+  # would otherwise leave a half-written $dst that blocks re-runs (the
+  # cmp -s above would refuse to overwrite). tmp file lives in the
+  # same dir so the mv stays on one filesystem (POSIX-atomic).
+  # Round-3 P2 R29-P2-c.
+  local tmp="$dst.tale-migrate.$$"
+  cp -a "$src" "$tmp"
+  mv -f "$tmp" "$dst"
   copied=$((copied+1))
   echo "OK: $src -> $dst"
 }
@@ -308,6 +315,21 @@ process_secret() {
 
 detect_default_dst_collisions
 
+# Round-3 P2 R29-P2-b: when the pre-scan finds dst collisions, abort
+# BEFORE running process_secret. Previously the script logged the
+# conflicts but proceeded to enumerate sources anyway; whichever
+# source process_secret happened to hit first won, and the operator's
+# end state depended on dir iteration order.
+if [ "$errors" -gt 0 ]; then
+  echo
+  echo "MIGRATE_ABORT: $errors conflict(s) detected during pre-scan; refusing to proceed." >&2
+  echo "Unresolved conflicts (require manual reconciliation):" >&2
+  for c in "\${conflicts[@]}"; do
+    echo "  - $c" >&2
+  done
+  exit 1
+fi
+
 # Default org: top-level $DATA/providers/*.secrets.json → $DATA/default/providers/
 if [ -d "$DATA/providers" ]; then
   for f in "$DATA"/providers/*.secrets.json; do
diff --git a/tools/cli/src/lib/compose/generators/generate-dev-compose.ts b/tools/cli/src/lib/compose/generators/generate-dev-compose.ts
index 73f0c11913..672ce4f0d7 100644
--- a/tools/cli/src/lib/compose/generators/generate-dev-compose.ts
+++ b/tools/cli/src/lib/compose/generators/generate-dev-compose.ts
@@ -26,8 +26,12 @@ const HOST_DOMAIN_DIRS = [
   'providers',
   'skills',
 ] as const;
-/** Org-slug regex aligned with the platform-side validator. Refuses dotfiles
- *  and any non-org-shaped dir at the project root (`.tale`, `.git`, etc.). */
+/** Org-slug regex aligned with services/platform/lib/shared/constants/org-slug.ts
+ *  and tools/cli/src/lib/actions/deploy.ts (round-3 P1 cap of 64 chars).
+ *  Refuses dotfiles and any non-org-shaped dir at the project root
+ *  (`.tale`, `.git`, etc.). Single source of truth lives in the platform
+ *  package; duplicated here because the CLI binary doesn't import convex
+ *  sources at runtime. */
 const ORG_SLUG_RE = /^[a-z0-9][a-z0-9_-]{0,63}$/;
 
 interface DevComposeOptions {

From 8ee1851e829d6a8bac494619dde3f270b1a0bfdb Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Fri, 29 May 2026 02:26:09 +0800
Subject: [PATCH 32/41] =?UTF-8?q?fix(rag,crawler,platform):=20satisfy=20ch?=
 =?UTF-8?q?eck=20=E2=80=94=20lint=20+=20test=20fixups=20for=20the=20cluste?=
 =?UTF-8?q?r?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- rag_service _safe_close uses contextlib.suppress(TimeoutError) per
  ruff SIM105.
- crawler tests test_website_membership now feed fetchrow as a list
  (websites UPSERT RETURNING + membership INSERT RETURNING) since
  register_website added a second fetchrow when surfacing the stored
  scan_interval.
- platform scaffold seedRetention error-message uses JSON.stringify on
  non-Error targetStatErr to satisfy no-base-to-string.
- platform config-watcher single-file gate replaced with an
  isSingleFileOrgConfig type predicate so stem narrows without an
  unsafe assertion.
---
 .../crawler/tests/test_website_membership.py  | 31 +++++++++++++++----
 .../platform/convex/organizations/scaffold.ts |  6 ++--
 services/platform/lib/config-watcher.ts       | 21 ++++++++-----
 services/rag/app/services/rag_service.py      |  5 ++-
 4 files changed, 43 insertions(+), 20 deletions(-)

diff --git a/services/crawler/tests/test_website_membership.py b/services/crawler/tests/test_website_membership.py
index c14214b53c..0653f8fd4a 100644
--- a/services/crawler/tests/test_website_membership.py
+++ b/services/crawler/tests/test_website_membership.py
@@ -18,11 +18,20 @@
 
 
 def _make_conn(*, fetchval_return=0, execute_return="DELETE 1", fetchrow_return=None):
-    """Build a per-test asyncpg connection stub with configurable returns."""
+    """Build a per-test asyncpg connection stub with configurable returns.
+
+    `fetchrow_return` may be a single value (returned for every fetchrow
+    call) or a list (each call pops the next entry). `register_website`
+    now does two fetchrows — the websites UPSERT (returns scan_interval +
+    status) and the membership insert (returns the `inserted` flag).
+    """
     conn = AsyncMock()
     conn.execute = AsyncMock(return_value=execute_return)
     conn.fetchval = AsyncMock(return_value=fetchval_return)
-    conn.fetchrow = AsyncMock(return_value=fetchrow_return)
+    if isinstance(fetchrow_return, list):
+        conn.fetchrow = AsyncMock(side_effect=list(fetchrow_return))
+    else:
+        conn.fetchrow = AsyncMock(return_value=fetchrow_return)
     # Transactions are no-ops at this layer; just yield the same conn.
     conn.transaction = MagicMock()
     conn.transaction.return_value.__aenter__ = AsyncMock(return_value=None)
@@ -44,7 +53,10 @@ class TestRegisterWebsite:
     async def test_first_membership_reports_first_membership_true(self):
         conn = _make_conn(
             fetchval_return=1,  # total members after insert = 1
-            fetchrow_return={"inserted": True},
+            fetchrow_return=[
+                {"scan_interval": 3600, "status": "idle"},  # websites UPSERT RETURNING
+                {"inserted": True},  # membership INSERT RETURNING
+            ],
         )
         with _patch_acquire(conn):
             manager = PgWebsiteStoreManager(pool=MagicMock())
@@ -57,7 +69,10 @@ async def test_first_membership_reports_first_membership_true(self):
     async def test_second_org_joining_does_not_report_first_membership(self):
         conn = _make_conn(
             fetchval_return=2,  # total members after insert = 2
-            fetchrow_return={"inserted": True},
+            fetchrow_return=[
+                {"scan_interval": 3600, "status": "idle"},
+                {"inserted": True},
+            ],
         )
         with _patch_acquire(conn):
             manager = PgWebsiteStoreManager(pool=MagicMock())
@@ -66,10 +81,14 @@ async def test_second_org_joining_does_not_report_first_membership(self):
         assert result["first_membership"] is False
 
     async def test_idempotent_when_same_org_re_registers(self):
-        # ON CONFLICT DO NOTHING → no RETURNING row, total stays as-is.
+        # ON CONFLICT DO NOTHING on the membership insert → no RETURNING row.
+        # The websites UPSERT still returns its stored row, so feed both.
         conn = _make_conn(
             fetchval_return=1,
-            fetchrow_return=None,
+            fetchrow_return=[
+                {"scan_interval": 3600, "status": "idle"},
+                None,
+            ],
         )
         with _patch_acquire(conn):
             manager = PgWebsiteStoreManager(pool=MagicMock())
diff --git a/services/platform/convex/organizations/scaffold.ts b/services/platform/convex/organizations/scaffold.ts
index ac197cd818..dc240140f9 100644
--- a/services/platform/convex/organizations/scaffold.ts
+++ b/services/platform/convex/organizations/scaffold.ts
@@ -437,11 +437,11 @@ async function seedRetention(
   // result so a deploy can surface it instead of producing a silent
   // clobber.
   if (!targetExists && errnoCode(targetStatErr) !== 'ENOENT' && targetStatErr) {
-    const message = `[scaffold] retention: stat ${targetFile} failed: ${
+    const errDetail =
       targetStatErr instanceof Error
         ? targetStatErr.message
-        : String(targetStatErr)
-    }`;
+        : JSON.stringify(targetStatErr);
+    const message = `[scaffold] retention: stat ${targetFile} failed: ${errDetail}`;
     console.warn(message);
     if (!override) {
       return { domain: 'retention', ok: false, error: message };
diff --git a/services/platform/lib/config-watcher.ts b/services/platform/lib/config-watcher.ts
index e92a447aa0..f6bd398e94 100644
--- a/services/platform/lib/config-watcher.ts
+++ b/services/platform/lib/config-watcher.ts
@@ -32,9 +32,16 @@ const EMIT_DEBOUNCE_MS = 100;
  * Must stay in lockstep with the read-side resolvers — adding a new
  * entry here without a matching reader means the watcher emits events
  * nothing consumes, and adding a reader without an entry here means
- * operator edits silently never invalidate caches.
+ * operator edits silently never invalidate caches. Typed as an array
+ * of literal-type members so membership lookup narrows without a cast.
  */
-const SINGLE_FILE_ORG_CONFIGS: ReadonlySet<string> = new Set(['retention']);
+type SingleFileOrgConfigStem = Extract<ConfigChangeEvent['type'], 'retention'>;
+const SINGLE_FILE_ORG_CONFIGS: ReadonlyArray<SingleFileOrgConfigStem> = [
+  'retention',
+];
+function isSingleFileOrgConfig(stem: string): stem is SingleFileOrgConfigStem {
+  return (SINGLE_FILE_ORG_CONFIGS as ReadonlyArray<string>).includes(stem);
+}
 
 /**
  * Parse a relative path within the config directory into a structured event,
@@ -78,12 +85,10 @@ function parseConfigChange(relativePath: string): ConfigChangeEvent | null {
   // R18-P2-d).
   if (parts.length === 2 && parts[1].endsWith('.json')) {
     const stem = parts[1].slice(0, -'.json'.length);
-    if (SINGLE_FILE_ORG_CONFIGS.has(stem)) {
-      return {
-        type: stem as ConfigChangeEvent['type'],
-        orgSlug,
-        slug: stem,
-      };
+    // `isSingleFileOrgConfig` is a type predicate so `stem` narrows to
+    // a literal that fits ConfigChangeEvent['type'] without a cast.
+    if (isSingleFileOrgConfig(stem)) {
+      return { type: stem, orgSlug, slug: stem };
     }
     return null;
   }
diff --git a/services/rag/app/services/rag_service.py b/services/rag/app/services/rag_service.py
index 63faf2be76..0c3816fc6f 100644
--- a/services/rag/app/services/rag_service.py
+++ b/services/rag/app/services/rag_service.py
@@ -26,6 +26,7 @@
 from __future__ import annotations
 
 import asyncio
+import contextlib
 import datetime as dt
 import time
 from collections import OrderedDict
@@ -108,10 +109,8 @@ async def _safe_close(coro) -> None:
     bounded shutdown drain would cancel the `asyncio.sleep(30)` and the
     wrapped close coroutine would never be awaited.
     """
-    try:
+    with contextlib.suppress(TimeoutError):
         await asyncio.wait_for(_get_shutdown_event().wait(), timeout=30)
-    except TimeoutError:
-        pass
     try:
         await coro
     except Exception:

From 62b1876899250967c17dc9df0ce44e95896881c6 Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Fri, 29 May 2026 11:43:46 +0800
Subject: [PATCH 33/41] =?UTF-8?q?fix(platform):=20round-3=20=E2=80=94=20cl?=
 =?UTF-8?q?ose=20cross-tenant=20gaps=20in=20workflows,=20doc=20get=5Fmetad?=
 =?UTF-8?q?ata,=20reserved-slug?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`workflows/file_actions.ts` was the last sub-system whose public actions
took `organizationId` as a raw arg and resolved it through
`resolveOrgSlug` without first verifying caller membership; any
authenticated user could read or mutate another org's workflows by
passing that org's id. Replace the auth + resolveOrgSlug pair with a
single `requireOrgMembershipById` call on every public action, mirroring
the pattern already used in agents / threads / integrations / providers.
Same edit also tightens two adjacent hazards in the same file:
`renameWorkflow` now refuses to clobber an existing target (the old
`atomicWrite` flow silently overwrote the victim file), and
`readHistoryEntry` / `restoreFromHistory` route the path through
`safeJoinWithinDir` with explicit slug + timestamp validation so the
inline `startsWith` check can't be bypassed via a sibling dir whose name
shares the prefix.

`document_action.get_metadata` was returning `fileName` for any
caller-supplied storage id because `getByStorageId` is a global
`by_storageId` lookup with no org filter, while the sibling
`extract_docx_structured` and `apply_docx_structured` branches already
gate ownership via `verifyStorageIdsBelongToOrg`. Add the same
ownership check inline (compare `fileMetadata.organizationId ===
organizationId`, treat the mismatch as "Unknown") so foreign-org
filenames stop leaking through the workflow steps.

`auth.ts` reserved-slug bypass was too wide: when `anyOrg.length === 0`
it admitted every reserved slug (`default`, `agents`, `branding`,
`providers`, …), so a racing first-signup user on a fresh deploy could
claim e.g. `branding` before the operator created `default` and wedge
the install in the `findOrgDirs` legacy-artifact trap. Narrow the
bypass to only `default` (the one slug the platform's first-run actually
needs); all other reserved slugs are now refused unconditionally.
---
 services/platform/convex/auth.ts              |  26 ++-
 .../action_defs/document/document_action.ts   |  16 +-
 .../platform/convex/workflows/file_actions.ts | 167 ++++++++++++------
 3 files changed, 149 insertions(+), 60 deletions(-)

diff --git a/services/platform/convex/auth.ts b/services/platform/convex/auth.ts
index 92658ef6fc..2f8dbe3f92 100644
--- a/services/platform/convex/auth.ts
+++ b/services/platform/convex/auth.ts
@@ -597,13 +597,27 @@ export const getAuthOptions = (ctx: GenericCtx<DataModel>) => {
               });
             }
 
-            // Refuse reserved slugs ("default") that the platform pins
-            // global resources to (branding, retention defaults).
-            // Without this, an open-signup user could claim "default"
-            // before the platform seed runs and inherit branding-admin.
-            // Exception: the platform's own first-run seed creates
-            // `default` when no orgs exist yet — let that one through.
+            // Refuse reserved slugs ("default", "agents", "branding",
+            // "providers", "retention", "skills", "workflows",
+            // "integrations") — the platform pins on-disk and DB
+            // resources to these names. Without this, an open-signup
+            // user could claim e.g. "branding" before the platform's
+            // first-run seed runs and lock the operator out.
+            //
+            // Narrow first-run bypass: ONLY `default` is auto-claimed
+            // by the platform on first signup; the other reserved
+            // slugs have no legitimate "user wants this on a fresh
+            // deploy" path. A wider bypass (any reserved slug when
+            // anyOrg.length === 0) would let a racing first user claim
+            // e.g. `providers` before the operator creates `default`,
+            // wedging the deployment in `findOrgDirs`' legacy-artifact
+            // trap.
             if (isReservedOrgSlug(normalizedSlug)) {
+              if (normalizedSlug !== 'default') {
+                throw new APIError('BAD_REQUEST', {
+                  message: `Organization slug "${normalizedSlug}" is reserved by the platform.`,
+                });
+              }
               const anyOrg = await ctx.runQuery(
                 components.betterAuth.adapter.findMany,
                 {
diff --git a/services/platform/convex/workflow_engine/action_defs/document/document_action.ts b/services/platform/convex/workflow_engine/action_defs/document/document_action.ts
index 6abc07e777..a7f9df87cf 100644
--- a/services/platform/convex/workflow_engine/action_defs/document/document_action.ts
+++ b/services/platform/convex/workflow_engine/action_defs/document/document_action.ts
@@ -625,6 +625,20 @@ export const documentAction: ActionDefinition<DocumentActionParams> = {
                 : Promise.resolve(undefined),
             ]);
 
+            // Cross-tenant gate: `getByStorageId` is a global `by_storageId`
+            // index lookup with no org filter, so a workflow caller can
+            // supply a foreign-org `_storage` id and read back its
+            // `fileName` unless we gate on `fileMetadata.organizationId`.
+            // The sibling branches `extract_docx_structured` /
+            // `apply_docx_structured` already enforce this via
+            // `verifyStorageIdsBelongToOrg` — mirror that gate here.
+            const ownedMetadata =
+              fileMetadata &&
+              organizationId &&
+              fileMetadata.organizationId === organizationId
+                ? fileMetadata
+                : null;
+
             // Drop the docs-row if the caller doesn't have access to its
             // team. fileMetadata + base name still surface so workflow
             // steps that only need fileName don't break — but team-
@@ -654,7 +668,7 @@ export const documentAction: ActionDefinition<DocumentActionParams> = {
 
             return {
               fileId,
-              fileName: fileMetadata?.fileName ?? 'Unknown',
+              fileName: ownedMetadata?.fileName ?? 'Unknown',
               sourceCreatedAt: visibleDocument?.sourceCreatedAt,
               sourceModifiedAt: visibleDocument?.sourceModifiedAt,
               lastModified,
diff --git a/services/platform/convex/workflows/file_actions.ts b/services/platform/convex/workflows/file_actions.ts
index 472b9df554..4cfe3f4b12 100644
--- a/services/platform/convex/workflows/file_actions.ts
+++ b/services/platform/convex/workflows/file_actions.ts
@@ -12,25 +12,26 @@
 import { mkdir, readdir, rm, stat, unlink } from 'node:fs/promises';
 import path from 'node:path';
 
-import { v } from 'convex/values';
+import { ConvexError, v } from 'convex/values';
 
 import type { WorkflowJsonConfig } from '../../lib/shared/schemas/workflows';
 import { workflowJsonSchema } from '../../lib/shared/schemas/workflows';
 import { internal } from '../_generated/api';
 import { action, internalAction } from '../_generated/server';
-import { authComponent } from '../auth';
+import { requireOrgMembershipById } from '../lib/auth/require_org_membership';
 import {
   atomicWrite,
+  errnoCode,
   generateHistoryTimestamp,
   handleDirReadError,
   pruneHistory,
   readFileSafe,
   readJsonFile,
   readdirSafe,
+  safeJoinWithinDir,
   sha256,
   verifyPathWithinBase,
 } from '../lib/file_io';
-import { resolveOrgSlug } from '../organizations/resolve_org_slug';
 import type { WorkflowReadResult } from './file_utils';
 import {
   MAX_FILE_SIZE_BYTES,
@@ -44,6 +45,16 @@ import {
   workflowSlugFromRelativePath,
 } from './file_utils';
 
+// History filenames are `${Date.now()}-${randomUUID().slice(0,8)}` — see
+// `lib/file_io.ts::generateHistoryTimestamp`. Restrict to that shape so
+// `restoreFromHistory` / `readHistoryEntry` reject anything that could probe
+// outside the per-workflow history dir even before `safeJoinWithinDir` fires.
+const HISTORY_TIMESTAMP_REGEX = /^\d{10,16}-[a-f0-9]{6,16}$/;
+
+function validateHistoryTimestamp(timestamp: string): boolean {
+  return HISTORY_TIMESTAMP_REGEX.test(timestamp);
+}
+
 // ---------------------------------------------------------------------------
 // Internal helpers
 // ---------------------------------------------------------------------------
@@ -162,9 +173,10 @@ export const readWorkflow = action({
   },
   returns: v.any(),
   handler: async (ctx, args): Promise<WorkflowReadResult> => {
-    const authUser = await authComponent.getAuthUser(ctx);
-    if (!authUser) throw new Error('Unauthenticated');
-    const orgSlug = await resolveOrgSlug(ctx, args.organizationId);
+    const { orgSlug } = await requireOrgMembershipById(
+      ctx,
+      args.organizationId,
+    );
     return readWorkflowFile(orgSlug, args.workflowSlug);
   },
 });
@@ -179,10 +191,10 @@ export const listWorkflows = action({
   returns: v.any(),
   // oxlint-disable-next-line typescript/no-explicit-any -- listWorkflows returns heterogeneous shapes; v.any() at API boundary
   handler: async (ctx, args): Promise<any[]> => {
-    const authUser = await authComponent.getAuthUser(ctx);
-    if (!authUser) throw new Error('Unauthenticated');
-
-    const orgSlug = await resolveOrgSlug(ctx, args.organizationId);
+    const { orgSlug } = await requireOrgMembershipById(
+      ctx,
+      args.organizationId,
+    );
     const filterMode = args.filter ?? 'all';
     const dir = resolveWorkflowsDir(orgSlug);
     let entries: { name: string; parentPath: string; isDirectory: boolean }[];
@@ -273,14 +285,15 @@ export const saveWorkflowWithSnapshot = action({
   },
   returns: v.object({ hash: v.string() }),
   handler: async (ctx, args): Promise<{ hash: string }> => {
-    const authUser = await authComponent.getAuthUser(ctx);
-    if (!authUser) throw new Error('Unauthenticated');
+    const { orgSlug } = await requireOrgMembershipById(
+      ctx,
+      args.organizationId,
+    );
 
     if (!validateWorkflowSlug(args.workflowSlug)) {
       throw new Error(`Invalid workflow slug: ${args.workflowSlug}`);
     }
 
-    const orgSlug = await resolveOrgSlug(ctx, args.organizationId);
     const config = workflowJsonSchema.parse(args.config);
     const newContent = serializeWorkflowJson(config);
     const filePath = resolveWorkflowFilePath(orgSlug, args.workflowSlug);
@@ -320,14 +333,15 @@ export const deleteWorkflow = action({
   },
   returns: v.null(),
   handler: async (ctx, args): Promise<null> => {
-    const authUser = await authComponent.getAuthUser(ctx);
-    if (!authUser) throw new Error('Unauthenticated');
+    const { orgSlug } = await requireOrgMembershipById(
+      ctx,
+      args.organizationId,
+    );
 
     if (!validateWorkflowSlug(args.workflowSlug)) {
       throw new Error(`Invalid workflow slug: ${args.workflowSlug}`);
     }
 
-    const orgSlug = await resolveOrgSlug(ctx, args.organizationId);
     const filePath = resolveWorkflowFilePath(orgSlug, args.workflowSlug);
     const historyDir = resolveHistoryDir(orgSlug, args.workflowSlug);
 
@@ -354,14 +368,15 @@ export const installWorkflow = action({
   },
   returns: v.object({ hash: v.string() }),
   handler: async (ctx, args): Promise<{ hash: string }> => {
-    const authUser = await authComponent.getAuthUser(ctx);
-    if (!authUser) throw new Error('Unauthenticated');
+    const { orgSlug, userId, email } = await requireOrgMembershipById(
+      ctx,
+      args.organizationId,
+    );
 
     if (!validateWorkflowSlug(args.workflowSlug)) {
       throw new Error(`Invalid workflow slug: ${args.workflowSlug}`);
     }
 
-    const orgSlug = await resolveOrgSlug(ctx, args.organizationId);
     const result = await readWorkflowFile(orgSlug, args.workflowSlug);
     if (!result.ok) {
       throw new Error(`Cannot install workflow: ${result.message}`);
@@ -370,7 +385,7 @@ export const installWorkflow = action({
     await ctx.runMutation(internal.workflows.installations.upsertInstallation, {
       organizationId: args.organizationId,
       workflowSlug: args.workflowSlug,
-      installedBy: authUser.email ?? String(authUser._id),
+      installedBy: email !== '' ? email : userId,
       contentHash: result.hash,
     });
 
@@ -385,8 +400,7 @@ export const uninstallWorkflow = action({
   },
   returns: v.null(),
   handler: async (ctx, args): Promise<null> => {
-    const authUser = await authComponent.getAuthUser(ctx);
-    if (!authUser) throw new Error('Unauthenticated');
+    await requireOrgMembershipById(ctx, args.organizationId);
 
     if (!validateWorkflowSlug(args.workflowSlug)) {
       throw new Error(`Invalid workflow slug: ${args.workflowSlug}`);
@@ -408,10 +422,11 @@ export const duplicateWorkflow = action({
   },
   returns: v.object({ newSlug: v.string() }),
   handler: async (ctx, args): Promise<{ newSlug: string }> => {
-    const authUser = await authComponent.getAuthUser(ctx);
-    if (!authUser) throw new Error('Unauthenticated');
+    const { orgSlug, userId, email } = await requireOrgMembershipById(
+      ctx,
+      args.organizationId,
+    );
 
-    const orgSlug = await resolveOrgSlug(ctx, args.organizationId);
     const source = await readWorkflowFile(orgSlug, args.workflowSlug);
     if (!source.ok) {
       throw new Error(`Cannot duplicate: ${source.message}`);
@@ -450,7 +465,7 @@ export const duplicateWorkflow = action({
     await ctx.runMutation(internal.workflows.installations.upsertInstallation, {
       organizationId: args.organizationId,
       workflowSlug: newSlug,
-      installedBy: authUser.email ?? String(authUser._id),
+      installedBy: email !== '' ? email : userId,
       contentHash: sha256(content),
     });
 
@@ -466,8 +481,10 @@ export const renameWorkflow = action({
   },
   returns: v.null(),
   handler: async (ctx, args): Promise<null> => {
-    const authUser = await authComponent.getAuthUser(ctx);
-    if (!authUser) throw new Error('Unauthenticated');
+    const { orgSlug } = await requireOrgMembershipById(
+      ctx,
+      args.organizationId,
+    );
 
     if (!validateWorkflowSlug(args.oldSlug)) {
       throw new Error(`Invalid old slug: ${args.oldSlug}`);
@@ -476,7 +493,6 @@ export const renameWorkflow = action({
       throw new Error(`Invalid new slug: ${args.newSlug}`);
     }
 
-    const orgSlug = await resolveOrgSlug(ctx, args.organizationId);
     const oldPath = resolveWorkflowFilePath(orgSlug, args.oldSlug);
     const newPath = resolveWorkflowFilePath(orgSlug, args.newSlug);
     const baseDir = resolveWorkflowsDir(orgSlug);
@@ -488,6 +504,21 @@ export const renameWorkflow = action({
     if (!content) throw new Error('Workflow not found');
     parseWorkflowJson(content);
 
+    // Refuse to clobber an existing target. `atomicWrite` resolves to a
+    // rename-from-temp under the hood, which silently overwrites — without
+    // this guard, two concurrent renames or a typo could destroy the target
+    // workflow's content with no way to recover.
+    const targetExists = await stat(newPath).then(
+      () => true,
+      (err) => {
+        if (errnoCode(err) === 'ENOENT') return false;
+        throw err;
+      },
+    );
+    if (targetExists) {
+      throw new Error(`Target workflow already exists: ${args.newSlug}`);
+    }
+
     await mkdir(path.dirname(newPath), { recursive: true });
     await atomicWrite(newPath, content);
     await unlink(oldPath);
@@ -539,10 +570,15 @@ export const listHistory = action({
   },
   returns: v.any(),
   handler: async (ctx, args) => {
-    const authUser = await authComponent.getAuthUser(ctx);
-    if (!authUser) throw new Error('Unauthenticated');
+    const { orgSlug } = await requireOrgMembershipById(
+      ctx,
+      args.organizationId,
+    );
+
+    if (!validateWorkflowSlug(args.workflowSlug)) {
+      throw new Error(`Invalid workflow slug: ${args.workflowSlug}`);
+    }
 
-    const orgSlug = await resolveOrgSlug(ctx, args.organizationId);
     const historyDir = resolveHistoryDir(orgSlug, args.workflowSlug);
     const entries = await readdirSafe(historyDir);
 
@@ -568,18 +604,21 @@ export const readHistoryEntry = action({
   },
   returns: v.any(),
   handler: async (ctx, args) => {
-    const authUser = await authComponent.getAuthUser(ctx);
-    if (!authUser) throw new Error('Unauthenticated');
-
-    const orgSlug = await resolveOrgSlug(ctx, args.organizationId);
-    const historyDir = resolveHistoryDir(orgSlug, args.workflowSlug);
-    const filePath = path.join(historyDir, `${args.timestamp}.json`);
+    const { orgSlug } = await requireOrgMembershipById(
+      ctx,
+      args.organizationId,
+    );
 
-    const resolved = path.resolve(filePath);
-    if (!resolved.startsWith(path.resolve(historyDir))) {
-      throw new Error('Path traversal detected');
+    if (!validateWorkflowSlug(args.workflowSlug)) {
+      throw new Error(`Invalid workflow slug: ${args.workflowSlug}`);
+    }
+    if (!validateHistoryTimestamp(args.timestamp)) {
+      throw new Error(`Invalid history timestamp: ${args.timestamp}`);
     }
 
+    const historyDir = resolveHistoryDir(orgSlug, args.workflowSlug);
+    const filePath = safeJoinWithinDir(historyDir, `${args.timestamp}.json`);
+
     const content = await readFileSafe(filePath);
     if (!content) {
       return {
@@ -606,19 +645,22 @@ export const restoreFromHistory = action({
   },
   returns: v.object({ hash: v.string() }),
   handler: async (ctx, args): Promise<{ hash: string }> => {
-    const authUser = await authComponent.getAuthUser(ctx);
-    if (!authUser) throw new Error('Unauthenticated');
+    const { orgSlug } = await requireOrgMembershipById(
+      ctx,
+      args.organizationId,
+    );
+
+    if (!validateWorkflowSlug(args.workflowSlug)) {
+      throw new Error(`Invalid workflow slug: ${args.workflowSlug}`);
+    }
+    if (!validateHistoryTimestamp(args.timestamp)) {
+      throw new Error(`Invalid history timestamp: ${args.timestamp}`);
+    }
 
-    const orgSlug = await resolveOrgSlug(ctx, args.organizationId);
     const historyDir = resolveHistoryDir(orgSlug, args.workflowSlug);
-    const historyPath = path.join(historyDir, `${args.timestamp}.json`);
+    const historyPath = safeJoinWithinDir(historyDir, `${args.timestamp}.json`);
     const workflowPath = resolveWorkflowFilePath(orgSlug, args.workflowSlug);
 
-    const resolved = path.resolve(historyPath);
-    if (!resolved.startsWith(path.resolve(historyDir))) {
-      throw new Error('Path traversal detected');
-    }
-
     const historyContent = await readFileSafe(historyPath);
     if (!historyContent) throw new Error('History entry not found');
     parseWorkflowJson(historyContent);
@@ -765,10 +807,29 @@ export const getAvailableWorkflows = action({
     }),
   ),
   handler: async (ctx, args) => {
-    const authUser = await authComponent.getAuthUser(ctx);
-    if (!authUser) return [];
+    // This action populates UI choices; non-members are a normal case (org
+    // switched away, just-joined) and should see an empty list rather than an
+    // error toast. Catch the auth `ConvexError` codes here and return [], but
+    // let unexpected errors propagate.
+    let orgSlug: string;
+    try {
+      ({ orgSlug } = await requireOrgMembershipById(ctx, args.organizationId));
+    } catch (err) {
+      if (err instanceof ConvexError) {
+        // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- ConvexError.data is typed `any` upstream; we only read an optional `code` string
+        const data = err.data as { code?: string } | undefined;
+        const code = data?.code;
+        if (
+          code === 'UNAUTHENTICATED' ||
+          code === 'ORG_NOT_FOUND' ||
+          code === 'ORG_FORBIDDEN'
+        ) {
+          return [];
+        }
+      }
+      throw err;
+    }
 
-    const orgSlug = await resolveOrgSlug(ctx, args.organizationId);
     const dir = resolveWorkflowsDir(orgSlug);
     let raw;
     try {

From 569dcebafe44b984510414100be75c62cf726bf6 Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Fri, 29 May 2026 11:44:08 +0800
Subject: [PATCH 34/41] =?UTF-8?q?refactor(platform):=20round-3=20=E2=80=94?=
 =?UTF-8?q?=20typed=20orgSlugUnresolvable=20+=20RAG=20strip=20centralizati?=
 =?UTF-8?q?on?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two cross-cutting cleanups shipped together because the agent-tool RAG
files end up touching both:

(1) `lib/helpers/org_slug.ts` gains an `OrgSlugUnresolvableError` typed
    error and an `orgSlugFromIdOrNull` variant. Round-3 review found
    that ~29 of the 32 `orgSlugFromId` call sites either didn't catch
    the throw at all or caught it inside a try-block designed for some
    other error class, so a permanent slug miss (deleted-org race,
    replica skew) cross-contaminated unrelated work — abort of GDPR
    cascades, mis-classification of transcripts as failed, etc. The
    new helper preserves the throwing variant for security gates that
    must fail loud (workflow steps, agent-facing tool errors) and
    introduces a `*OrNull` form for the cascade / cleanup / multi-org
    batch sites where a missing slug is recoverable.

    Subsequent commits migrate the high-impact call sites.

(2) `format_search_results.ts` now strips reserved prompt tags inside
    the helper for every prompt-bound field (`content`, `filename`,
    `file_id`) at the single chokepoint, so a future caller can't
    forget the strip. Round-3 review found that 4 separate call sites
    were each stripping `r.content` independently and `query_rag_context`
    (the highest-volume chat auto-context path) was stripping nothing
    at all — a prompt-injection regression waiting to happen.

`rag_search_tool.ts` retrieve and search branches:
  - Use `orgSlugFromIdOrNull` and return the safe-summary shape
    ("Knowledge base temporarily unavailable.") instead of letting the
    raw `[orgSlugFromId] organization "..." has no slug` text bubble
    into the agent loop and onward to the UI toast.
  - Wrap the retrieve branch's network + parsing in try/catch.
  - Drop the now-redundant local `stripReservedPromptTags(r.content)`.
  - Non-`UpstreamHttpError` catches return the safe summary instead of
    re-throwing.

`query_rag_context.ts` inner catch now re-throws `UpstreamHttpError` so
the 4xx "auth misconfigured" signal escapes the graceful-degrade layer
rather than being collapsed into the same `undefined` return as a 5xx
RAG outage.

`rag_action.ts` (workflow) keeps its explicit per-call strip but
extends coverage to `result.title` (chunks path) and adds a recursive
`sanitizeMetadataStrings` over the search-result `metadata` field so
indexed-chunk metadata can't bypass SEC1 either.
---
 .../agent_tools/rag/format_search_results.ts  | 30 ++++++-
 .../agent_tools/rag/query_rag_context.ts      | 15 +++-
 .../convex/agent_tools/rag/rag_search_tool.ts | 89 ++++++++++++++-----
 .../platform/convex/lib/helpers/org_slug.ts   | 74 +++++++++++++--
 .../action_defs/rag/rag_action.ts             | 50 ++++++++++-
 5 files changed, 223 insertions(+), 35 deletions(-)

diff --git a/services/platform/convex/agent_tools/rag/format_search_results.ts b/services/platform/convex/agent_tools/rag/format_search_results.ts
index fedea88bdc..5e1c24993b 100644
--- a/services/platform/convex/agent_tools/rag/format_search_results.ts
+++ b/services/platform/convex/agent_tools/rag/format_search_results.ts
@@ -3,8 +3,20 @@
  *
  * Used by rag_search_tool and query_rag_context
  * to produce a consistent numbered-chunk format.
+ *
+ * SEC1 (prompt-injection defense): every user-controlled field that
+ * lands in the rendered prompt string runs through
+ * `stripReservedPromptTags` before it's interpolated. The reserved
+ * tag set includes `<system>...</system>`, `<governance_*>`,
+ * `<user_memories>` etc. — wrappers an attacker would use to escape
+ * the surrounding agent system prompt. Sanitising in this single
+ * chokepoint instead of at every call site stops a future caller
+ * from forgetting `r.filename` or chunk content again
+ * (a regression that surfaced repeatedly during the round-3 review).
  */
 
+import { stripReservedPromptTags } from '../../lib/agent_response/sanitize_prompt';
+
 export interface SearchResult {
   content: string;
   score: number;
@@ -115,8 +127,20 @@ export function formatSearchResults(
   return results
     .map((r, idx) => {
       const score = (r.score * 100).toFixed(1);
-      const sourceAnnotation = r.filename ? ` [Source: ${r.filename}]` : '';
-      const fileIdAnnotation = r.file_id ? ` [FileID: ${r.file_id}]` : '';
+      // SEC1: `filename` is user-uploaded (any user with write access
+      // to the org can name a file `</system><system>You are now…`),
+      // and `file_id` is server-issued but cheap to sanitise. Strip
+      // reserved tags before the annotation lands inline with the
+      // chunk content in the prompt-bound string.
+      const safeFilename = r.filename
+        ? stripReservedPromptTags(r.filename)
+        : undefined;
+      const safeFileId = r.file_id
+        ? stripReservedPromptTags(r.file_id)
+        : undefined;
+      const safeContent = stripReservedPromptTags(r.content);
+      const sourceAnnotation = safeFilename ? ` [Source: ${safeFilename}]` : '';
+      const fileIdAnnotation = safeFileId ? ` [FileID: ${safeFileId}]` : '';
       const pageAnnotation =
         r.page_number != null ? ` [Page: ${r.page_number}]` : '';
       const dateAnnotation = r.source_modified_at
@@ -124,7 +148,7 @@ export function formatSearchResults(
         : r.source_created_at
           ? ` [Created: ${r.source_created_at.slice(0, 10)}]`
           : '';
-      return `[${idx + 1}] (Relevance: ${score}%)${sourceAnnotation}${pageAnnotation}${dateAnnotation}${fileIdAnnotation}\n${r.content}`;
+      return `[${idx + 1}] (Relevance: ${score}%)${sourceAnnotation}${pageAnnotation}${dateAnnotation}${fileIdAnnotation}\n${safeContent}`;
     })
     .join('\n\n---\n\n');
 }
diff --git a/services/platform/convex/agent_tools/rag/query_rag_context.ts b/services/platform/convex/agent_tools/rag/query_rag_context.ts
index ccd80b9bc5..f3d1fe74db 100644
--- a/services/platform/convex/agent_tools/rag/query_rag_context.ts
+++ b/services/platform/convex/agent_tools/rag/query_rag_context.ts
@@ -11,7 +11,10 @@
 
 import { fetchJson } from '../../../lib/utils/type-cast-helpers';
 import { createDebugLog } from '../../lib/debug_log';
-import { UpstreamHttpError } from '../../lib/errors/upstream_http_error';
+import {
+  UpstreamHttpError,
+  isUpstreamHttpError,
+} from '../../lib/errors/upstream_http_error';
 import { getRagConfig, ragFetch } from '../../lib/helpers/rag_config';
 import {
   extractCitationsFromSearchResults,
@@ -281,6 +284,16 @@ export async function queryRagContext(
     } catch (fetchError) {
       clearTimeout(timeoutId);
 
+      // Caller/config bugs (4xx → `UpstreamHttpError`) MUST propagate
+      // past this graceful-degrade layer. Otherwise the explicit throw
+      // for missing `X-Tale-Org` / bad query in the `!response.ok`
+      // branch above is silently swallowed here and the agent treats
+      // "auth misconfigured" as "knowledge base is empty" — the very
+      // failure mode the upstream throw was added to prevent.
+      if (isUpstreamHttpError(fetchError)) {
+        throw fetchError;
+      }
+
       // Handle timeout specifically
       if (fetchError instanceof Error && fetchError.name === 'AbortError') {
         console.error('[rag_query] RAG service request timeout', {
diff --git a/services/platform/convex/agent_tools/rag/rag_search_tool.ts b/services/platform/convex/agent_tools/rag/rag_search_tool.ts
index e427d576e8..20fd1deaf3 100644
--- a/services/platform/convex/agent_tools/rag/rag_search_tool.ts
+++ b/services/platform/convex/agent_tools/rag/rag_search_tool.ts
@@ -25,7 +25,7 @@ import {
   isUpstreamHttpError,
   UpstreamHttpError,
 } from '../../lib/errors/upstream_http_error';
-import { orgSlugFromId } from '../../lib/helpers/org_slug';
+import { orgSlugFromIdOrNull } from '../../lib/helpers/org_slug';
 import { ragFetch } from '../../lib/helpers/rag_config';
 import { toId } from '../../lib/type_cast_helpers';
 import { wrapUntrusted } from '../../lib/untrusted_content';
@@ -280,11 +280,43 @@ RESPONSE (list_indexed):
           chunkEnd: end,
         });
 
-        const retrieveOrgSlug = await orgSlugFromId(ctx, orgIdRetrieve);
-        const response = await ragFetch(
-          `/api/v1/documents/${encodeURIComponent(args.fileId)}/content?return_chunks=true&chunk_start=${start}&chunk_end=${end}`,
-          { orgSlug: retrieveOrgSlug },
-        );
+        // Org-slug miss is terminal — surface as a safe summary, not
+        // a tool-runtime throw (which would propagate to the agent
+        // loop as an opaque error). Same shape as the search branch
+        // catch below.
+        const retrieveOrgSlug = await orgSlugFromIdOrNull(ctx, orgIdRetrieve);
+        if (retrieveOrgSlug === null) {
+          return {
+            success: false,
+            response: 'Knowledge base temporarily unavailable.',
+          };
+        }
+
+        let response: Response;
+        try {
+          response = await ragFetch(
+            `/api/v1/documents/${encodeURIComponent(args.fileId)}/content?return_chunks=true&chunk_start=${start}&chunk_end=${end}`,
+            { orgSlug: retrieveOrgSlug },
+          );
+        } catch (fetchError) {
+          // ragFetch SSRF guard, abort, network, etc. — agent-facing
+          // tool path returns a safe summary instead of throwing so
+          // the agent loop can recover with a user-visible "not
+          // reachable" message.
+          if (isUpstreamHttpError(fetchError)) {
+            return { success: false, response: fetchError.safeMessage };
+          }
+          console.error('[tool:rag_search retrieve] fetch error', {
+            error:
+              fetchError instanceof Error
+                ? fetchError.message
+                : String(fetchError),
+          });
+          return {
+            success: false,
+            response: 'Knowledge base temporarily unavailable.',
+          };
+        }
 
         if (!response.ok) {
           const errorText = await response.text().catch(() => '');
@@ -451,7 +483,17 @@ RESPONSE (list_indexed):
         if (!orgIdForSearch) {
           throw new Error('rag_search requires organizationId in ToolCtx.');
         }
-        const searchOrgSlug = await orgSlugFromId(ctx, orgIdForSearch);
+        // OrNull so a deleted-org / replica-skew slug miss returns the
+        // agent-friendly safe summary (caught below) rather than the
+        // raw `[orgSlugFromId] organization "..." has no slug` text
+        // bubbling to the model context.
+        const searchOrgSlug = await orgSlugFromIdOrNull(ctx, orgIdForSearch);
+        if (searchOrgSlug === null) {
+          return {
+            success: false,
+            response: 'Knowledge base temporarily unavailable.',
+          };
+        }
         const response = await ragFetch('/api/v1/search', {
           method: 'POST',
           headers: { 'Content-Type': 'application/json' },
@@ -459,6 +501,7 @@ RESPONSE (list_indexed):
           timeoutMs: SEARCH_TIMEOUT_MS,
           orgSlug: searchOrgSlug,
         });
+        // searchOrgSlug guaranteed non-null past the OrNull guard above
 
         if (!response.ok) {
           const errorText = await response.text();
@@ -472,16 +515,13 @@ RESPONSE (list_indexed):
 
         const result = await fetchJson<SearchResponse>(response);
 
-        // SEC1: project-attached docs (and any user-uploaded RAG file) may
-        // contain `<system>…</system>` or other reserved wrapper tags an
-        // attacker crafted to escape the agent's surrounding system prompt.
-        // Strip reserved patterns from every hit's content before further
-        // wrapping/formatting. Pure removal (no XML escape) so legitimate
-        // code blocks / JSON / HTML examples in trusted docs are unaffected.
-        const sanitizedResults: SearchResult[] = result.results.map((r) => ({
-          ...r,
-          content: stripReservedPromptTags(r.content),
-        }));
+        // SEC1 (prompt-injection defense): `<system>…</system>` and the
+        // other reserved-wrapper tag stripping happens inside
+        // `formatSearchResults` itself now (single chokepoint covers
+        // both `content` and `filename`/`file_id` annotations). Keeping
+        // the strip there means a future caller can't forget it.
+        // Video-link wrapping below still uses the raw content because
+        // the strip runs on the wrapped string at format time.
 
         // Prompt-injection defense: per-hit wrap for any result that maps
         // back to a video-link transcript. The RAG service returns chunk
@@ -489,12 +529,14 @@ RESPONSE (list_indexed):
         // wrapping it lands in the agent context outside the TRUST RULES
         // system prompt's reach. Batch-query Convex once with all hit
         // file_ids; non-video-link hits are unchanged.
-        const candidateIds = sanitizedResults
+        const candidateIds = result.results
           .map((r) => r.file_id)
           .filter(
             (id): id is string => typeof id === 'string' && id.length > 0,
           );
-        const wrappedResults: SearchResult[] = sanitizedResults;
+        const wrappedResults: SearchResult[] = result.results.map((r) => ({
+          ...r,
+        }));
         if (candidateIds.length > 0) {
           const videoSourcesSearch = await ctx.runQuery(
             internal.file_metadata.internal_queries.lookupVideoLinkSources,
@@ -595,11 +637,18 @@ RESPONSE (list_indexed):
           });
           return { success: false, response: error.safeMessage };
         }
+        // Non-upstream errors (transport, parse, scope-lookup throws,
+        // etc.) used to rethrow here, surfacing raw internal messages
+        // to the agent loop. Return a neutral safe summary instead;
+        // engineer triage still has the full error in the log line.
         console.error('[tool:rag_search] error', {
           query: args.query,
           error: error instanceof Error ? error.message : String(error),
         });
-        throw error;
+        return {
+          success: false,
+          response: 'Knowledge base temporarily unavailable.',
+        };
       }
     },
   }),
diff --git a/services/platform/convex/lib/helpers/org_slug.ts b/services/platform/convex/lib/helpers/org_slug.ts
index 22f679c7fd..e99909295b 100644
--- a/services/platform/convex/lib/helpers/org_slug.ts
+++ b/services/platform/convex/lib/helpers/org_slug.ts
@@ -19,6 +19,33 @@ type CtxWithRunQuery = {
   runQuery: (...args: any[]) => Promise<unknown>;
 };
 
+/**
+ * Terminal lookup failure: the org row was not found, or exists but has
+ * no `slug` field. Both conditions are permanent — retrying will not
+ * succeed, so callers (`orgSlugFromIdOrNull`, retry-on-throw layers)
+ * should treat this distinctly from transient transport errors.
+ */
+export class OrgSlugUnresolvableError extends Error {
+  override readonly name = 'OrgSlugUnresolvableError';
+
+  constructor(
+    readonly organizationId: string,
+    readonly reason: 'no_row' | 'no_slug',
+  ) {
+    super(
+      reason === 'no_row'
+        ? `[orgSlugFromId] no organization row found for id ${JSON.stringify(organizationId)}`
+        : `[orgSlugFromId] organization ${JSON.stringify(organizationId)} has no slug`,
+    );
+  }
+}
+
+export function isOrgSlugUnresolvable(
+  err: unknown,
+): err is OrgSlugUnresolvableError {
+  return err instanceof OrgSlugUnresolvableError;
+}
+
 /**
  * Resolve an organizationId to its slug via Better Auth.
  *
@@ -35,7 +62,11 @@ type CtxWithRunQuery = {
  * org B's slug, then use it as the `X-Tale-Org` header on a downstream
  * RAG/crawler call.
  *
- * Throws if no matching org row exists, or if the row has no slug.
+ * Throws `OrgSlugUnresolvableError` when the row is missing or has no
+ * slug; transport errors (Better Auth adapter failure, network blip)
+ * propagate as themselves. Callers that want to fold the terminal-miss
+ * case into a `null` result (cascade cleanup, governance, multi-org
+ * status batches) should use `orgSlugFromIdOrNull`.
  */
 export async function orgSlugFromId(
   ctx: CtxWithRunQuery,
@@ -46,15 +77,44 @@ export async function orgSlugFromId(
     where: [{ field: '_id', value: organizationId, operator: 'eq' }],
   });
   if (!isRecord(row)) {
-    throw new Error(
-      `[orgSlugFromId] no organization row found for id ${JSON.stringify(organizationId)}`,
-    );
+    throw new OrgSlugUnresolvableError(organizationId, 'no_row');
   }
   const slug = getString(row, 'slug');
   if (!slug) {
-    throw new Error(
-      `[orgSlugFromId] organization ${JSON.stringify(organizationId)} has no slug`,
-    );
+    throw new OrgSlugUnresolvableError(organizationId, 'no_slug');
   }
   return slug;
 }
+
+/**
+ * Variant of `orgSlugFromId` that returns `null` on terminal lookup
+ * failure (row missing, no slug field) instead of throwing. Transient
+ * errors (transport, adapter exceptions) still propagate.
+ *
+ * Use this from callers where "the org is gone" is a recoverable state
+ * — typically anything that runs after the org may have been deleted:
+ *
+ *   - Background cascade cleanup (`threads/cascade_helpers.ts`)
+ *   - GDPR subject erasure cascade (`governance/erasure.ts`)
+ *   - Retention sweeps (`governance/retention_cleanup.ts`)
+ *   - Multi-org status batches that should not abort on one bad org
+ *     (`file_metadata/actions.ts`)
+ *   - Polling/retry actions where a missing slug should stop the
+ *     retry loop rather than reschedule indefinitely
+ *     (`documents/internal_actions.ts::deleteDocumentFromRag`)
+ *
+ * Callers that NEED the slug (agent tools, user-initiated reads/writes
+ * that must reach RAG/crawler with the X-Tale-Org header) should keep
+ * using `orgSlugFromId` so the throw bubbles up to a user-facing error.
+ */
+export async function orgSlugFromIdOrNull(
+  ctx: CtxWithRunQuery,
+  organizationId: string,
+): Promise<string | null> {
+  try {
+    return await orgSlugFromId(ctx, organizationId);
+  } catch (err) {
+    if (isOrgSlugUnresolvable(err)) return null;
+    throw err;
+  }
+}
diff --git a/services/platform/convex/workflow_engine/action_defs/rag/rag_action.ts b/services/platform/convex/workflow_engine/action_defs/rag/rag_action.ts
index 448f35e7bd..eba5a279ab 100644
--- a/services/platform/convex/workflow_engine/action_defs/rag/rag_action.ts
+++ b/services/platform/convex/workflow_engine/action_defs/rag/rag_action.ts
@@ -18,6 +18,33 @@ import { uploadDocument } from './helpers/upload_document';
 
 const SEARCH_TIMEOUT_MS = 30_000;
 
+/**
+ * Recursively run `stripReservedPromptTags` over every string leaf of
+ * a search-result `metadata` payload. Non-string values are passed
+ * through unchanged. Used to strip prompt-injection vectors from
+ * indexed-chunk metadata (titles, headings, captions, etc.) before
+ * the workflow step returns the result to downstream templates.
+ */
+function sanitizeMetadataStrings(
+  value: Record<string, unknown>,
+): Record<string, unknown> {
+  const out: Record<string, unknown> = {};
+  for (const [key, val] of Object.entries(value)) {
+    out[key] = sanitizeMetadataLeaf(val);
+  }
+  return out;
+}
+
+function sanitizeMetadataLeaf(value: unknown): unknown {
+  if (typeof value === 'string') return stripReservedPromptTags(value);
+  if (Array.isArray(value)) return value.map(sanitizeMetadataLeaf);
+  if (value && typeof value === 'object') {
+    // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- runtime guard above narrows to object; metadata is a free-form JSON record from RAG
+    return sanitizeMetadataStrings(value as Record<string, unknown>);
+  }
+  return value;
+}
+
 export const ragAction: ActionDefinition<RagActionParams> = {
   type: 'rag',
   title: 'RAG Document Manager',
@@ -101,10 +128,16 @@ export const ragAction: ActionDefinition<RagActionParams> = {
         // workflow's downstream system prompt. Strip BEFORE any further
         // wrapping (the video-link `wrapUntrusted` then layers on top).
         // Mirrors `rag_search_tool.ts:319` (agent-tool retrieve path).
+        // Also strip the document `title` — it's the user-uploaded
+        // filename and flows into downstream template renderings the
+        // same way `r.content` does.
         result.chunks = result.chunks.map((c) => ({
           ...c,
           content: stripReservedPromptTags(c.content),
         }));
+        if (result.title) {
+          result.title = stripReservedPromptTags(result.title);
+        }
         // Prompt-injection defense: video-link-sourced chunks contain
         // attacker-controlled transcript text. Mirror the wrap that
         // `rag_search_tool.ts` applies on the agent-tool side.
@@ -162,13 +195,22 @@ export const ragAction: ActionDefinition<RagActionParams> = {
           }
 
           const result = await fetchJson<SearchResponse>(response);
-          // SEC1: strip reserved wrapper tags from every search hit
-          // BEFORE further processing. Mirrors `rag_search_tool.ts:483`
-          // (agent-tool search path). The subsequent video-link
-          // `wrapUntrusted` layers on top of the stripped content.
+          // SEC1: strip reserved wrapper tags from every prompt-bound
+          // field on each search hit. `content` is the obvious one;
+          // `filename` is user-uploaded (any user with write access
+          // can name a file `</system><system>…`) and `metadata`
+          // string values come back from the indexed-chunk payload —
+          // both end up in downstream workflow templates the same way
+          // `content` does, so all three need the same defense.
           let wrappedResults = result.results.map((r) => ({
             ...r,
             content: stripReservedPromptTags(r.content),
+            ...(r.filename
+              ? { filename: stripReservedPromptTags(r.filename) }
+              : {}),
+            ...(r.metadata
+              ? { metadata: sanitizeMetadataStrings(r.metadata) }
+              : {}),
           }));
           if (wrappedResults.length > 0) {
             const fileIds = wrappedResults

From d2d74a60342021f55127dfef1b3a295a0c92e375 Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Fri, 29 May 2026 11:45:21 +0800
Subject: [PATCH 35/41] =?UTF-8?q?fix(platform):=20round-3=20=E2=80=94=20mi?=
 =?UTF-8?q?grate=20orgSlugFromId=20callers=20+=20upload-then-delete=20rein?=
 =?UTF-8?q?dex?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Continuation of the helper refactor in the prior commit: migrate the
seven high-impact call sites that round-3 review flagged for the
"`orgSlugFromId` throw cross-contaminates unrelated work" pattern,
plus three same-file hardenings that landed alongside.

threads/cascade_helpers.ts:
  Permanent slug miss (org row deleted, missing slug) used to return
  `{done:false, remaining:1}` forever; the retention sweep then burned
  its MAX_ATTEMPTS budget each cycle and gave up, accumulating orphan
  `fileMetadata` rows + `_storage` blobs indefinitely. Now uses OrNull,
  cleans local rows + storage even when slug is gone, and only skips
  the RAG-side purge (the tenant index is gone too). Also hoist the
  slug lookup inside the `filesPage.length > 0` branch — empty pages
  no longer pay for an unnecessary Better Auth round-trip.

governance/erasure.ts:
  GDPR fan-out now degrades when slug is unresolvable instead of
  aborting; DB-side cascade continues. `subjectIsMemberOfOtherActiveOrgs`
  256-cap silent fail-open is fixed: at the cap we now warn + return
  true (fail-closed) so an operator account with many memberships
  doesn't accidentally trip a global throttle / 2FA wipe.

governance/retention_cleanup.ts:
  Empty-batch fast path before the slug lookup; OrNull so a missing
  slug skips the RAG DELETE step but the local document/temp-file
  deletes still proceed.

file_metadata/actions.ts:
  Cross-org failure traffic: a single org's RAG outage marked every
  other org's in-flight uploads as `failed` after 90 s. Track which
  orgs queried successfully and only run `expireStaleRagQueue` against
  their storage ids; an unresolvable org skips the bucket entirely.

file_metadata/internal_actions.ts:
  Resolve slug OUTSIDE the try block in `extractFileMetadata` so a
  slug miss doesn't get reclassified as a "permanent" failure and
  stamp `visionRequired:false` against an otherwise-healthy upload.

file_metadata/transcribe_audio.ts:
  Resolve slug ONCE at the top of the action; reuse on both the
  cache-path and post-Whisper RAG index sites. Previously each call
  re-queried Better Auth, and a transient adapter failure on the
  second lookup bubbled into the outer catch and re-queued a fresh
  Whisper call against already-completed work.

documents/internal_actions.ts:
  Three sites converted to OrNull: `checkRagDocumentStatus` (mark
  failed once instead of looping retries forever), `deleteDocumentFromRag`
  (proceed with the local DB delete if the org is gone — RAG index
  is gone too), and `reindexDocumentInRag` (now invokes the new
  `deleteOldRagEntry` helper). Also swap the reindex from delete-first
  to upload-then-delete: a failed upload now leaves the OLD RAG entry
  intact so search keeps returning the prior revision, instead of
  marking ragInfo.failed with no entries at all.
---
 .../convex/documents/internal_actions.ts      | 158 ++++++++++++++----
 .../platform/convex/file_metadata/actions.ts  |  30 +++-
 .../convex/file_metadata/internal_actions.ts  |  18 +-
 .../convex/file_metadata/transcribe_audio.ts  |  60 ++++---
 .../platform/convex/governance/erasure.ts     | 126 ++++++++------
 .../convex/governance/retention_cleanup.ts    |  25 ++-
 .../convex/threads/cascade_helpers.ts         |  88 +++++-----
 7 files changed, 349 insertions(+), 156 deletions(-)

diff --git a/services/platform/convex/documents/internal_actions.ts b/services/platform/convex/documents/internal_actions.ts
index cfe92b2a16..4b4229e28a 100644
--- a/services/platform/convex/documents/internal_actions.ts
+++ b/services/platform/convex/documents/internal_actions.ts
@@ -7,7 +7,7 @@ import { isRecord, getBoolean, getString } from '../../lib/utils/type-guards';
 import { internal } from '../_generated/api';
 import type { Id } from '../_generated/dataModel';
 import { internalAction } from '../_generated/server';
-import { orgSlugFromId } from '../lib/helpers/org_slug';
+import { orgSlugFromIdOrNull } from '../lib/helpers/org_slug';
 import { buildDownloadUrl } from '../lib/helpers/public_storage_url';
 import { ragFetch } from '../lib/helpers/rag_config';
 import { ragAction } from '../workflow_engine/action_defs/rag/rag_action';
@@ -24,6 +24,44 @@ function parseIsoTimestampMs(iso: string | undefined): number | undefined {
   return Number.isFinite(ms) ? ms : undefined;
 }
 
+/**
+ * Best-effort RAG DELETE for a stale fileId during re-index. Logs and
+ * returns; never throws. Used by `reindexDocumentInRag` after the new
+ * upload succeeds — a failure to delete the old entry leaves orphan
+ * chunks but does not regress the user-visible reindex result.
+ */
+async function deleteOldRagEntry(
+  // oxlint-disable-next-line typescript/no-explicit-any -- ActionCtx is heavy to pull in just for runQuery shape; orgSlugFromIdOrNull accepts a structural ctx
+  ctx: any,
+  organizationId: string,
+  oldFileId: string,
+  documentId: string,
+): Promise<void> {
+  const orgSlug = await orgSlugFromIdOrNull(ctx, organizationId);
+  if (orgSlug === null) {
+    console.warn(
+      `[reindexDocumentInRag] org ${organizationId} unresolvable; skipping old RAG delete for oldFileId=${oldFileId} (documentId=${documentId})`,
+    );
+    return;
+  }
+  try {
+    const response = await ragFetch(
+      `/api/v1/documents/${encodeURIComponent(oldFileId)}`,
+      { method: 'DELETE', timeoutMs: 60_000, orgSlug },
+    );
+    if (!response.ok && response.status !== 404) {
+      console.warn(
+        `[reindexDocumentInRag] Failed to delete old RAG entry ${oldFileId}: ${response.status}`,
+      );
+    }
+  } catch (error) {
+    console.warn(
+      `[reindexDocumentInRag] Error deleting old RAG entry ${oldFileId}:`,
+      error,
+    );
+  }
+}
+
 const documentSourceTypeValidator = v.union(
   v.literal('markdown'),
   v.literal('html'),
@@ -194,8 +232,31 @@ export const checkRagDocumentStatus = internalAction({
       return null;
     }
 
+    // Resolve org slug OUTSIDE the retry-on-throw block. A missing slug
+    // (org row deleted, no slug field) is terminal — every subsequent
+    // retry would re-throw the same error and waste the scheduler
+    // budget before landing on the "max attempts" branch above. Mark
+    // the document failed in one shot instead.
+    const pollOrgSlug = await orgSlugFromIdOrNull(ctx, document.organizationId);
+    if (pollOrgSlug === null) {
+      console.warn(
+        `[checkRagDocumentStatus] org ${document.organizationId} unresolvable for document ${args.documentId}; marking ragInfo.failed (no retry)`,
+      );
+      await ctx.runMutation(
+        internal.documents.internal_mutations.updateDocumentRagInfo,
+        {
+          documentId: args.documentId,
+          ragInfo: {
+            status: 'failed',
+            error: 'Organization unresolvable (deleted or missing slug).',
+          },
+        },
+      );
+      return null;
+    }
+
     try {
-      const orgSlug = await orgSlugFromId(ctx, document.organizationId);
+      const orgSlug = pollOrgSlug;
       const response = await ragFetch('/api/v1/documents/statuses', {
         method: 'POST',
         headers: { 'Content-Type': 'application/json' },
@@ -402,9 +463,25 @@ export const deleteDocumentFromRag = internalAction({
 
     const ragKey = document.fileId;
 
+    // Resolve slug OUTSIDE the retry-on-RAG-failure path. A missing slug
+    // (org row deleted) is terminal — previously each retry re-threw
+    // here, exhausted DELETE_RETRY_DELAYS, then "Document remains in
+    // database" forever. Treat slug-missing as "RAG-side index is gone
+    // too" and proceed with the local-row delete.
+    const orgSlug = await orgSlugFromIdOrNull(ctx, document.organizationId);
+    if (orgSlug === null) {
+      console.warn(
+        `[deleteDocumentFromRag] org ${document.organizationId} unresolvable; assuming RAG index already purged and deleting local document ${args.documentId}`,
+      );
+      await ctx.runMutation(
+        internal.documents.internal_mutations.deleteDocumentById,
+        { documentId: args.documentId },
+      );
+      return null;
+    }
+
     let ragSuccess = false;
     try {
-      const orgSlug = await orgSlugFromId(ctx, document.organizationId);
       const response = await ragFetch(
         `/api/v1/documents/${encodeURIComponent(ragKey)}`,
         { method: 'DELETE', timeoutMs: 60_000, orgSlug },
@@ -545,48 +622,41 @@ export const reindexDocumentInRag = internalAction({
   },
   returns: v.null(),
   handler: async (ctx, args): Promise<null> => {
-    // Look up current document so we can also schedule the new-upload
-    // step, but DON'T let a missing document skip the old-RAG delete —
-    // that would orphan oldFileId chunks. Resolve the delete org-scope
-    // from `oldOrganizationId` (preferred — captured by the scheduler
-    // caller at update time) and fall back to the current document only
-    // when missing.
+    // upload-then-delete order: upload the new file first; only purge
+    // `oldFileId` once the new chunks are committed. Previously the
+    // delete ran before the upload, so a failed upload left the doc
+    // with NO RAG entry (old chunks gone, new chunks never arrived) and
+    // no automatic retry. Keeping the old entry around while the new
+    // one queues means search still hits the previous revision until
+    // re-index completes, and a failed upload is recoverable by
+    // re-running this action on the same `oldFileId`.
     const document = await ctx.runQuery(
       internal.documents.internal_queries.getDocumentByIdRaw,
       { documentId: args.documentId },
     );
 
-    const deleteOrgId =
-      args.oldOrganizationId ?? document?.organizationId ?? null;
-    if (deleteOrgId) {
-      try {
-        const orgSlug = await orgSlugFromId(ctx, deleteOrgId);
-        const response = await ragFetch(
-          `/api/v1/documents/${encodeURIComponent(args.oldFileId)}`,
-          { method: 'DELETE', timeoutMs: 60_000, orgSlug },
+    // No current document or no new fileId — nothing to upload. We
+    // still attempt the old-RAG delete below so chunks don't leak.
+    if (!document || !document.fileId) {
+      const deleteOrgIdNoUpload =
+        args.oldOrganizationId ?? document?.organizationId ?? null;
+      if (deleteOrgIdNoUpload) {
+        await deleteOldRagEntry(
+          ctx,
+          deleteOrgIdNoUpload,
+          args.oldFileId,
+          args.documentId,
         );
-        if (!response.ok && response.status !== 404) {
-          console.warn(
-            `[reindexDocumentInRag] Failed to delete old RAG entry ${args.oldFileId}: ${response.status}`,
-          );
-        }
-      } catch (error) {
+      } else {
         console.warn(
-          `[reindexDocumentInRag] Error deleting old RAG entry ${args.oldFileId}:`,
-          error,
+          `[reindexDocumentInRag] No org context for old RAG delete; oldFileId ${args.oldFileId} may leak chunks (documentId=${args.documentId})`,
         );
       }
-    } else {
-      console.warn(
-        `[reindexDocumentInRag] No org context for old RAG delete; oldFileId ${args.oldFileId} may leak chunks (documentId=${args.documentId})`,
-      );
-    }
-
-    if (!document || !document.fileId) {
       return null;
     }
 
-    // Upload new file to RAG
+    // Upload new file to RAG FIRST.
+    let uploadSuccess = false;
     try {
       const rawResult = await ragAction.execute(
         ctx,
@@ -604,6 +674,7 @@ export const reindexDocumentInRag = internalAction({
         : false;
 
       if (success) {
+        uploadSuccess = true;
         await ctx.runMutation(
           internal.documents.internal_mutations.updateDocumentRagInfo,
           {
@@ -645,6 +716,27 @@ export const reindexDocumentInRag = internalAction({
       );
     }
 
+    // Only purge the old RAG entry once the new upload is committed.
+    // A failed upload leaves the previous chunks in place so search
+    // keeps returning the prior revision (degraded but consistent)
+    // instead of returning nothing.
+    if (uploadSuccess) {
+      const deleteOrgId =
+        args.oldOrganizationId ?? document.organizationId ?? null;
+      if (deleteOrgId) {
+        await deleteOldRagEntry(
+          ctx,
+          deleteOrgId,
+          args.oldFileId,
+          args.documentId,
+        );
+      } else {
+        console.warn(
+          `[reindexDocumentInRag] No org context for old RAG delete; oldFileId ${args.oldFileId} may leak chunks (documentId=${args.documentId})`,
+        );
+      }
+    }
+
     return null;
   },
 });
diff --git a/services/platform/convex/file_metadata/actions.ts b/services/platform/convex/file_metadata/actions.ts
index 5116a55a0e..e804a7a3b8 100644
--- a/services/platform/convex/file_metadata/actions.ts
+++ b/services/platform/convex/file_metadata/actions.ts
@@ -6,7 +6,7 @@ import { isRecord, getBoolean, getString } from '../../lib/utils/type-guards';
 import { internal } from '../_generated/api';
 import { action } from '../_generated/server';
 import { authComponent } from '../auth';
-import { orgSlugFromId } from '../lib/helpers/org_slug';
+import { orgSlugFromIdOrNull } from '../lib/helpers/org_slug';
 import { ragFetch } from '../lib/helpers/rag_config';
 
 /**
@@ -73,8 +73,23 @@ export const checkFileRagStatuses = action({
     const STALE_QUEUE_MS = 90_000;
 
     const mergedStatuses: Record<string, unknown> = {};
+    // Track storageIds whose org bucket queried RAG SUCCESSFULLY. Only
+    // these are eligible for the post-loop `expireStaleRagQueue` sweep —
+    // without this guard, a transient RAG outage in one org's request
+    // (or that org's slug going missing) made the loop `continue`, the
+    // org's storageIds never landed in `mergedStatuses`, and the sweep
+    // permanently marked them `failed` ("RAG service did not receive
+    // the upload") even though the uploads were healthy. Cross-org
+    // failure propagation.
+    const eligibleForStaleSweep = new Set<string>();
     for (const [organizationId, storageIds] of orgIdsToFiles) {
-      const orgSlug = await orgSlugFromId(ctx, organizationId);
+      const orgSlug = await orgSlugFromIdOrNull(ctx, organizationId);
+      if (orgSlug === null) {
+        console.warn(
+          `[checkFileRagStatuses] org ${organizationId} unresolvable; skipping status fetch (its storageIds will NOT be marked failed)`,
+        );
+        continue;
+      }
       try {
         const response = await ragFetch('/api/v1/documents/statuses', {
           method: 'POST',
@@ -92,6 +107,7 @@ export const checkFileRagStatuses = action({
         const body: unknown = await response.json();
         if (!isRecord(body) || !isRecord(body.statuses)) continue;
         Object.assign(mergedStatuses, body.statuses);
+        for (const id of storageIds) eligibleForStaleSweep.add(id);
       } catch (error) {
         console.warn(
           `[checkFileRagStatuses] Failed to fetch statuses for org ${orgSlug}:`,
@@ -106,10 +122,12 @@ export const checkFileRagStatuses = action({
     for (const storageId of allAuthorizedStorageIds) {
       const docStatus = statuses[storageId];
       if (!isRecord(docStatus)) {
-        await ctx.runMutation(
-          internal.file_metadata.internal_mutations.expireStaleRagQueue,
-          { storageId, staleAfterMs: STALE_QUEUE_MS },
-        );
+        if (eligibleForStaleSweep.has(storageId)) {
+          await ctx.runMutation(
+            internal.file_metadata.internal_mutations.expireStaleRagQueue,
+            { storageId, staleAfterMs: STALE_QUEUE_MS },
+          );
+        }
         continue;
       }
 
diff --git a/services/platform/convex/file_metadata/internal_actions.ts b/services/platform/convex/file_metadata/internal_actions.ts
index 6ad89b2d12..98341927aa 100644
--- a/services/platform/convex/file_metadata/internal_actions.ts
+++ b/services/platform/convex/file_metadata/internal_actions.ts
@@ -11,7 +11,7 @@ import {
   isUpstreamHttpError,
   UpstreamHttpError,
 } from '../lib/errors/upstream_http_error';
-import { orgSlugFromId } from '../lib/helpers/org_slug';
+import { orgSlugFromIdOrNull } from '../lib/helpers/org_slug';
 import { ragAction } from '../workflow_engine/action_defs/rag/rag_action';
 
 /**
@@ -102,6 +102,21 @@ export const extractFileMetadata = internalAction({
 
     // PDF/DOCX/PPTX: call crawler extract-metadata
     if (ext && EXTRACT_METADATA_EXTENSIONS.has(ext)) {
+      // Resolve org slug OUTSIDE the try block. Previously the lookup
+      // sat inside the catch-permanent branch — a slug miss got
+      // classified as "permanent" and stamped `visionRequired:false`,
+      // permanently disabling vision/OCR for legitimate uploads on a
+      // deleted-org race. With `OrNull` we exit cleanly without
+      // stamping a terminal marker, leaving the row's pending state
+      // alone so a subsequent ingest can re-run.
+      const orgSlug = await orgSlugFromIdOrNull(ctx, args.organizationId);
+      if (orgSlug === null) {
+        console.warn(
+          `[extractFileMetadata] org ${args.organizationId} unresolvable; skipping vision-metadata extraction for storageId=${args.storageId} (will not stamp permanent-failure marker)`,
+        );
+        return null;
+      }
+
       try {
         const fileUrl = await ctx.storage.getUrl(args.storageId);
         if (!fileUrl) {
@@ -123,7 +138,6 @@ export const extractFileMetadata = internalAction({
         const fileBlob = await fileResponse.blob();
         const crawlerUrl = getCrawlerUrl();
         const endpoint = `${crawlerUrl}/api/v1/${ext}/extract-metadata`;
-        const orgSlug = await orgSlugFromId(ctx, args.organizationId);
 
         const formData = new FormData();
         formData.append('file', fileBlob, args.fileName);
diff --git a/services/platform/convex/file_metadata/transcribe_audio.ts b/services/platform/convex/file_metadata/transcribe_audio.ts
index dd65bff87d..482de4b316 100644
--- a/services/platform/convex/file_metadata/transcribe_audio.ts
+++ b/services/platform/convex/file_metadata/transcribe_audio.ts
@@ -8,7 +8,7 @@ import type { ActionCtx } from '../_generated/server';
 import { internalAction } from '../_generated/server';
 import { estimateTranscriptionCostCents } from '../governance/cost_estimation';
 import { classifyError } from '../lib/error_classification';
-import { orgSlugFromId } from '../lib/helpers/org_slug';
+import { orgSlugFromIdOrNull } from '../lib/helpers/org_slug';
 import type { ResolvedModelData } from '../providers/resolve_model';
 import { resolveTranscriptionModel } from '../providers/resolve_model';
 import { uploadFile } from '../workflow_engine/action_defs/rag/helpers/upload_file_direct';
@@ -286,6 +286,14 @@ export const transcribeAudio = internalAction({
     const requestId = `transcribe-${args.storageId}-${Date.now()}`;
     const startedAt = Date.now();
 
+    // Resolve org slug ONCE up front and reuse on both the cached-path
+    // RAG index (≈ L404) and the post-Whisper RAG index (≈ L570).
+    // Previously each call site re-queried Better Auth, and a transient
+    // adapter failure on the SECOND lookup would bubble out into the
+    // outer catch — classifying as a transcription failure and re-
+    // queueing a fresh Whisper run on already-completed work.
+    const orgSlug = await orgSlugFromIdOrNull(ctx, args.organizationId);
+
     let compressed: CompressedAudio | undefined;
     let chunked:
       | { chunks: AudioChunk[]; cleanup: () => Promise<void> }
@@ -401,16 +409,21 @@ export const transcribeAudio = internalAction({
           // transcript was cached from). Duplicates content in RAG but
           // keeps per-upload citation identity correct; embeddings cost
           // is tiny compared to the Whisper call we just skipped.
-          const cachedOrgSlug = await orgSlugFromId(ctx, args.organizationId);
-          await indexTranscriptToRag(ctx, {
-            storageId: args.storageId,
-            fileName: args.fileName,
-            audioContentType: args.contentType,
-            transcript: cached.transcript ?? '',
-            chunkCount: 0,
-            requestId,
-            orgSlug: cachedOrgSlug,
-          });
+          if (orgSlug !== null) {
+            await indexTranscriptToRag(ctx, {
+              storageId: args.storageId,
+              fileName: args.fileName,
+              audioContentType: args.contentType,
+              transcript: cached.transcript ?? '',
+              chunkCount: 0,
+              requestId,
+              orgSlug,
+            });
+          } else {
+            console.warn(
+              `[transcribeAudio] org ${args.organizationId} unresolvable; transcript saved to fileMetadata but not indexed to RAG (cache path, storageId=${args.storageId})`,
+            );
+          }
           return null;
         }
       }
@@ -567,16 +580,21 @@ export const transcribeAudio = internalAction({
         );
       }
 
-      const indexOrgSlug = await orgSlugFromId(ctx, args.organizationId);
-      await indexTranscriptToRag(ctx, {
-        storageId: args.storageId,
-        fileName: args.fileName,
-        audioContentType: args.contentType,
-        transcript: fullTranscript,
-        chunkCount: chunks.length,
-        requestId,
-        orgSlug: indexOrgSlug,
-      });
+      if (orgSlug !== null) {
+        await indexTranscriptToRag(ctx, {
+          storageId: args.storageId,
+          fileName: args.fileName,
+          audioContentType: args.contentType,
+          transcript: fullTranscript,
+          chunkCount: chunks.length,
+          requestId,
+          orgSlug,
+        });
+      } else {
+        console.warn(
+          `[transcribeAudio] org ${args.organizationId} unresolvable; transcript saved to fileMetadata but not indexed to RAG (storageId=${args.storageId})`,
+        );
+      }
 
       return null;
     } catch (error) {
diff --git a/services/platform/convex/governance/erasure.ts b/services/platform/convex/governance/erasure.ts
index 17fd01d6d1..257b92afba 100644
--- a/services/platform/convex/governance/erasure.ts
+++ b/services/platform/convex/governance/erasure.ts
@@ -62,7 +62,7 @@ import {
 import * as ApprovalsHelpers from '../approvals/helpers';
 import { createAuditLog } from '../audit_logs/helpers';
 import { authComponent } from '../auth';
-import { orgSlugFromId } from '../lib/helpers/org_slug';
+import { orgSlugFromIdOrNull } from '../lib/helpers/org_slug';
 import { hashEmailForAudit } from '../lib/helpers/pii_hash';
 import { ragFetch } from '../lib/helpers/rag_config';
 import { rateLimiter } from '../lib/rate_limiter';
@@ -127,6 +127,20 @@ async function subjectIsMemberOfOtherActiveOrgs(
     if (typeof r.role === 'string' && r.role === 'disabled') continue;
     return true;
   }
+  // 256-cap silent fail-open guard. If the membership query came back
+  // full, we can't tell whether the 257th-and-beyond row is the only
+  // non-excluded, non-disabled sibling — so fail CLOSED (treat as "yes,
+  // member of other active orgs") rather than wipe global throttle /
+  // 2FA counters under a subject who happens to belong to many orgs.
+  // The sibling 256-cap site in `http.ts:349` warns on truncation;
+  // mirror that here for parity.
+  if (rows.length >= 256) {
+    console.warn(
+      '[subjectIsMemberOfOtherActiveOrgs] hit 256-membership soft cap for userId ' +
+        `${userId}; returning fail-closed (assume member of other active orgs)`,
+    );
+    return true;
+  }
   return false;
 }
 
@@ -1800,25 +1814,36 @@ export const processErasureRequest = internalAction({
       documentsSkippedByHold = docResult.skippedByHold;
       // RAG is per-org; resolve once and reuse for all per-file DELETEs in
       // this erasure pass (subject is bound to a single organizationId).
-      const ragOrgSlug = await orgSlugFromId(ctx, state.organizationId);
-      for (const fileId of docResult.fileIds) {
-        try {
-          const res = await ragFetch(
-            `/api/v1/documents/${encodeURIComponent(fileId)}`,
-            { method: 'DELETE', timeoutMs: 10_000, orgSlug: ragOrgSlug },
-          );
-          if (res.ok || res.status === 404) {
-            ragDocumentsRemoved += 1;
-          } else {
+      // OrNull so a deleted-org subject (operator removed the org row but
+      // the erasure request was already in flight) still drives the DB-
+      // side cascade below; the RAG-side purge is the recoverable part
+      // (no per-tenant index to clean once the org is gone).
+      const ragOrgSlug = await orgSlugFromIdOrNull(ctx, state.organizationId);
+      if (ragOrgSlug === null) {
+        console.warn(
+          `[gdprErasure] org ${state.organizationId} unresolvable; skipping RAG-side fan-out for this erasure pass (DB-side cascade still runs)`,
+        );
+      }
+      if (ragOrgSlug !== null) {
+        for (const fileId of docResult.fileIds) {
+          try {
+            const res = await ragFetch(
+              `/api/v1/documents/${encodeURIComponent(fileId)}`,
+              { method: 'DELETE', timeoutMs: 10_000, orgSlug: ragOrgSlug },
+            );
+            if (res.ok || res.status === 404) {
+              ragDocumentsRemoved += 1;
+            } else {
+              console.warn(
+                `[gdprErasure] RAG DELETE returned ${res.status} for fileId=${fileId}`,
+              );
+            }
+          } catch (error) {
             console.warn(
-              `[gdprErasure] RAG DELETE returned ${res.status} for fileId=${fileId}`,
+              `[gdprErasure] RAG DELETE failed for fileId=${fileId}:`,
+              error,
             );
           }
-        } catch (error) {
-          console.warn(
-            `[gdprErasure] RAG DELETE failed for fileId=${fileId}:`,
-            error,
-          );
         }
       }
 
@@ -1861,25 +1886,27 @@ export const processErasureRequest = internalAction({
       // alongside the DB row + the `_storage` blob.
       // `perCategory.fileMetadata` already typed as `FileMetadataCounts`
       // which declares `ragPurgeStorageIds?: string[]` — no cast needed.
-      for (const storageId of perCategory.fileMetadata.ragPurgeStorageIds ??
-        []) {
-        try {
-          const res = await ragFetch(
-            `/api/v1/documents/${encodeURIComponent(storageId)}`,
-            { method: 'DELETE', timeoutMs: 10_000, orgSlug: ragOrgSlug },
-          );
-          if (res.ok || res.status === 404) {
-            ragDocumentsRemoved += 1;
-          } else {
+      if (ragOrgSlug !== null) {
+        for (const storageId of perCategory.fileMetadata.ragPurgeStorageIds ??
+          []) {
+          try {
+            const res = await ragFetch(
+              `/api/v1/documents/${encodeURIComponent(storageId)}`,
+              { method: 'DELETE', timeoutMs: 10_000, orgSlug: ragOrgSlug },
+            );
+            if (res.ok || res.status === 404) {
+              ragDocumentsRemoved += 1;
+            } else {
+              console.warn(
+                `[gdprErasure] RAG DELETE returned ${res.status} for chat-upload storageId=${storageId}`,
+              );
+            }
+          } catch (error) {
             console.warn(
-              `[gdprErasure] RAG DELETE returned ${res.status} for chat-upload storageId=${storageId}`,
+              `[gdprErasure] RAG DELETE failed for chat-upload storageId=${storageId}:`,
+              error,
             );
           }
-        } catch (error) {
-          console.warn(
-            `[gdprErasure] RAG DELETE failed for chat-upload storageId=${storageId}:`,
-            error,
-          );
         }
       }
       // videoLinkJobs are erased here, AFTER fileMetadata, so the
@@ -1896,24 +1923,27 @@ export const processErasureRequest = internalAction({
           userId: state.targetUserId,
         },
       );
-      for (const storageId of perCategory.videoLinks.ragPurgeStorageIds ?? []) {
-        try {
-          const res = await ragFetch(
-            `/api/v1/documents/${encodeURIComponent(storageId)}`,
-            { method: 'DELETE', timeoutMs: 10_000, orgSlug: ragOrgSlug },
-          );
-          if (res.ok || res.status === 404) {
-            ragDocumentsRemoved += 1;
-          } else {
+      if (ragOrgSlug !== null) {
+        for (const storageId of perCategory.videoLinks.ragPurgeStorageIds ??
+          []) {
+          try {
+            const res = await ragFetch(
+              `/api/v1/documents/${encodeURIComponent(storageId)}`,
+              { method: 'DELETE', timeoutMs: 10_000, orgSlug: ragOrgSlug },
+            );
+            if (res.ok || res.status === 404) {
+              ragDocumentsRemoved += 1;
+            } else {
+              console.warn(
+                `[gdprErasure] RAG DELETE returned ${res.status} for video-link storageId=${storageId}`,
+              );
+            }
+          } catch (error) {
             console.warn(
-              `[gdprErasure] RAG DELETE returned ${res.status} for video-link storageId=${storageId}`,
+              `[gdprErasure] RAG DELETE failed for video-link storageId=${storageId}:`,
+              error instanceof Error ? error.message : error,
             );
           }
-        } catch (error) {
-          console.warn(
-            `[gdprErasure] RAG DELETE failed for video-link storageId=${storageId}:`,
-            error instanceof Error ? error.message : error,
-          );
         }
       }
       perCategory.usageLedger = await ctx.runMutation(
diff --git a/services/platform/convex/governance/retention_cleanup.ts b/services/platform/convex/governance/retention_cleanup.ts
index c094663af8..067246de23 100644
--- a/services/platform/convex/governance/retention_cleanup.ts
+++ b/services/platform/convex/governance/retention_cleanup.ts
@@ -9,7 +9,7 @@ import { internal } from '../_generated/api';
 import type { Id } from '../_generated/dataModel';
 import type { ActionCtx } from '../_generated/server';
 import { internalAction } from '../_generated/server';
-import { orgSlugFromId } from '../lib/helpers/org_slug';
+import { orgSlugFromIdOrNull } from '../lib/helpers/org_slug';
 import { ragFetch } from '../lib/helpers/rag_config';
 import type { ActiveHolds } from './legal_hold';
 import {
@@ -199,7 +199,14 @@ async function cleanupDocuments(
           { organizationId: org.organizationId, cutoffMs, batchSize },
         );
 
-  const orgSlug = await orgSlugFromId(ctx, org.organizationId);
+  // Empty-batch fast path: skip the Better Auth slug lookup when nothing
+  // expired. Steady-state per-org per-tick is the common case.
+  if (passB.length === 0) return processed;
+
+  // OrNull so a deleted-org cleanup (operator removed the org row but
+  // legacy documents survived) still removes local rows; the RAG-side
+  // purge is the recoverable part (the tenant index is gone too).
+  const orgSlug = await orgSlugFromIdOrNull(ctx, org.organizationId);
 
   for (const doc of passB) {
     if (doc.createdBy && holds.userMembershipIds.has(doc.createdBy)) {
@@ -209,7 +216,7 @@ async function cleanupDocuments(
       continue;
     }
 
-    if (doc.fileId) {
+    if (doc.fileId && orgSlug !== null) {
       await deleteRagEntry(orgSlug, doc.fileId, `document ${doc._id}`);
     }
 
@@ -309,7 +316,9 @@ async function cleanupTempFiles(
           { organizationId: org.organizationId, source, cutoffMs, batchSize },
         );
 
-  const tempOrgSlug = await orgSlugFromId(ctx, org.organizationId);
+  if (passB.length === 0) return processed;
+
+  const tempOrgSlug = await orgSlugFromIdOrNull(ctx, org.organizationId);
 
   for (const file of passB) {
     if (file.uploadedBy && holds.userMembershipIds.has(file.uploadedBy)) {
@@ -319,7 +328,13 @@ async function cleanupTempFiles(
       continue;
     }
 
-    await deleteRagEntry(tempOrgSlug, file.storageId, `temp file ${file._id}`);
+    if (tempOrgSlug !== null) {
+      await deleteRagEntry(
+        tempOrgSlug,
+        file.storageId,
+        `temp file ${file._id}`,
+      );
+    }
 
     await ctx.runMutation(
       internal.governance.internal_mutations_retention.deleteExpiredTempFile,
diff --git a/services/platform/convex/threads/cascade_helpers.ts b/services/platform/convex/threads/cascade_helpers.ts
index 672fafbc8e..7c6cf9526b 100644
--- a/services/platform/convex/threads/cascade_helpers.ts
+++ b/services/platform/convex/threads/cascade_helpers.ts
@@ -33,7 +33,7 @@ import type { MutationCtx } from '../_generated/server';
 import { createAuditLog } from '../audit_logs/helpers';
 import type { ActiveHolds } from '../governance/legal_hold';
 import { loadActiveHolds } from '../governance/legal_hold';
-import { orgSlugFromId } from '../lib/helpers/org_slug';
+import { orgSlugFromIdOrNull } from '../lib/helpers/org_slug';
 import { parseSubThreadIds } from './delete_chat_thread';
 
 // Audit actions emitted by this file. Keep grep-able:
@@ -317,49 +317,55 @@ export async function cascadeDeleteThreadChildren(
         q.eq('organizationId', organizationId).eq('threadId', threadId),
       )
       .take(PAGE_SIZE);
-    // Resolve slug BEFORE the delete loop. Previously the lookup ran
-    // after every storage.delete + db.delete had committed; if it threw
-    // (org row deleted mid-cascade, replica skew), the DB tx rolls back
-    // so fileMetadata rows reappear, but `ctx.storage.delete` is out-
-    // of-band and NOT rolled back — the blob is gone AND no RAG purge
-    // was scheduled. Resolve first so a slug-lookup failure aborts the
-    // loop before any destructive op runs.
-    let orgSlug: string;
-    try {
-      orgSlug = await orgSlugFromId(ctx, organizationId);
-    } catch (error) {
-      console.warn(
-        `[cascadeDeleteThreadChildren] orgSlugFromId failed for ${organizationId}; deferring file cascade:`,
-        error instanceof Error ? error.message : error,
-      );
-      // Signal "not done" so the caller retries. The fileMetadata page
-      // is still present so a re-run will find it.
-      return { done: false, remaining: 1 };
-    }
-    const ragPurgeStorageIds: string[] = [];
-    for (const fileMeta of filesPage) {
-      try {
-        await ctx.storage.delete(fileMeta.storageId);
-      } catch (error) {
+    // Empty-page fast path: no files to cascade for this thread, skip
+    // the (relatively expensive) Better Auth slug lookup. Common case
+    // for chat-upload-less threads.
+    if (filesPage.length > 0) {
+      // Resolve slug BEFORE the delete loop. Previously the lookup ran
+      // after every storage.delete + db.delete had committed; if it
+      // threw, the DB tx rolls back so fileMetadata rows reappear but
+      // `ctx.storage.delete` is out-of-band and NOT rolled back.
+      // Resolving first means a slug-lookup failure aborts the loop
+      // before any destructive op runs.
+      //
+      // We use `orgSlugFromIdOrNull` so a TERMINAL miss (org row gone,
+      // missing slug — both unrecoverable) drops only the RAG-side
+      // purge; the local fileMetadata rows + `_storage` blobs are
+      // still cleaned and the cascade reports `done`. Previously a
+      // throw here returned `{done:false, remaining:1}` forever, the
+      // retention sweep's MAX_ATTEMPTS budget would exhaust, and the
+      // orphan rows + blobs accumulated indefinitely.
+      const orgSlug = await orgSlugFromIdOrNull(ctx, organizationId);
+      if (orgSlug === null) {
         console.warn(
-          `[cascadeDeleteThreadChildren] storage.delete failed for ${String(fileMeta.storageId)}:`,
-          error instanceof Error ? error.message : error,
+          `[cascadeDeleteThreadChildren] org ${organizationId} unresolvable (deleted/missing slug); cleaning local fileMetadata + storage but skipping RAG purge`,
         );
       }
-      ragPurgeStorageIds.push(String(fileMeta.storageId));
-      filesPageStorageIds.push(String(fileMeta.storageId));
-      await ctx.db.delete(fileMeta._id);
-    }
-    if (ragPurgeStorageIds.length > 0) {
-      await ctx.scheduler.runAfter(
-        0,
-        internal.workflow_engine.action_defs.rag.helpers.delete_document
-          .deleteFromRagBatch,
-        { orgSlug, fileIds: ragPurgeStorageIds },
-      );
-    }
-    if (filesPage.length === PAGE_SIZE) {
-      return { done: false, remaining: 1 };
+      const ragPurgeStorageIds: string[] = [];
+      for (const fileMeta of filesPage) {
+        try {
+          await ctx.storage.delete(fileMeta.storageId);
+        } catch (error) {
+          console.warn(
+            `[cascadeDeleteThreadChildren] storage.delete failed for ${String(fileMeta.storageId)}:`,
+            error instanceof Error ? error.message : error,
+          );
+        }
+        ragPurgeStorageIds.push(String(fileMeta.storageId));
+        filesPageStorageIds.push(String(fileMeta.storageId));
+        await ctx.db.delete(fileMeta._id);
+      }
+      if (orgSlug !== null && ragPurgeStorageIds.length > 0) {
+        await ctx.scheduler.runAfter(
+          0,
+          internal.workflow_engine.action_defs.rag.helpers.delete_document
+            .deleteFromRagBatch,
+          { orgSlug, fileIds: ragPurgeStorageIds },
+        );
+      }
+      if (filesPage.length === PAGE_SIZE) {
+        return { done: false, remaining: 1 };
+      }
     }
   }
 

From 283ff91084f79d78adf9324ce992d03310161787 Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Fri, 29 May 2026 11:45:32 +0800
Subject: [PATCH 36/41] =?UTF-8?q?fix(platform):=20round-3=20=E2=80=94=20we?=
 =?UTF-8?q?bsites=20REST/action=20alignment=20+=20sync=20debounce=20stamp?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`syncSingleWebsite` was the missing writer of `metadata.lastStatusSyncAt`:
`fetchPages` debounced on this field but the per-website sync action
never stamped it, so the debounce gate stayed open forever and every
subsequent `fetchPages` re-scheduled a new sync. Stamp the timestamp on
all three patch branches (success / not-found / error).

REST `/api/v1/websites/...` and the Convex `actions.*` surface used to
diverge in two important ways:

- DELETE only removed the `websites` row and left the crawler with a
  dangling registration; the Convex action correctly called
  `deregisterDomainFromCrawler` first. Add a new
  `deregisterAndDelete` internal action and route both surfaces through
  it so the crawler binding always goes away together with the row.

- `createWebsite` REST awaited `runAction(registerAndSync, ...)` while
  the Convex action used `scheduler.runAfter(0, ...)`. Likewise the
  `POST /:id/sync` REST sub-action awaited the full crawler round-trip
  before returning `{status: 'syncing'}`. Switch REST to `runAfter(0)`
  so the response is fire-and-forget in both places, the `'syncing'`
  status is honest, and caller latency stops being tied to the crawler.
---
 .../convex/websites/internal_actions.ts       | 47 +++++++++++++++++++
 services/platform/convex/websites/rest_api.ts | 46 +++++++++++++-----
 2 files changed, 82 insertions(+), 11 deletions(-)

diff --git a/services/platform/convex/websites/internal_actions.ts b/services/platform/convex/websites/internal_actions.ts
index 1c77aec63f..be221c9ccb 100644
--- a/services/platform/convex/websites/internal_actions.ts
+++ b/services/platform/convex/websites/internal_actions.ts
@@ -341,6 +341,41 @@ export const registerAndSync = internalAction({
   },
 });
 
+/**
+ * Internal-action equivalent of `actions.deleteWebsite`'s body:
+ * deregister the crawler binding first, then delete the row. The REST
+ * `DELETE /api/v1/websites/:id` path delegates to this so REST and the
+ * Convex action have the same shape — without it, REST deleted the
+ * `websites` row but left the crawler with a dangling registration that
+ * would keep scanning and produce "website not found in crawler" errors
+ * if the same domain was re-added later.
+ *
+ * Caller is responsible for verifying caller membership / row
+ * ownership BEFORE invoking (REST: `withRestAuth` + the existing
+ * `organizationId !== rc.org.organizationId` check at the call site).
+ */
+export const deregisterAndDelete = internalAction({
+  args: {
+    websiteId: v.id('websites'),
+    organizationId: v.string(),
+  },
+  handler: async (ctx, args): Promise<void> => {
+    const website = await ctx.runQuery(
+      internal.websites.internal_queries.getWebsite,
+      { websiteId: args.websiteId },
+    );
+    if (!website) return;
+    const orgSlug = await orgSlugFromId(ctx, args.organizationId);
+    // Deregister first so a crawler-side failure surfaces to the
+    // caller (matches `actions.deleteWebsite` semantics) and the row
+    // is left in place for retry rather than orphaning the registration.
+    await deregisterDomainFromCrawler(orgSlug, website.domain);
+    await ctx.runMutation(internal.websites.internal_mutations.deleteWebsite, {
+      websiteId: args.websiteId,
+    });
+  },
+});
+
 export const syncSingleWebsite = internalAction({
   args: {
     websiteId: v.id('websites'),
@@ -355,6 +390,15 @@ export const syncSingleWebsite = internalAction({
     );
     if (!website) return;
 
+    // Every patch below MUST write `lastStatusSyncAt: Date.now()`.
+    // `fetchPages` debounces re-fan-out using exactly this field — if
+    // the success / error / missing branches forget to stamp it, the
+    // debounce gate stays permanently open and every subsequent
+    // `fetchPages` call schedules a fresh sync, defeating the rate-
+    // limit and reintroducing the concurrent-write race the gate was
+    // added to prevent.
+    const syncTimestamp = Date.now();
+
     try {
       const info = await fetchWebsiteInfo(orgSlug, args.domain);
 
@@ -374,6 +418,7 @@ export const syncSingleWebsite = internalAction({
             metadata: {
               ...website.metadata,
               lastSyncError: undefined,
+              lastStatusSyncAt: syncTimestamp,
             },
           },
         );
@@ -387,6 +432,7 @@ export const syncSingleWebsite = internalAction({
               ...website.metadata,
               lastSyncError:
                 'Website not found in crawler. Please delete and re-add it.',
+              lastStatusSyncAt: syncTimestamp,
             },
           },
         );
@@ -402,6 +448,7 @@ export const syncSingleWebsite = internalAction({
         metadata: {
           ...website.metadata,
           lastSyncError: message,
+          lastStatusSyncAt: syncTimestamp,
         },
       });
     }
diff --git a/services/platform/convex/websites/rest_api.ts b/services/platform/convex/websites/rest_api.ts
index 13450ad7eb..4d48439fc8 100644
--- a/services/platform/convex/websites/rest_api.ts
+++ b/services/platform/convex/websites/rest_api.ts
@@ -71,13 +71,22 @@ export const createWebsite = withRestAuth('rest:api', async (rc, request) => {
     },
   );
 
-  // Register with crawler and schedule follow-up sync
-  await rc.ctx.runAction(internal.websites.internal_actions.registerAndSync, {
-    websiteId,
-    domain,
-    scanInterval: body.scanInterval,
-    organizationId: rc.org.organizationId,
-  });
+  // Register with crawler and schedule follow-up sync.
+  // Fire-and-forget via the scheduler — matches `actions.createWebsite`
+  // (the Convex-action surface), so REST + Convex both return as soon
+  // as the row exists rather than blocking the HTTP response on a
+  // crawler round-trip. `registerAndSync` patches the row to
+  // `status: 'error'` on its own failure path.
+  await rc.ctx.scheduler.runAfter(
+    0,
+    internal.websites.internal_actions.registerAndSync,
+    {
+      websiteId,
+      domain,
+      scanInterval: body.scanInterval,
+      organizationId: rc.org.organizationId,
+    },
+  );
 
   return jsonCreated({ id: websiteId });
 });
@@ -195,9 +204,17 @@ export const deleteWebsite = withRestAuth('rest:api', async (rc, request) => {
     return jsonError('Website not found', 404);
   }
 
-  await rc.ctx.runMutation(internal.websites.internal_mutations.deleteWebsite, {
-    websiteId: toId<'websites'>(id),
-  });
+  // Route through `deregisterAndDelete` so the crawler binding is
+  // removed before the row goes — REST + the Convex `actions.deleteWebsite`
+  // surface now share the same shape. Previously REST deleted the
+  // row directly, leaving the crawler with a dangling registration.
+  await rc.ctx.runAction(
+    internal.websites.internal_actions.deregisterAndDelete,
+    {
+      websiteId: toId<'websites'>(id),
+      organizationId: rc.org.organizationId,
+    },
+  );
 
   return jsonNoContent();
 });
@@ -276,7 +293,14 @@ export const websitePostActions = withRestAuth(
       // callers got an org-wide side effect when they thought they were
       // re-syncing one row. Use the per-website action so the contract
       // matches the URL (round-3 P2 R9-P2-a).
-      await rc.ctx.runAction(
+      //
+      // Fire-and-forget via `scheduler.runAfter(0, ...)` so the HTTP
+      // response actually means "syncing started" (matches the
+      // returned status). Previously `runAction` blocked the response
+      // until the crawler round-trip finished, making the `'syncing'`
+      // body misleading and tying caller latency to the crawler.
+      await rc.ctx.scheduler.runAfter(
+        0,
         internal.websites.internal_actions.syncSingleWebsite,
         {
           websiteId: website._id,

From 257fd756550b9c51a7a6029c07abb823910d7647 Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Fri, 29 May 2026 11:45:55 +0800
Subject: [PATCH 37/41] =?UTF-8?q?fix(platform):=20round-3=20=E2=80=94=20ag?=
 =?UTF-8?q?ent=20file=5Factions=20logs,=20SSE=20rate-limit,=20scatter=20ME?=
 =?UTF-8?q?DIUM=20cleanup?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A bundle of round-3 hardening items that touch one or two methods each.

agents/file_actions.ts:
  - rename unlink is now ENOENT-aware; non-ENOENT errors log instead of
    silently leaving the old file on disk next to the new one (which
    `listAgents` would then surface twice).
  - `deleteAgent.preDelete` bare-catch now warn-logs the underlying
    error so the audit-without-previousState case is explainable.

agents/file_utils.ts:
  `resolveHistoryDir` now runs the agent name through `validateAgentName`
  and uses nested `safeJoinWithinDir` calls. The standalone history
  callers (listHistory / readHistoryEntry / restoreFromHistory) reach
  this helper BEFORE any path validator runs, so a crafted name
  containing `..` would otherwise traverse out of `agents/.history/`.

documents/generate_docx.ts:
  The `{success:false, error}` JSON branch now scrubs `result.error`
  through `sanitizeError` before throwing — matches the HTTP-error
  branch above (which already did) so an upstream body that echoes
  e.g. `Authorization: Bearer …` is redacted on both paths.

lib/file_io.ts:
  `safeJoinWithinDir` now explicitly rejects an empty `name`. Empty
  used to resolve to `dir` itself, which would let an unvalidated
  empty string from user input land on the config root.

agent_tools/web/helpers/search_pages.ts:
  Add a 15 s `AbortController` to `fetchSearch`. A hung crawler
  connection used to block the agent step indefinitely (no signal
  passed). Aligns with `query_web_context.ts`'s 10 s and
  `fetch_and_extract.ts`'s 300 s.

http.ts + lib/rate_limiter/index.ts:
  Add a `security:sse-auth` rate-limit bucket and gate `/api/sse/auth`
  on it BEFORE the Better Auth session lookup. Mirrors the
  `/api/tts-audio` pattern. Anonymous floods can no longer force a
  session-table read per request.

lib/utils/sanitize_secrets.ts:
  Add pipe-delimited self-hosted Convex admin-key patterns (`<INSTANCE>
  |<base64>`), the `--admin-key VALUE` argv form, and a bare-payload
  pattern. Mirrors the CLI `ADMIN_KEY_RE` in `reseed-all-orgs.ts` so
  the shared `UpstreamHttpError.bodySnippet` scrubber and the CLI log
  redactor stay in lockstep.

workflow_engine/action_defs/crawler/crawler_action.ts:
  `result.success===false` branches now throw `UpstreamHttpError` with
  `retryable:false` instead of a plain `Error`. The workflow retry
  layer can finally distinguish transport-level failures (already
  typed) from body-level "crawler said no".

workflow_engine/action_defs/document/helpers/apply_docx_structured.ts:
  Correct the `UpstreamHttpError` endpoint label from
  `/api/v1/apply-structured` to `/api/v1/docx/apply-structured` to
  match the actual request URL.
---
 .../agent_tools/web/helpers/search_pages.ts   | 39 +++++++++++++------
 .../platform/convex/agents/file_actions.ts    | 25 +++++++++++-
 services/platform/convex/agents/file_utils.ts | 14 ++++++-
 .../convex/documents/generate_docx.ts         | 13 ++++++-
 services/platform/convex/http.ts              | 22 +++++++++++
 services/platform/convex/lib/file_io.ts       |  9 +++++
 .../platform/convex/lib/rate_limiter/index.ts | 12 ++++++
 .../convex/lib/utils/sanitize_secrets.ts      | 14 +++++++
 .../action_defs/crawler/crawler_action.ts     | 23 ++++++++++-
 .../document/helpers/apply_docx_structured.ts |  2 +-
 10 files changed, 154 insertions(+), 19 deletions(-)

diff --git a/services/platform/convex/agent_tools/web/helpers/search_pages.ts b/services/platform/convex/agent_tools/web/helpers/search_pages.ts
index 88ef56b214..6ebbcb3bfa 100644
--- a/services/platform/convex/agent_tools/web/helpers/search_pages.ts
+++ b/services/platform/convex/agent_tools/web/helpers/search_pages.ts
@@ -42,6 +42,12 @@ export function isValidDomain(domain: string): boolean {
   return DOMAIN_PATTERN.test(domain);
 }
 
+// 15 s aligns with `query_web_context.ts` (10 s) at the short end and
+// stays well below the agent-runtime tool budget. A hung crawler
+// connection used to block here indefinitely (no signal/timeout) and
+// tie the entire agent step to the crawler's stall window.
+const SEARCH_TIMEOUT_MS = 15_000;
+
 async function fetchSearch(
   crawlerUrl: string,
   orgSlug: string,
@@ -52,18 +58,27 @@ async function fetchSearch(
     ? `${crawlerUrl}/api/v1/search/${encodeURIComponent(domain)}`
     : `${crawlerUrl}/api/v1/search`;
 
-  const response = await fetch(endpoint, {
-    method: 'POST',
-    headers: {
-      'Content-Type': 'application/json',
-      'x-tale-org': orgSlug,
-    },
-    body: JSON.stringify({
-      query,
-      limit: DEFAULT_LIMIT,
-      similarity_threshold: DEFAULT_SIMILARITY_THRESHOLD,
-    }),
-  });
+  const controller = new AbortController();
+  const timeoutId = setTimeout(() => controller.abort(), SEARCH_TIMEOUT_MS);
+
+  let response: Response;
+  try {
+    response = await fetch(endpoint, {
+      method: 'POST',
+      headers: {
+        'Content-Type': 'application/json',
+        'x-tale-org': orgSlug,
+      },
+      body: JSON.stringify({
+        query,
+        limit: DEFAULT_LIMIT,
+        similarity_threshold: DEFAULT_SIMILARITY_THRESHOLD,
+      }),
+      signal: controller.signal,
+    });
+  } finally {
+    clearTimeout(timeoutId);
+  }
 
   if (!response.ok) {
     const errorText = await response.text().catch(() => '');
diff --git a/services/platform/convex/agents/file_actions.ts b/services/platform/convex/agents/file_actions.ts
index b891697cf0..49968141a9 100644
--- a/services/platform/convex/agents/file_actions.ts
+++ b/services/platform/convex/agents/file_actions.ts
@@ -29,6 +29,7 @@ import {
 } from '../lib/auth/require_org_membership';
 import {
   atomicWrite,
+  errnoCode,
   generateHistoryTimestamp,
   handleDirReadError,
   pruneHistory,
@@ -416,7 +417,19 @@ export const saveAgent = action({
         });
       }
       const oldFilePath = resolveAgentFilePath(orgSlug, args.oldAgentName);
-      await unlink(oldFilePath).catch(() => {});
+      // ENOENT-tolerant only — silently swallowing EACCES/EBUSY/EIO
+      // would leave the OLD file on disk while the NEW file is being
+      // written next to it, so `listAgents` would surface the same
+      // agent twice and the audit log would record a rename that
+      // didn't fully complete.
+      await unlink(oldFilePath).catch((err: unknown) => {
+        if (errnoCode(err) !== 'ENOENT') {
+          console.warn(
+            `[saveAgent] unlink old agent file ${oldFilePath} failed:`,
+            err,
+          );
+        }
+      });
     }
 
     await atomicWrite(filePath, content);
@@ -615,7 +628,15 @@ export const deleteAgent = action({
     let preDelete: AgentReadResult | undefined;
     try {
       preDelete = await readAgentFile(orgSlug, args.agentName);
-    } catch {
+    } catch (err) {
+      // Best-effort snapshot per the comment block above. Log the
+      // underlying error so the audit-row-without-previousState case
+      // is explainable in post-mortem (vs the prior silent swallow
+      // which gave no signal about what went wrong).
+      console.warn(
+        `[deleteAgent] preDelete capture failed for ${args.agentName}:`,
+        err,
+      );
       preDelete = undefined;
     }
 
diff --git a/services/platform/convex/agents/file_utils.ts b/services/platform/convex/agents/file_utils.ts
index 08269e5801..477617342c 100644
--- a/services/platform/convex/agents/file_utils.ts
+++ b/services/platform/convex/agents/file_utils.ts
@@ -149,7 +149,19 @@ export function resolveAgentFilePath(
 }
 
 export function resolveHistoryDir(orgSlug: string, agentName: string): string {
-  return path.join(resolveAgentsDir(orgSlug), '.history', agentName);
+  // Defence-in-depth: `listHistory`, `readHistoryEntry`, and
+  // `restoreFromHistory` invoke this BEFORE any
+  // `resolveAgentFilePath`-style validation runs, so a crafted
+  // `agentName` containing `..` would otherwise traverse out of
+  // `agents/.history/`. Mirror the agent-name + safeJoin guard the
+  // other path builders already do.
+  if (!validateAgentName(agentName)) {
+    throw new Error(`Invalid agent name: ${agentName}`);
+  }
+  return safeJoinWithinDir(
+    safeJoinWithinDir(resolveAgentsDir(orgSlug), '.history'),
+    agentName,
+  );
 }
 
 export { MAX_FILE_SIZE_BYTES, MAX_HISTORY_ENTRIES };
diff --git a/services/platform/convex/documents/generate_docx.ts b/services/platform/convex/documents/generate_docx.ts
index 2f127e4928..937f06f05c 100644
--- a/services/platform/convex/documents/generate_docx.ts
+++ b/services/platform/convex/documents/generate_docx.ts
@@ -98,7 +98,18 @@ export async function generateDocx(
   const result = await response.json();
 
   if (!result.success || !result.file_base64) {
-    throw new Error(result.error || 'Failed to generate DOCX');
+    // Sanitise the upstream `result.error` before it lands in the
+    // thrown message. The HTTP-error path above already runs through
+    // `sanitizeError` via `UpstreamHttpError.fromResponse`; this
+    // body-level branch is the second escape hatch and must scrub
+    // too — otherwise a crawler that 200s with `{"success":false,
+    // "error":"Authorization: Bearer ..."}` would leak the secret
+    // straight into the agent boundary.
+    const rawErr =
+      typeof result.error === 'string'
+        ? result.error
+        : 'Failed to generate DOCX';
+    throw new Error(sanitizeError(rawErr));
   }
 
   // Decode base64 and upload to Convex storage
diff --git a/services/platform/convex/http.ts b/services/platform/convex/http.ts
index 32175d2c5a..6045bde9a7 100644
--- a/services/platform/convex/http.ts
+++ b/services/platform/convex/http.ts
@@ -317,6 +317,28 @@ http.route({
   path: '/api/sse/auth',
   method: 'GET',
   handler: httpAction(async (ctx, req) => {
+    // Mirror the `/api/tts-audio` and `/storage` routes: rate-limit
+    // BEFORE the session lookup so an anonymous flood can't force a
+    // Better Auth DB session-query per request. The browser EventSource
+    // hitting this endpoint passes the auth cookie, so the limit applies
+    // to anonymous probes only — authenticated SSE handshakes stay
+    // unthrottled in practice.
+    const trusted = await loadTrustedProxies(ctx);
+    const ip = getClientIp(req.headers, trusted);
+    try {
+      await checkIpRateLimit(ctx, 'security:sse-auth', ip);
+    } catch (error) {
+      if (error instanceof RateLimitExceededError) {
+        return new Response('Rate limit exceeded', {
+          status: 429,
+          headers: {
+            'Retry-After': String(Math.ceil(error.retryAfter / 1000)),
+          },
+        });
+      }
+      throw error;
+    }
+
     const auth = createAuth(ctx);
     const session = await auth.api.getSession({ headers: req.headers });
     if (!session?.user) {
diff --git a/services/platform/convex/lib/file_io.ts b/services/platform/convex/lib/file_io.ts
index 0a55cb8cd9..48eccc689c 100644
--- a/services/platform/convex/lib/file_io.ts
+++ b/services/platform/convex/lib/file_io.ts
@@ -132,6 +132,15 @@ export function getConfigRoot(area?: string): string {
  * defense-in-depth backstop, not the primary validator.
  */
 export function safeJoinWithinDir(dir: string, name: string): string {
+  // Empty name resolves to `dir` itself — every callable site of this
+  // helper expects to land on a CHILD of `dir`, so an empty name is a
+  // bug at the call site (likely an unvalidated empty string from user
+  // input). Reject it explicitly rather than silently returning the
+  // parent directory's path, which would let a caller `unlink` /
+  // `rm -rf` the whole config root.
+  if (name === '') {
+    throw new Error('Path traversal detected: empty name');
+  }
   const resolved = path.resolve(dir, name);
   const expectedPrefix = path.resolve(dir);
   if (
diff --git a/services/platform/convex/lib/rate_limiter/index.ts b/services/platform/convex/lib/rate_limiter/index.ts
index 665b41de7c..c5751ebebd 100644
--- a/services/platform/convex/lib/rate_limiter/index.ts
+++ b/services/platform/convex/lib/rate_limiter/index.ts
@@ -226,6 +226,18 @@ export const rateLimiter = new RateLimiter(components.rateLimiter, {
     rate: 200,
     period: MINUTE,
   },
+  // Per-IP throttle on the SSE-auth handshake route. Same shape as
+  // `security:tts-audio-fetch` — anonymous flooding here forces a
+  // Better Auth session-table read per request, so cost protection
+  // matters more than data protection (the route 401s on no-session).
+  // Token bucket so a freshly-logged-in user reconnecting across
+  // multiple browser tabs doesn't hit a 429 cliff.
+  'security:sse-auth': {
+    kind: 'token bucket',
+    rate: 60,
+    period: MINUTE,
+    capacity: 120,
+  },
   'security:login-ip': {
     kind: 'fixed window',
     rate: 30,
diff --git a/services/platform/convex/lib/utils/sanitize_secrets.ts b/services/platform/convex/lib/utils/sanitize_secrets.ts
index 09e207d8fc..8c08ce13b7 100644
--- a/services/platform/convex/lib/utils/sanitize_secrets.ts
+++ b/services/platform/convex/lib/utils/sanitize_secrets.ts
@@ -45,6 +45,20 @@ const SECRET_PATTERNS: ReadonlyArray<RegExp> = [
   /\bgithub_pat_[A-Za-z0-9_]{40,}/g,
   // Convex deploy keys, e.g. `convex_dev_…` / `convex_prod_…`.
   /\bconvex_[a-z]+_[A-Za-z0-9_-]{20,}/g,
+  // Self-hosted Convex admin keys: `<INSTANCE>|<base64>` shape — e.g.
+  // `tale_platform|01abc…`. Mirrors the CLI redactor's `ADMIN_KEY_RE`
+  // (tools/cli/src/lib/actions/reseed-all-orgs.ts) so the convex
+  // boundary's `UpstreamHttpError.bodySnippet` scrubber and the CLI
+  // log redactor stay in lockstep.
+  /\b[Aa]dmin[\s\-_][Kk]ey\s*[:=]?\s*[A-Za-z0-9+/=._\-|]{12,}/g,
+  /--admin-key([\s=]+)\S+/g,
+  // Bare pipe-delimited self-hosted admin-key payloads when the
+  // "Admin Key" / "--admin-key" label is missing (e.g. an upstream
+  // body dump that just echoes the value). Conservative match: at
+  // least 4 chars before the pipe, 20+ after, restricted to the
+  // base64-+ instance-name charset, so unrelated JSON / URLs don't
+  // false-positive.
+  /\b[A-Za-z0-9_-]{4,}\|[A-Za-z0-9+/=._\-|]{20,}/g,
   // JWTs: three dot-separated base64url segments. Length floor guards
   // against accidentally matching version strings like `1.2.3`.
   /\beyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\b/g,
diff --git a/services/platform/convex/workflow_engine/action_defs/crawler/crawler_action.ts b/services/platform/convex/workflow_engine/action_defs/crawler/crawler_action.ts
index 63cb99375e..6481970b06 100644
--- a/services/platform/convex/workflow_engine/action_defs/crawler/crawler_action.ts
+++ b/services/platform/convex/workflow_engine/action_defs/crawler/crawler_action.ts
@@ -154,10 +154,22 @@ async function discoverUrls(
   const result: DiscoverUrlsRawData = await response.json();
 
   if (!result.success) {
+    // Wrap as `UpstreamHttpError` (non-retryable) so the workflow retry
+    // layer can distinguish "crawler said no" from transport failures
+    // (which already throw `UpstreamHttpError`). Treating both as
+    // generic `Error` lost the structured retry signal — a transient
+    // crawler-internal error indistinguishable from a permanent one.
     const errorMessage =
       // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- dynamic data
       (result as { error?: string }).error || 'Unknown error';
-    throw new Error(`URL discovery failed: ${errorMessage}`);
+    throw new UpstreamHttpError({
+      service: 'crawler',
+      endpoint: '/api/v1/urls/discover',
+      status: 200,
+      bodySnippet: `URL discovery failed: ${errorMessage}`,
+      retryable: false,
+      safeMessage: 'Crawler URL discovery failed.',
+    });
   }
 
   debugLog(
@@ -220,7 +232,14 @@ async function fetchUrls(
     const errorMessage =
       // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- dynamic data
       (result as { error?: string }).error || 'Unknown error';
-    throw new Error(`URL fetch failed: ${errorMessage}`);
+    throw new UpstreamHttpError({
+      service: 'crawler',
+      endpoint: '/api/v1/urls/fetch',
+      status: 200,
+      bodySnippet: `URL fetch failed: ${errorMessage}`,
+      retryable: false,
+      safeMessage: 'Crawler URL fetch failed.',
+    });
   }
 
   debugLog(
diff --git a/services/platform/convex/workflow_engine/action_defs/document/helpers/apply_docx_structured.ts b/services/platform/convex/workflow_engine/action_defs/document/helpers/apply_docx_structured.ts
index 2a07cbc62b..40a8025ba3 100644
--- a/services/platform/convex/workflow_engine/action_defs/document/helpers/apply_docx_structured.ts
+++ b/services/platform/convex/workflow_engine/action_defs/document/helpers/apply_docx_structured.ts
@@ -135,7 +135,7 @@ export async function applyDocxStructured(
       'crawler',
       response,
       errorText,
-      '/api/v1/apply-structured',
+      '/api/v1/docx/apply-structured',
     );
   }
 

From 2b5b0e9027468fe2ae19fa4b6272f0cec1209adc Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Fri, 29 May 2026 11:46:14 +0800
Subject: [PATCH 38/41] =?UTF-8?q?feat(cli):=20round-3=20=E2=80=94=20auto-d?=
 =?UTF-8?q?etect=20+=20interactive=20legacy-layout=20migration?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Recall `feedback_migration_ux.md`: the surface UX for one-shot
migrations should be (1) the user just runs their normal command,
(2) the command auto-detects the legacy state and asks for confirm
inline, (3) `--yes` skips the prompt for CI, and (4) no per-migration
`tale migrate <name>` subcommand. Previously `tale start`, `tale
deploy`, and `tale update` hard-failed on the pre-org-first flat
layout and pointed the operator at `tale migrate config-layout`
followed by another `tale deploy --override-all -y` — three commands
where one prompt should suffice.

New `legacy-layout-preflight.ts` wraps detect → confirm → migrate as
a single entry point and is called from start / deploy / update. TTY
+ no `--yes` prompts; non-TTY + no `--yes` throws a clear actionable
error; `--yes` migrates without prompting. `update` runs the preflight
BEFORE writing new `default/<domain>/` files so a legacy project no
longer ends up half-migrated and dead-locked on the next `tale start`.

`tale start` gains a `--yes / -y` flag (parallels `tale deploy`).

`tale migrate` is deprecated: the no-flag form prints a clear redirect
to `tale start --yes` and exits non-zero; `--cleanup-old` stays
available as the optional post-migration housekeeping step that
byte-for-byte verifies new paths and removes the rollback-insurance
copies. The forward migration is now automatic.

Same commit folds in two CLI safety items that share the
`reseed-all-orgs.ts` / `deploy.ts` surface:

reseed-all-orgs.ts:
  - Move the `if (dryRun)` gate ABOVE the destructive `confirm()` and
    `findPlatformContainer()` lookup. `--dry-run` was both prompting
    the operator and hard-throwing on hosts without a running platform
    container, defeating its preview-only point.
  - Broaden `ADMIN_KEY_RE` to also catch the hyphenated argv form
    (`--admin-key <value>`) that the bash heredoc itself contains.
    A future Convex CLI line echoing its argv would otherwise slip
    the secret past the redactor.

deploy.ts:
  chown failure after a host push is now a hard error instead of a
  warning. Previously `Overrode N orgs` printed green over a root-owned
  volume that the app user couldn't write to, sending operators into a
  debugging maze when later writes silently failed inside the container.
---
 tools/cli/src/commands/migrate.ts             |  49 +++++--
 tools/cli/src/commands/start/index.ts         |  71 +++++-----
 tools/cli/src/lib/actions/deploy.ts           |  45 +++----
 .../lib/actions/legacy-layout-preflight.ts    | 123 ++++++++++++++++++
 tools/cli/src/lib/actions/reseed-all-orgs.ts  |  45 +++++--
 tools/cli/src/lib/actions/start.ts            |  35 +++--
 tools/cli/src/lib/actions/update.ts           |  20 +++
 7 files changed, 285 insertions(+), 103 deletions(-)
 create mode 100644 tools/cli/src/lib/actions/legacy-layout-preflight.ts

diff --git a/tools/cli/src/commands/migrate.ts b/tools/cli/src/commands/migrate.ts
index 56d498a617..5d4af5ab18 100644
--- a/tools/cli/src/commands/migrate.ts
+++ b/tools/cli/src/commands/migrate.ts
@@ -5,32 +5,63 @@ import { requireProject } from '../lib/project/find-project';
 import { resolveProjectContext } from '../lib/project/project-context';
 import * as logger from '../utils/logger';
 
+/**
+ * `tale migrate` is deprecated as a user-facing surface — `tale start`,
+ * `tale deploy`, and `tale update` now auto-detect the legacy flat
+ * layout and prompt the operator inline (see
+ * `lib/actions/legacy-layout-preflight.ts`).
+ *
+ * What's preserved:
+ *   - `tale migrate config-layout --cleanup-old` stays available
+ *     (hidden from `--help`) as the optional post-migration
+ *     housekeeping step that byte-for-byte verifies the new paths
+ *     and removes the rollback-insurance copies under the old
+ *     per-domain dirs in the convex container's $DATA volume. The
+ *     forward migration runs automatically, but the cleanup is a
+ *     deliberate operator-pull action.
+ *   - `tale migrate config-layout` (no-flag) prints a deprecation
+ *     notice pointing at `tale start --yes` and exits non-zero, so
+ *     scripts and CI pipelines that still call it surface clearly
+ *     during the deprecation window.
+ */
 export function createMigrateCommand(): Command {
-  const migrateCmd = new Command('migrate').description(
-    'One-shot, manually-run config migrations',
-  );
+  const migrateCmd = new Command('migrate')
+    .description(
+      '[deprecated] Forward migration runs automatically on tale start / deploy / update. ' +
+        '`config-layout --cleanup-old` remains for post-migration housekeeping.',
+    )
+    .helpOption(false);
 
   migrateCmd
     .command('config-layout')
     .description(
-      'Relocate providers/*.secrets.json from the legacy per-domain layout ' +
-        'to the org-first layout. Idempotent; copies (not moves) so old paths ' +
-        'remain readable until --cleanup-old runs.',
+      'Forward migration: deprecated (`tale start --yes` now runs it). ' +
+        '--cleanup-old: byte-for-byte verify new paths, then remove rollback-insurance copies.',
     )
     .option('--dry-run', 'Preview moves without changing files', false)
     .option(
       '--cleanup-old',
-      'After verifying new == old (byte-for-byte), remove the old-path ' +
-        'secrets. Run only after the new deployment is healthy.',
+      'After verifying new == old (byte-for-byte), remove the old-path secrets. ' +
+        'Run only after the new deployment is healthy.',
       false,
     )
     .action(async (opts: { dryRun?: boolean; cleanupOld?: boolean }) => {
       try {
+        if (!opts.cleanupOld) {
+          logger.error(
+            '`tale migrate config-layout` (without --cleanup-old) is deprecated. ' +
+              'The forward migration now runs automatically when `tale start`, ' +
+              '`tale deploy --override-all`, or `tale update` detects a legacy ' +
+              'layout. Re-run one of those commands; add `--yes` for non-interactive ' +
+              'environments.',
+          );
+          process.exit(2);
+        }
         const projectDir = requireProject();
         await resolveProjectContext(projectDir);
         await migrateConfigLayout({
           dryRun: opts.dryRun ?? false,
-          cleanupOld: opts.cleanupOld ?? false,
+          cleanupOld: true,
           projectDir,
         });
       } catch (err) {
diff --git a/tools/cli/src/commands/start/index.ts b/tools/cli/src/commands/start/index.ts
index 6b4bc54efa..70e4ddbffa 100644
--- a/tools/cli/src/commands/start/index.ts
+++ b/tools/cli/src/commands/start/index.ts
@@ -4,44 +4,35 @@ import { start } from '../../lib/actions/start';
 import * as logger from '../../utils/logger';
 
 export function createStartCommand(): Command {
-  return (
-    new Command('start')
-      .description('Start Tale platform locally with project files')
-      .option('-d, --detach', 'run in background')
-      .option('-p, --port <port>', 'HTTPS port to expose', '443')
-      .option('--host <hostname>', 'host alias for proxy', 'tale.local')
-      // Hidden back-compat: `tale start -y` used to skip migration prompts.
-      // The auto-migration framework is gone but operator CI scripts may
-      // still pass `-y`. Accept and ignore for one release, then remove.
-      .addOption(
-        new Option(
-          '-y, --yes',
-          '[deprecated] no longer needed (auto-migrations removed); ignored',
-        ).hideHelp(),
-      )
-      .action(
-        async (opts: {
-          detach?: boolean;
-          port: string;
-          host: string;
-          yes?: boolean;
-        }) => {
-          try {
-            if (opts.yes) {
-              logger.warn(
-                '--yes/-y is deprecated on `tale start` and ignored; safe to remove from scripts.',
-              );
-            }
-            await start({
-              detach: opts.detach,
-              port: Number(opts.port),
-              host: opts.host,
-            });
-          } catch (err) {
-            logger.error(err instanceof Error ? err.message : String(err));
-            process.exit(1);
-          }
-        },
-      )
-  );
+  return new Command('start')
+    .description('Start Tale platform locally with project files')
+    .option('-d, --detach', 'run in background')
+    .option('-p, --port <port>', 'HTTPS port to expose', '443')
+    .option('--host <hostname>', 'host alias for proxy', 'tale.local')
+    .addOption(
+      new Option(
+        '-y, --yes',
+        'non-interactive: auto-accept the legacy config-layout migration when detected',
+      ),
+    )
+    .action(
+      async (opts: {
+        detach?: boolean;
+        port: string;
+        host: string;
+        yes?: boolean;
+      }) => {
+        try {
+          await start({
+            detach: opts.detach,
+            port: Number(opts.port),
+            host: opts.host,
+            assumeYes: opts.yes,
+          });
+        } catch (err) {
+          logger.error(err instanceof Error ? err.message : String(err));
+          process.exit(1);
+        }
+      },
+    );
 }
diff --git a/tools/cli/src/lib/actions/deploy.ts b/tools/cli/src/lib/actions/deploy.ts
index 332609c6ee..30316ad2a7 100644
--- a/tools/cli/src/lib/actions/deploy.ts
+++ b/tools/cli/src/lib/actions/deploy.ts
@@ -34,6 +34,7 @@ import { getNextColor } from '../state/get-next-color';
 import { setCurrentColor } from '../state/set-current-color';
 import { setPreviousVersion } from '../state/set-previous-version';
 import { withLock } from '../state/with-lock';
+import { legacyLayoutPreflight } from './legacy-layout-preflight';
 import { reseedAllOrgsFromBuiltin } from './reseed-all-orgs';
 
 async function ensureInfrastructure(
@@ -145,24 +146,19 @@ export async function deploy(options: DeployOptions): Promise<void> {
       const prefix = dryRun ? '[DRY-RUN] ' : '';
       logger.header(`${prefix}Deploying Tale ${version}`);
 
-      // Auto-migration framework removed — `tale migrate config-layout` is
-      // the only opt-in, manually-run migration now. Fail fast on the
-      // pre-refactor flat layout — but ONLY when the operator is actually
-      // pushing host config (`--override` or `--override-all`). Plain
-      // `tale deploy` (container rotation, image pull only) has no host-
-      // push hazard, so trapping operators with legacy artifacts on a
-      // no-op deploy was over-broad. The host-push code path at
-      // syncProjectFiles enforces the same check where it matters
-      // (round-2 P1-32).
+      // Detect-and-migrate on legacy flat layout. Only gates host
+      // pushes (`--override` / `--override-all`) — a plain container-
+      // rotation deploy has no host-config dependency. The preflight
+      // prompts (default-No) and runs `migrateConfigLayout` in place
+      // on accept; CI / `--yes` migrates without prompting. Replaces
+      // the prior hard-fail-with-runbook flow so legacy projects can
+      // be upgraded in one command.
       if (options.override || options.overrideAll) {
-        const { legacyDirs } = await findOrgDirs(env.DEPLOY_DIR);
-        if (legacyDirs.length > 0) {
-          throw new Error(
-            `Legacy flat layout detected at project root (${legacyDirs.join(', ')}/). ` +
-              `Run 'tale migrate config-layout' then 'tale deploy --override-all -y' ` +
-              `(see docs/self-hosted/operate/upgrades.md).`,
-          );
-        }
+        await legacyLayoutPreflight({
+          projectDir: env.DEPLOY_DIR,
+          assumeYes: options.assumeYes ?? false,
+          context: 'deploy',
+        });
       }
 
       // Check if this is a first-time deployment
@@ -801,11 +797,16 @@ async function syncProjectFiles(
       `/app/data/`,
     ]);
     if (!chownResult.success) {
-      // Ownership fix failure isn't necessarily a push failure (files
-      // landed, just wrong owner), but warn loudly — the app user won't
-      // be able to write to its own data tree.
-      logger.warn(
-        `Failed to fix ownership on /app/data: ${chownResult.stderr}`,
+      // Hard fail: files landed but the app user can't write them.
+      // Printing `Overrode N orgs!` while the volume is root-owned
+      // sent operators into a debugging maze when later writes failed
+      // silently inside the container. The push is recoverable — they
+      // can re-run --override after fixing the cause — but a quiet
+      // wrong-perms state is not.
+      throw new Error(
+        `Failed to fix ownership on /app/data after push: ${chownResult.stderr?.trim() ?? '(no stderr)'}. ` +
+          `The push completed but files are root-owned and the app user can't write to them. ` +
+          `Re-run --override after addressing the cause.`,
       );
     }
 
diff --git a/tools/cli/src/lib/actions/legacy-layout-preflight.ts b/tools/cli/src/lib/actions/legacy-layout-preflight.ts
new file mode 100644
index 0000000000..dcd488bb24
--- /dev/null
+++ b/tools/cli/src/lib/actions/legacy-layout-preflight.ts
@@ -0,0 +1,123 @@
+/**
+ * Legacy flat-layout preflight: detect-and-migrate gate for `tale start`,
+ * `tale deploy`, and `tale update`.
+ *
+ * Pre-Oct-2025 projects keep per-domain dirs (`agents/`, `workflows/`, …)
+ * at the project root. The org-first layout expects them under
+ * `default/<dir>/`. `migrateConfigLayout` already implements the move
+ * idempotently with a host-side phase + a container-side phase.
+ *
+ * Per `feedback_migration_ux.md`, the surface UX should be:
+ *
+ *   1. The user just runs `tale start` / `tale deploy` / `tale update`.
+ *   2. If a legacy layout is detected, prompt with a default-No confirm
+ *      that summarises what will move.
+ *   3. On accept, run the migration in-line; on decline, abort.
+ *   4. On non-TTY (CI, scripts), the user must opt in via `--yes` —
+ *      otherwise we abort with a clear message rather than silently
+ *      migrating.
+ *
+ * No `tale migrate config-layout` user-facing subcommand. The library
+ * function `migrateConfigLayout` stays so this preflight (and tests)
+ * can call it.
+ */
+
+import { existsSync } from 'node:fs';
+import { join } from 'node:path';
+
+import { confirm } from '../../utils/confirm';
+import * as logger from '../../utils/logger';
+import { LEGACY_DOMAIN_DIR_NAMES } from './deploy';
+import { migrateConfigLayout } from './migrate-config-layout';
+
+export interface LegacyLayoutPreflightOptions {
+  /** Absolute path of the project root to scan. */
+  projectDir: string;
+  /** Skip the prompt and migrate immediately (non-interactive flag). */
+  assumeYes: boolean;
+  /**
+   * The command that triggered this preflight. Only shapes the prompt
+   * copy — the migration steps are identical across commands.
+   */
+  context: 'start' | 'deploy' | 'update';
+}
+
+export interface LegacyLayoutPreflightResult {
+  /** True iff a migration was actually performed in this call. */
+  migrated: boolean;
+}
+
+function isInteractive(): boolean {
+  return Boolean(process.stdin.isTTY);
+}
+
+function detectLegacyDirs(projectDir: string): string[] {
+  return [...LEGACY_DOMAIN_DIR_NAMES].filter((d) =>
+    existsSync(join(projectDir, d)),
+  );
+}
+
+/**
+ * Run the detect → confirm → migrate flow. Throws when:
+ *   - legacy layout exists but stdin is not a TTY and `--yes` was not
+ *     supplied (operator must opt in explicitly in CI / scripts);
+ *   - the user declines the interactive prompt;
+ *   - the migration itself fails (e.g. host conflicts).
+ *
+ * Returns `{ migrated: true }` on a successful migration so the caller
+ * can re-evaluate downstream state (`tale update` re-reads checksums
+ * after the move, for example). Returns `{ migrated: false }` when no
+ * legacy dirs were present.
+ */
+export async function legacyLayoutPreflight(
+  options: LegacyLayoutPreflightOptions,
+): Promise<LegacyLayoutPreflightResult> {
+  const { projectDir, assumeYes, context } = options;
+  const legacyDirs = detectLegacyDirs(projectDir);
+  if (legacyDirs.length === 0) {
+    return { migrated: false };
+  }
+
+  const dirsList = legacyDirs.map((d) => `${d}/`).join(', ');
+
+  // Non-TTY + no --yes: fail loud rather than silently migrate. A CI
+  // pipeline that hits this case should add the flag deliberately.
+  if (!assumeYes && !isInteractive()) {
+    throw new Error(
+      `Legacy flat layout detected at project root: ${dirsList}\n` +
+        `  The org-first layout expects these under "default/<domain>/".\n` +
+        `  Re-run \`tale ${context} --yes\` to migrate in place,\n` +
+        '  or move the dirs into `default/` manually. See ' +
+        'docs/en/self-hosted/operate/upgrades.md for the runbook.',
+    );
+  }
+
+  // Interactive + no --yes: ask. Default-No so a single Enter keystroke
+  // doesn't trigger a destructive-shape operation.
+  if (!assumeYes) {
+    logger.blank();
+    logger.warn(`Legacy flat layout detected at project root: ${dirsList}`);
+    logger.info('  These dirs will move into "default/<dir>/" in place.');
+    logger.info(
+      '  The migration is rollback-insured: container-side providers/secrets ' +
+        'are copied (not moved) and cleaned up only after you run ' +
+        '`tale start` against the new layout. Host dirs are renamed atomically.',
+    );
+    const ok = await confirm('Migrate now?');
+    if (!ok) {
+      throw new Error(
+        `Aborted: legacy layout still present. Re-run \`tale ${context}\` after migrating ` +
+          'manually, or pass `--yes` to migrate non-interactively.',
+      );
+    }
+  }
+
+  logger.blank();
+  logger.step('Running config-layout migration...');
+  await migrateConfigLayout({
+    dryRun: false,
+    cleanupOld: false,
+    projectDir,
+  });
+  return { migrated: true };
+}
diff --git a/tools/cli/src/lib/actions/reseed-all-orgs.ts b/tools/cli/src/lib/actions/reseed-all-orgs.ts
index 48b89eb1fb..88612175f0 100644
--- a/tools/cli/src/lib/actions/reseed-all-orgs.ts
+++ b/tools/cli/src/lib/actions/reseed-all-orgs.ts
@@ -93,10 +93,23 @@ HOME=/home/app timeout ${RESEED_TIMEOUT_S} bunx convex run \\
  * up to the first `|`, leaving the secret payload after it in the
  * logged stream (round-3 P1-adjacent secret leak).
  */
-const ADMIN_KEY_RE = /\b([Aa]dmin\s+[Kk]ey)\s*:?\s*[A-Za-z0-9+/=._\-|]{12,}/g;
+const ADMIN_KEY_RE =
+  /\b([Aa]dmin[\s\-_][Kk]ey)\s*[:=]?\s*[A-Za-z0-9+/=._\-|]{12,}/g;
+
+/**
+ * Catch the hyphenated argv form (`--admin-key <value>`) used by
+ * `bunx convex run --admin-key …`. The `[Aa]dmin\s+[Kk]ey` shape
+ * above requires whitespace between "Admin" and "Key" and so misses
+ * `--admin-key`. Without this second pattern, a future Convex CLI
+ * diagnostic line echoing its argv would slip the secret past the
+ * redactor and into the logger.
+ */
+const ADMIN_KEY_ARG_RE = /--admin-key([\s=]+)\S+/g;
 
 export function redactAdminKey(text: string): string {
-  return text.replace(ADMIN_KEY_RE, '$1: <redacted>');
+  return text
+    .replace(ADMIN_KEY_RE, '$1: <redacted>')
+    .replace(ADMIN_KEY_ARG_RE, '--admin-key$1<redacted>');
 }
 
 const CONFIRM_MESSAGE =
@@ -163,6 +176,23 @@ export async function reseedAllOrgsFromBuiltin(
 ): Promise<void> {
   const { dryRun, assumeYes } = options;
 
+  // Dry-run gate sits BEFORE the destructive confirm prompt + the
+  // platform-container lookup. Otherwise `tale deploy --override-all
+  // --dry-run` would (a) still ask the operator to confirm a
+  // destructive-shape operation that won't run, and (b) hard-throw
+  // on hosts where no platform container is up yet — defeating the
+  // point of a dry-run preview.
+  if (dryRun) {
+    logger.blank();
+    logger.info(
+      '[DRY-RUN] Would run reseed script against the platform container:',
+    );
+    for (const line of RESEED_SCRIPT.split('\n')) {
+      logger.info(`  ${line}`);
+    }
+    return;
+  }
+
   // Gate non-interactive callers behind --yes to avoid silent abort in CI.
   const isTty = Boolean(process.stdin.isTTY);
   if (!assumeYes && !isTty) {
@@ -180,17 +210,6 @@ export async function reseedAllOrgsFromBuiltin(
 
   const container = await findPlatformContainer();
 
-  if (dryRun) {
-    logger.blank();
-    logger.info('[DRY-RUN] Would run:');
-    logger.info(`  docker exec -i ${container} bash -s <<'EOF'`);
-    for (const line of RESEED_SCRIPT.split('\n')) {
-      logger.info(`  ${line}`);
-    }
-    logger.info(`  EOF`);
-    return;
-  }
-
   logger.blank();
   logger.step('Reseeding builtin catalog into all registered orgs...');
 
diff --git a/tools/cli/src/lib/actions/start.ts b/tools/cli/src/lib/actions/start.ts
index fa81189a31..dc93ef35d3 100644
--- a/tools/cli/src/lib/actions/start.ts
+++ b/tools/cli/src/lib/actions/start.ts
@@ -1,4 +1,3 @@
-import { existsSync } from 'node:fs';
 import { join } from 'node:path';
 
 import pkg from '../../../package.json';
@@ -16,8 +15,8 @@ import { exec } from '../docker/exec';
 import { findProject } from '../project/find-project';
 import { resolveOrAssignProjectContext } from '../project/project-context';
 import { withLock } from '../state/with-lock';
-import { LEGACY_DOMAIN_DIR_NAMES } from './deploy';
 import { init } from './init';
+import { legacyLayoutPreflight } from './legacy-layout-preflight';
 
 async function assertDockerAvailable(): Promise<void> {
   try {
@@ -121,6 +120,12 @@ interface StartOptions {
   detach?: boolean;
   port?: number;
   host?: string;
+  /**
+   * Non-interactive: auto-accept the legacy-layout migration prompt
+   * when a pre-org-first project root is detected. Parallels the
+   * `--yes` flag on `tale deploy`.
+   */
+  assumeYes?: boolean;
 }
 
 export async function start(options: StartOptions): Promise<void> {
@@ -151,23 +156,15 @@ export async function start(options: StartOptions): Promise<void> {
   // Detect legacy flat-layout dirs at the project root (`agents/`,
   // `workflows/`, …, `retention/`). Under the org-first layout these
   // belong under `default/<domain>/` — the platform's resolvers won't
-  // read anything at the old paths. Same constant + same hard-fail as
-  // `tale deploy`: both commands either accept or refuse the layout
-  // identically. (Earlier this file warn-and-proceeded, which let a
-  // project pass `tale start` but fail `tale deploy`.)
-  const legacyDirsFound = [...LEGACY_DOMAIN_DIR_NAMES].filter((d) =>
-    existsSync(join(projectDir, d)),
-  );
-  if (legacyDirsFound.length > 0) {
-    throw new Error(
-      `Legacy flat layout detected at project root: ${legacyDirsFound
-        .map((d) => `${d}/`)
-        .join(', ')}\n` +
-        '  The org-first layout expects these under `default/<domain>/` (or another org subtree).\n' +
-        '  Migrate with: `tale migrate config-layout` then `tale deploy --override-all -y`.\n' +
-        '  See docs/en/self-hosted/operate/upgrades.md for the full runbook.',
-    );
-  }
+  // read anything at the old paths. The preflight prompts the operator
+  // (default-No) and runs `migrateConfigLayout` in place on accept; CI
+  // runs must pass `--yes`. Replaces the prior hard-fail-with-runbook
+  // shape so an upgrade flows in one command.
+  await legacyLayoutPreflight({
+    projectDir,
+    assumeYes: options.assumeYes ?? false,
+    context: 'start',
+  });
 
   await assertDockerAvailable();
 
diff --git a/tools/cli/src/lib/actions/update.ts b/tools/cli/src/lib/actions/update.ts
index a6c879c287..e95a49a294 100644
--- a/tools/cli/src/lib/actions/update.ts
+++ b/tools/cli/src/lib/actions/update.ts
@@ -21,11 +21,17 @@ import { readProject } from '../project/read-project';
 import type { Checksums } from '../project/types';
 import { writeProject } from '../project/write-project';
 import { generateAllRules } from '../rules/generators';
+import { legacyLayoutPreflight } from './legacy-layout-preflight';
 
 interface UpdateOptions {
   force?: boolean;
   dryRun?: boolean;
   skipHeader?: boolean;
+  /**
+   * Non-interactive: auto-accept the legacy-layout migration prompt
+   * when a pre-org-first project root is detected.
+   */
+  assumeYes?: boolean;
 }
 
 interface UpdateSummary {
@@ -51,6 +57,20 @@ export async function update(options: UpdateOptions): Promise<void> {
   logger.info(`Current version: ${project.cliVersion}`);
   logger.info(`Target version:  ${pkg.version}`);
 
+  // If the project is on the pre-org-first layout, migrate now (before
+  // we write any new `default/<domain>/...` files). Without this gate
+  // `tale update` happily lays the new tree down next to the legacy
+  // dirs, and the subsequent `tale start` then refuses to boot — a
+  // user-visible deadlock. The preflight prompts in interactive runs
+  // and requires `--yes` in non-TTY contexts.
+  if (!options.dryRun) {
+    await legacyLayoutPreflight({
+      projectDir,
+      assumeYes: options.assumeYes ?? false,
+      context: 'update',
+    });
+  }
+
   // Legacy projects (pre-ID) get an ID auto-assigned here. We also attempt
   // volume migration immediately, but the migration function itself defers
   // (via a marker file) if any legacy containers are running, so production

From 217b1d5aa1a2a183d32d37dc498ab88bcc9bf3f5 Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Fri, 29 May 2026 11:46:37 +0800
Subject: [PATCH 39/41] =?UTF-8?q?fix(convex):=20round-3=20=E2=80=94=20surf?=
 =?UTF-8?q?ace=20migration=20cp=20errors,=20snapshot=20before=20force-seed?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`scripts/2026-03-28-migrate-convex-data.sh:108` — drop the trailing
`|| true` on the `cp -rn` step. The empty-glob edge case is already
covered by the per-dir empty check above, so `|| true` was only
swallowing real I/O errors (disk-full, EACCES). The previous shape
exited 0 reporting "N new items copied" while the migration was
silently incomplete; now a real `cp` failure aborts via `set -e`.

`services/convex/docker-entrypoint.sh::atomic_cp_bundle` —
`FORCE_SEED=true` unconditionally `rm -rf`s the destination bundle
before rename, which loses any operator-added files inside an
integration or skill bundle (custom_state.json, scripts the operator
dropped in, …). Take a timestamped snapshot to
`<dest>.history/<ts>/` before the rm so the pre-force tree is
recoverable. Snapshot failure is best-effort and doesn't block the seed.

`services/convex/docker-entrypoint.sh` workflow-seed loop — convert the
`find | while` pipeline (which ran the loop body in a subshell where
`log_error` could not bump an aggregate counter) to process
substitution. Track `workflows_failed` in the parent shell and
`log_warn` an aggregate count at the end. Disk-full mid-seed now
surfaces visibly instead of returning a clean exit code with a
half-seeded `default/workflows/`.
---
 scripts/2026-03-28-migrate-convex-data.sh | 14 ++++++----
 services/convex/docker-entrypoint.sh      | 34 +++++++++++++++++++++--
 2 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/scripts/2026-03-28-migrate-convex-data.sh b/scripts/2026-03-28-migrate-convex-data.sh
index fecb882c30..62801c9e44 100755
--- a/scripts/2026-03-28-migrate-convex-data.sh
+++ b/scripts/2026-03-28-migrate-convex-data.sh
@@ -100,12 +100,14 @@ if [ "$old_exists" = true ]; then
 
         before=$(ls "$dst" 2>/dev/null | wc -l)
         # `cp -rn` is no-clobber, so re-runs are no-ops on already-
-        # copied trees. Earlier this swallowed stderr unconditionally,
-        # which hid disk-full / permission-denied as "0 new items".
-        # `|| true` is kept only to tolerate the "no files to copy"
-        # edge case (matched glob with no entries) without aborting
-        # `set -e`; real I/O errors now surface on stderr.
-        cp -rn "$src/"* "$dst/" || true
+        # copied trees. The earlier `|| true` swallowed real I/O
+        # failures (disk-full, EACCES, EIO) — the script would echo
+        # "0 new items copied" and exit 0 while the migration was
+        # silently incomplete. The empty-src guard above already
+        # handles the "no files to copy" edge case, so `|| true` is
+        # not needed for set -e correctness. Drop it and let real
+        # cp failures abort.
+        cp -rn "$src/"* "$dst/"
         after=$(ls "$dst" | wc -l)
         added=$((after - before))
 
diff --git a/services/convex/docker-entrypoint.sh b/services/convex/docker-entrypoint.sh
index aba859b2fb..52ed184cb9 100755
--- a/services/convex/docker-entrypoint.sh
+++ b/services/convex/docker-entrypoint.sh
@@ -352,11 +352,28 @@ atomic_cp() {
 # dest. cp -r alone leaves a half-populated dest on interruption, and
 # the next-run `[ -d "$dest" ]` check then treats the partial bundle
 # as "already seeded" and skips it permanently. Round-3 P2 R32-P2-c.
+#
+# `FORCE_SEED=true` is destructive for bundle dirs (skills, integrations,
+# branding sub-bundles): the existing dest is removed wholesale before
+# the new one lands. If the operator added custom files inside a bundle
+# (e.g. an integration's `custom_state.json`), force-seed would wipe
+# them. Take a timestamped snapshot under `<dest>.history/<ts>/` first
+# so the operator can recover the pre-force tree. Best-effort — a
+# snapshot failure must not block the seed.
 atomic_cp_bundle() {
   local src_dir="$1" dest_dir="$2"
   local stage="${dest_dir}.tale-seed.$$"
   rm -rf "$stage"
   cp -r "$src_dir" "$stage"
+  if [ -d "$dest_dir" ]; then
+    local snapshot_dir="${dest_dir}.history/$(date -u +%Y%m%dT%H%M%SZ)"
+    if mkdir -p "$snapshot_dir" 2>/dev/null && cp -r "$dest_dir/." "$snapshot_dir/" 2>/dev/null; then
+      echo "   ↻ Snapshotted previous bundle to ${snapshot_dir}"
+    else
+      echo "   ⚠ Could not snapshot previous bundle at ${dest_dir} (proceeding with force-seed anyway)"
+      rm -rf "$snapshot_dir" 2>/dev/null || true
+    fi
+  fi
   rm -rf "$dest_dir"
   mv "$stage" "$dest_dir"
 }
@@ -392,7 +409,15 @@ run_seed() {
   local workflows_builtin="/app/builtin/default/workflows"
   mkdir -p "$workflows_dir"
   if [ -d "$workflows_builtin" ] && [ "$(ls -A "$workflows_builtin" 2>/dev/null)" ]; then
-    find "$workflows_builtin" -name '*.json' -type f | while read -r src; do
+    # Aggregate per-file failure count. Previously the seed loop ran
+    # inside a `find | while` subshell — `log_error` printed to stderr
+    # but no aggregate counter survived past the subshell, so boot
+    # would silently complete with a partially-seeded workflows/ dir
+    # (disk-full mid-seed = boot succeeds, broken builtin workflows).
+    # Use process substitution so the counter lives in the parent
+    # shell.
+    local workflows_failed=0
+    while IFS= read -r src; do
       local rel_path="${src#$workflows_builtin/}"
       local dest="$workflows_dir/$rel_path"
       local dest_dir="$(dirname "$dest")"
@@ -410,6 +435,7 @@ run_seed() {
           echo "   ✓ Seeded workflow $rel_path (forced)"
         else
           log_error "   ✗ Failed to seed workflow $rel_path (forced)"
+          workflows_failed=$((workflows_failed + 1))
         fi
         continue
       fi
@@ -421,8 +447,12 @@ run_seed() {
         echo "   ✓ Seeded workflow $rel_path"
       else
         log_error "   ✗ Failed to seed workflow $rel_path"
+        workflows_failed=$((workflows_failed + 1))
       fi
-    done
+    done < <(find "$workflows_builtin" -name '*.json' -type f)
+    if [ "$workflows_failed" -gt 0 ]; then
+      log_warn "workflow seed: $workflows_failed file(s) failed to seed; check stderr for details"
+    fi
   fi
 
   # --- Integrations (directory bundles) ---

From 76c612ea84037d412400f2e0f6cb98790fca40ca Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Fri, 29 May 2026 11:46:52 +0800
Subject: [PATCH 40/41] =?UTF-8?q?fix(rag,crawler):=20round-3=20=E2=80=94?=
 =?UTF-8?q?=20shutdown=20reset,=20fullmatch,=20vision=20drain=20+=20marker?=
 =?UTF-8?q?=20drop?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

rag_service.initialize:
  Reset `_shutting_down` and clear the module-level shutdown event at
  the top of `initialize()`. Re-init after a prior `shutdown()` (tests,
  supervisor restart with the same singleton) was leaving the
  "shutting down" state set, so every subsequent `_ensure_org_clients`
  call permanently raised `RuntimeError("RagService is shutting down")`
  despite the pool being back. Also delete the dead `embedding_service`
  property (zero external readers confirmed by grep).

crawler/org_context.py:
  Switch the X-Tale-Org regex check from `re.match` to `re.fullmatch`.
  Python's `$` accepts a trailing `\n`, so `match()` silently accepted
  CRLF-smuggled slugs like `"acme\n"`. The RAG-side `auth.py` already
  uses `fullmatch`; mirror that semantic here.

crawler/services/vision/openai_client.py:
  - LLM-failure path no longer prepends a `[LLM_EXTRACTION_FAILED:...]`
    marker + raw chunk text to the returned page content. The marker
    string was flowing through embeddings → BM25 index → search hits
    as user-visible content, and the raw fallback text was poisoning
    relevance since the LLM step's structural extraction was missing.
    Drop the chunk entirely on failure; the error log is the
    operator-visible signal.
  - Track outstanding `_safe_close_client` tasks in
    `_PENDING_CLOSE_TASKS` and expose `drain_pending_close_tasks()` so
    lifespan shutdown can cancel + await them. Previously evicted /
    rotated clients sat in a 300 s sleep that the event loop closed
    underneath at shutdown, leaking the httpx connection pool and
    producing "Event loop is closed" tracebacks.

crawler/main.py:
  Lifespan teardown calls `drain_pending_close_tasks()` (bounded by 10s
  timeout) right after the per-org cache drain so the new pending-task
  set is flushed before the event loop exits.
---
 services/crawler/app/main.py                  | 11 +++
 services/crawler/app/org_context.py           |  6 +-
 .../app/services/vision/openai_client.py      | 83 +++++++++++++++----
 services/rag/app/services/rag_service.py      | 17 ++--
 4 files changed, 94 insertions(+), 23 deletions(-)

diff --git a/services/crawler/app/main.py b/services/crawler/app/main.py
index 4c86154395..cb6772d210 100644
--- a/services/crawler/app/main.py
+++ b/services/crawler/app/main.py
@@ -188,6 +188,7 @@ async def _drain_org_caches() -> None:
         from app.services.vision.openai_client import (  # type: ignore[attr-defined]
             _chat_states,
             _vision_states,
+            drain_pending_close_tasks,
         )
 
         async def _safe(close_aw):
@@ -214,6 +215,16 @@ async def _safe(close_aw):
                 )
             except TimeoutError:
                 logger.warning("Per-org client drain did not finish within 10s; continuing")
+        # Cancel + await any outstanding `_safe_close_client` tasks
+        # left over from earlier LRU evictions / config rotations.
+        # Without this, those tasks sit in the loop sleeping out a 300s
+        # grace window and the event loop closes underneath them,
+        # leaking the httpx pool and producing "Event loop is closed"
+        # tracebacks (round-3 P2 R26-P2-c continuation).
+        try:
+            await asyncio.wait_for(drain_pending_close_tasks(), timeout=10)
+        except TimeoutError:
+            logger.warning("Pending close-task drain did not finish within 10s; continuing")
 
     try:
         await _drain_org_caches()
diff --git a/services/crawler/app/org_context.py b/services/crawler/app/org_context.py
index a707c876d4..9261be9c22 100644
--- a/services/crawler/app/org_context.py
+++ b/services/crawler/app/org_context.py
@@ -56,7 +56,11 @@ async def require_org_slug(
             status_code=status.HTTP_400_BAD_REQUEST,
             detail="missing X-Tale-Org header",
         )
-    if not ORG_SLUG_RE.match(x_tale_org):
+    # `fullmatch` (not `match`) so e.g. `"acme\n"` is rejected: Python's
+    # `$` anchor matches before a trailing newline, so `match()` would
+    # silently accept a CRLF-smuggled slug. RAG-side `auth.py` already
+    # uses `fullmatch`; mirror that semantic here.
+    if not ORG_SLUG_RE.fullmatch(x_tale_org):
         raise HTTPException(
             status_code=status.HTTP_400_BAD_REQUEST,
             detail="invalid X-Tale-Org header",
diff --git a/services/crawler/app/services/vision/openai_client.py b/services/crawler/app/services/vision/openai_client.py
index 1a7ac2f075..55776efc1e 100644
--- a/services/crawler/app/services/vision/openai_client.py
+++ b/services/crawler/app/services/vision/openai_client.py
@@ -108,6 +108,46 @@ def __init__(
 _vision_states: OrderedDict[str, _OrgVisionState] = OrderedDict()
 _chat_states: OrderedDict[str, _OrgVisionState] = OrderedDict()
 
+# Track outstanding `_safe_close_client` tasks so lifespan shutdown can
+# drain them before the event loop closes. Without this set, an
+# evicted-or-rotated client sleeps for up to 300 s in a fire-and-forget
+# task; when the FastAPI lifespan exits, the loop closes underneath the
+# sleeping task, the close never fires, and httpx connection pools
+# leak (round-2 P1-26). `app/main.py` lifespan awaits this set on
+# shutdown via `drain_pending_close_tasks()`.
+_PENDING_CLOSE_TASKS: set[asyncio.Task] = set()
+
+
+def _schedule_safe_close(client: AsyncOpenAI) -> None:
+    """Fire-and-forget close with task-set bookkeeping."""
+    try:
+        loop = asyncio.get_running_loop()
+    except RuntimeError:
+        # No running loop (shutdown in progress, or called from outside
+        # an event loop). The caller treats this as best-effort; just
+        # close synchronously via a fresh loop would be unsafe, so log
+        # the leak instead. The aging pool is closed when the process
+        # exits anyway.
+        logger.warning(
+            "Could not schedule client close — no running event loop; pool will leak until process exit",
+        )
+        return
+    task = loop.create_task(_safe_close_client(client))
+    _PENDING_CLOSE_TASKS.add(task)
+    task.add_done_callback(_PENDING_CLOSE_TASKS.discard)
+
+
+async def drain_pending_close_tasks() -> None:
+    """Await every still-pending `_safe_close_client` so shutdown can
+    flush the 300s grace window without the event loop closing under
+    sleeping tasks. Called from `app/main.py` lifespan teardown."""
+    if not _PENDING_CLOSE_TASKS:
+        return
+    pending = list(_PENDING_CLOSE_TASKS)
+    for t in pending:
+        t.cancel()
+    await asyncio.gather(*pending, return_exceptions=True)
+
 
 async def _safe_close_client(client: AsyncOpenAI) -> None:
     """Close an old client after a grace period for in-flight requests.
@@ -117,8 +157,14 @@ async def _safe_close_client(client: AsyncOpenAI) -> None:
     `vision_request_timeout=180s` and chat completions can run for up
     to ~300s; 30s was too short and would tear down the httpx pool
     while a long PDF OCR was still in flight (round-3 P2 R26-P2-b).
+
+    On cancellation (lifespan shutdown drain), close immediately
+    without waiting out the grace — the process is exiting, so
+    in-flight requests will fail regardless and the FD leak is the
+    more important concern.
     """
-    await asyncio.sleep(300)
+    with contextlib.suppress(asyncio.CancelledError):
+        await asyncio.sleep(300)
     try:
         await client.close()
     except Exception:
@@ -135,13 +181,13 @@ def _evict_lru_if_needed(
     this, a typo'd-slug spray or a long-running process with high org
     churn slowly leaks file descriptors. Schedule the evicted client's
     close after the standard grace window so any in-flight call still
-    finishes (round-2 P1-25).
+    finishes (round-2 P1-25). The scheduling helper tracks the task
+    in `_PENDING_CLOSE_TASKS` so lifespan shutdown can drain it.
     """
     while len(states) > _ORG_CACHE_MAX:
         _victim_key, victim = states.popitem(last=False)
         logger.info("Evicting LRU {} client for org '{}'", label, _victim_key)
-        with contextlib.suppress(RuntimeError):
-            asyncio.get_running_loop().create_task(_safe_close_client(victim.client))
+        _schedule_safe_close(victim.client)
 
 
 def _get_or_build_client(
@@ -209,8 +255,7 @@ def _get_or_build_client(
 
     if old_client is not None:
         logger.info("{} rebuilt for org '{}': model={}", label, org_slug, model)
-        with contextlib.suppress(RuntimeError):
-            asyncio.get_running_loop().create_task(_safe_close_client(old_client))
+        _schedule_safe_close(old_client)
     else:
         logger.info("{} created for org '{}': model={}", label, org_slug, model)
 
@@ -548,18 +593,26 @@ async def process_chunk(chunk_idx: int, chunk_text: str) -> tuple[int, str]:
                 logger.info(f"LLM chunk {chunk_idx + 1}/{total_chunks} done: {len(chunk_text)} -> {len(result)} chars")
                 return chunk_idx, result
             except Exception as e:
-                # Round-3 P2 R26-P2-a: log at error level (was warning)
-                # and prepend an explicit failure marker so downstream
-                # storage / indexing can spot extractions that fell
-                # back to raw content. Previously the caller couldn't
-                # distinguish "LLM extracted this" from "LLM died,
-                # this is the raw input pretending to be extraction".
+                # Log loud + return empty string. Previously we returned
+                # `[LLM_EXTRACTION_FAILED: ...]\n` + raw chunk_text so
+                # downstream consumers could "spot the failure", but
+                # the marker travelled into embeddings + BM25 index +
+                # search results as user-visible content. Embedding
+                # the raw fallback text was worse — the unprocessed
+                # source carries none of the structure the LLM step
+                # was supposed to extract, so search relevance for
+                # those chunks regressed silently.
+                #
+                # The empty-string return drops the chunk entirely
+                # from the merged page text; the error log here is
+                # the operator-visible signal, and the caller's
+                # cache miss (no `set_llm`) means a retry will
+                # re-attempt extraction without poisoned state.
                 logger.error(
                     f"Failed to process chunk {chunk_idx + 1} with LLM ({type(e).__name__}: {e}); "
-                    f"returning raw content with failure marker",
+                    f"dropping chunk from output (no marker injected into indexed content)",
                 )
-                marker = f"[LLM_EXTRACTION_FAILED: {type(e).__name__}]\n"
-                return chunk_idx, marker + chunk_text
+                return chunk_idx, ""
 
     tasks = [process_chunk(idx, text) for idx, text in chunks]
     results = await asyncio.gather(*tasks)
diff --git a/services/rag/app/services/rag_service.py b/services/rag/app/services/rag_service.py
index 0c3816fc6f..a1694cc482 100644
--- a/services/rag/app/services/rag_service.py
+++ b/services/rag/app/services/rag_service.py
@@ -170,6 +170,14 @@ async def initialize(self) -> None:
         Per-org client construction is deferred until the first call for
         that org. The DB pool is global — all orgs share one
         knowledge-DB connection pool because the schema is global.
+
+        Re-init after a prior `shutdown()` is supported (tests, supervisor
+        restart with the same singleton). Reset `_shutting_down` and
+        clear the module-level shutdown event up front so the new
+        process state is "live" before any `_ensure_org_clients` checks
+        run — otherwise those would keep raising
+        `RuntimeError("RagService is shutting down")` forever despite
+        the pool being back.
         """
         if self.initialized:
             return
@@ -178,17 +186,12 @@ async def initialize(self) -> None:
             if self.initialized:
                 return
 
+            self._shutting_down = False
+            _get_shutdown_event().clear()
             self._pool = await init_pool()
             self.initialized = True
             logger.info("RagService initialized (DB pool ready; per-org clients lazy)")
 
-    @property
-    def embedding_service(self) -> EmbeddingService | None:
-        """Deprecated: kept for any callers that haven't been threaded
-        with `org_slug` yet. Returns None; callers must migrate.
-        """
-        return None
-
     def _get_org_lock(self, org_slug: str) -> asyncio.Lock:
         lock = self._org_locks.get(org_slug)
         if lock is not None:

From 0d7541c3f15b82d1364d467a06235c90b8680b1e Mon Sep 17 00:00:00 2001
From: larryro <371767072@qq.com>
Date: Fri, 29 May 2026 12:02:57 +0800
Subject: [PATCH 41/41] fix(cli): drop export on unused preflight interfaces
 (knip)

---
 tools/cli/src/lib/actions/legacy-layout-preflight.ts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/cli/src/lib/actions/legacy-layout-preflight.ts b/tools/cli/src/lib/actions/legacy-layout-preflight.ts
index dcd488bb24..4ddf60e61e 100644
--- a/tools/cli/src/lib/actions/legacy-layout-preflight.ts
+++ b/tools/cli/src/lib/actions/legacy-layout-preflight.ts
@@ -30,7 +30,7 @@ import * as logger from '../../utils/logger';
 import { LEGACY_DOMAIN_DIR_NAMES } from './deploy';
 import { migrateConfigLayout } from './migrate-config-layout';
 
-export interface LegacyLayoutPreflightOptions {
+interface LegacyLayoutPreflightOptions {
   /** Absolute path of the project root to scan. */
   projectDir: string;
   /** Skip the prompt and migrate immediately (non-interactive flag). */
@@ -42,7 +42,7 @@ export interface LegacyLayoutPreflightOptions {
   context: 'start' | 'deploy' | 'update';
 }
 
-export interface LegacyLayoutPreflightResult {
+interface LegacyLayoutPreflightResult {
   /** True iff a migration was actually performed in this call. */
   migrated: boolean;
 }