From 8f9e20feb1c4713a8069087a5b4cef9966315098 Mon Sep 17 00:00:00 2001
From: Tyler Longwell <tlongwell@squareup.com>
Date: Mon, 18 May 2026 09:31:35 -0400
Subject: [PATCH 01/10] refactor(huddle): switch local TTS to Pocket TTS
 (Kyutai/sherpa-onnx)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the Kokoro-via-`ort` TTS path with Pocket TTS (Kyutai Labs,
Apache-2.0) running through the same `sherpa-onnx` static library already
used by STT. This removes the entire `ort`/`ort-sys`/`ndarray` direct
dependency tree (and the runtime `libonnxruntime.dylib` it shipped) in
exchange for one statically-linked onnxruntime shared with STT.

Headline wins
-------------
* No more `libonnxruntime.dylib` in the macOS bundle — eliminates the
  ad-hoc-signed dylib + gatekeeper-quarantine class of "damaged app" failures
  on older Macs. Verified by `find … '*onnx*'`, `find … '*.dylib'`, and
  `otool -L Contents/MacOS/Sprout` on a full unsigned `tauri build` artifact:
  zero hits, no dynamic ORT link.
* `cargo tree -i ort` / `-i ort-sys` / `-i ndarray` all return "package ID
  specification did not match any packages." Cargo.lock shrinks by 334 lines
  (those crates + their now-orphaned support deps).
* Net -735 LOC for the TTS engine module itself (973-line
  `huddle/kokoro.rs` deleted, 238 added in `huddle/pocket.rs`); the patch as
  a whole is +700/-1436 including Cargo.lock.

Tradeoff: CoreML
----------------
Pocket runs CPU-only through sherpa's statically-linked onnxruntime — the
upstream k2-fsa static archive does not bundle the CoreML execution
provider, and building one ourselves would re-introduce the same dylib
distribution problem we are deleting. Bench on M1 8GB shows ~5× realtime
warm synthesis (~680ms for a 75-char sentence) and ~289ms one-time engine
load, which is acceptable for huddle TTS without CoreML.

Voice
-----
Bundled reference is the Kyutai-provided American-male sample. Previous
`af_heart` (Kokoro, American-female) is gone. Output is peak-normalized
per sentence to −6 dBFS in `tts.rs::normalize_for_playback`, with
`MAX_GAIN=8.0` capping over-amplification of near-silent buffers.

Model bundle
------------
~289 MB pre-download at first huddle (was ~187 MB for Kokoro): five ONNX
models (lm_main, lm_flow, encoder, decoder, text_conditioner), two JSON
tables (vocab + token_scores), one voice WAV, the upstream LICENSE, and a
Sprout `MODEL_LICENSE.txt` attribution sidecar. All files are
SHA-256-pinned in `huddle/models.rs`; readiness now fails closed without
the sidecar (`tts_readiness_requires_license_sidecar` test).

API rename
----------
`VoiceModelStatus.kokoro` → `.tts`; `ModelManager.kokoro_*` →
`ModelManager.tts_*`; `start_kokoro_download` → `start_tts_download`;
`is_kokoro_ready` → `is_tts_ready`; `modelStatus.kokoro` → `modelStatus.tts`
in the frontend. Status surface is now engine-agnostic, so a future TTS
swap won't need another rename cycle.

Verification
------------
- `cargo fmt --check` clean
- `cargo test --lib` 305/305 (was 299; +5 normalization, +1 sidecar
  readiness)
- `cargo test --lib huddle::tts` 24/24
- `cargo build --release --lib` clean in 23s
- `just desktop-tauri-check` clean
- `pnpm build` clean (frontend)
- Full unsigned `tauri build` (aarch64-apple-darwin) produces signing-ready
  Sprout.app + DMG; bundle inspection above confirms no ORT artifacts ship.

How to validate locally
-----------------------
1. Build + launch the desktop app.
2. Join a huddle. Watch stderr for `TTS warmup completed in {N}ms` —
   expect ≤1500ms on Apple Silicon with 8GB+. >1500ms suggests memory
   pressure / paging from concurrent STT+TTS init.
3. Confirm no `OfflineRecognizer::create returned None` in stderr (STT
   load path).
4. Activity Monitor: sprout RSS should settle <800 MB after huddle join.
   If >800 MB, the spawn order can be serialized (load TTS engine before
   spawning STT worker) as a one-line follow-up.

Signed-off-by: Tyler Longwell <tlongwell@squareup.com>
---
 desktop/scripts/check-file-sizes.mjs          |   5 +-
 desktop/src-tauri/Cargo.lock                  | 334 +-----
 desktop/src-tauri/Cargo.toml                  |   2 -
 desktop/src-tauri/examples/pocket_bench.rs    | 117 +++
 desktop/src-tauri/src/huddle/kokoro.rs        | 973 ------------------
 desktop/src-tauri/src/huddle/mod.rs           |  16 +-
 desktop/src-tauri/src/huddle/models.rs        | 276 +++--
 desktop/src-tauri/src/huddle/pipeline.rs      |  10 +-
 desktop/src-tauri/src/huddle/pocket.rs        | 238 +++++
 desktop/src-tauri/src/huddle/tts.rs           | 143 ++-
 desktop/src-tauri/src/lib.rs                  |   4 +-
 .../features/huddle/components/HuddleBar.tsx  |  18 +-
 12 files changed, 700 insertions(+), 1436 deletions(-)
 create mode 100644 desktop/src-tauri/examples/pocket_bench.rs
 delete mode 100644 desktop/src-tauri/src/huddle/kokoro.rs
 create mode 100644 desktop/src-tauri/src/huddle/pocket.rs

diff --git a/desktop/scripts/check-file-sizes.mjs b/desktop/scripts/check-file-sizes.mjs
index d27752bc8..1d20f3e4f 100644
--- a/desktop/scripts/check-file-sizes.mjs
+++ b/desktop/scripts/check-file-sizes.mjs
@@ -66,13 +66,12 @@ const overrides = new Map([
   ["src/features/settings/ui/ChannelTemplatesSettingsCard.tsx", 850], // template CRUD card + TemplateFormDialog (persona/team chip selectors + provider assignments + canvas template) + TemplateTeamSelector + ProviderAssignments + ProviderRow
   ["src/shared/api/types.ts", 620], // ... + RespondToMode + respondTo/respondToAllowlist on ManagedAgent/Create/Update inputs
   ["src-tauri/src/events.rs", 610], // event builders + build_huddle_guidelines (kind:48106) + post_event_raw transport helper + participant p-tag on join/leave + NIP-43 relay admin builders (add/remove/change-role) + check_relay_role + DM/presence/workflow command builders
-  ["src-tauri/src/huddle/kokoro.rs", 980], // Kokoro ONNX TTS engine + three-tier G2P + ARPAbet→IPA + CoreML + synth_chunk() public API + style validation + hyphenated compound splitting + 23 unit tests
   ["src-tauri/src/huddle/mod.rs", 1020], // huddle state machine + Tauri commands + sync protocol doc; state/relay/pipeline extracted + emit_huddle_state_changed wiring
-  ["src-tauri/src/huddle/models.rs", 900], // model download manager for Parakeet TDT-CTC STT + Kokoro TTS with streaming downloads + SHA-256 verification + Rust-native tar extraction + version manifest + atomic swap + hot-start signaling + CC-BY-4.0 attribution sidecar + idempotent legacy Moonshine dir cleanup
+  ["src-tauri/src/huddle/models.rs", 930], // model download manager for Parakeet TDT-CTC STT + Pocket TTS with streaming downloads + SHA-256 verification + Rust-native tar extraction + version manifest + atomic swap + hot-start signaling + MODEL_LICENSE.txt sidecar (fail-closed readiness) + idempotent legacy Moonshine dir cleanup + tts_readiness_requires_license_sidecar test
   ["src-tauri/src/huddle/stt.rs", 580], // STT pipeline + PTT edge-detection flush + PTT gating (is_speech AND ptt_active) + barge-in for VAD mode + rubato resampler + earshot VAD + sherpa-onnx transcription
   ["src-tauri/src/huddle/preprocessing.rs", 670], // TTS text preprocessing pipeline + unified split_sentences + int_to_words 0-999999 + URL trailing punctuation preservation + 23 unit tests
   ["src-tauri/src/huddle/relay_api.rs", 520], // audio relay recv task + per-peer frame counting for remote human TTS interrupt + NIP-98 channel member query
-  ["src-tauri/src/huddle/tts.rs", 1030], // TTS pipeline + session warmup + cancel/shutdown handling + apply_fades + 18 unit tests for remote interrupt mechanism
+  ["src-tauri/src/huddle/tts.rs", 1130], // TTS pipeline + session warmup + cancel/shutdown handling + apply_fades + normalize_for_playback (per-sentence peak normalization to -6 dBFS with MAX_GAIN cap) + 24 unit tests (18 interrupt + 6 fade/normalize)
   ["src-tauri/src/relay.rs", 510], // +4 lines for NIP-OA auth tag injection in profile sync (build_profile_event) + verification test
   ["src-tauri/src/commands/pairing.rs", 600], // NIP-AB pairing actor: 3 Tauri commands + background WS task + NIP-42 auth + NIP-43 probe + event parsing helpers
   ["src-tauri/src/lib.rs", 715], // +4 lines for PairingHandle managed state + 3 pairing command registrations
diff --git a/desktop/src-tauri/Cargo.lock b/desktop/src-tauri/Cargo.lock
index f14134309..a1ef2dac4 100644
--- a/desktop/src-tauri/Cargo.lock
+++ b/desktop/src-tauri/Cargo.lock
@@ -736,9 +736,9 @@ dependencies = [
 
 [[package]]
 name = "cc"
-version = "1.2.60"
+version = "1.2.62"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "43c5703da9466b66a946814e1adf53ea2c90f10063b86290cc9eb67ce3478a20"
+checksum = "a1dce859f0832a7d088c4f1119888ab94ef4b5d6795d1ce05afb7fe159d79f98"
 dependencies = [
  "find-msvc-tools",
  "jobserver",
@@ -956,7 +956,7 @@ dependencies = [
  "bitflags 2.11.1",
  "core-foundation 0.10.1",
  "core-graphics-types",
- "foreign-types 0.5.0",
+ "foreign-types",
  "libc",
 ]
 
@@ -1066,25 +1066,6 @@ dependencies = [
  "crossbeam-utils",
 ]
 
-[[package]]
-name = "crossbeam-deque"
-version = "0.8.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
-dependencies = [
- "crossbeam-epoch",
- "crossbeam-utils",
-]
-
-[[package]]
-name = "crossbeam-epoch"
-version = "0.9.18"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
-dependencies = [
- "crossbeam-utils",
-]
-
 [[package]]
 name = "crossbeam-utils"
 version = "0.8.21"
@@ -1236,16 +1217,6 @@ version = "0.1.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ac6b926516df9c60bfa16e107b21086399f8285a44ca9711344b9e553c5146e2"
 
-[[package]]
-name = "der"
-version = "0.8.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "71fd89660b2dc699704064e59e9dba0147b903e85319429e131620d022be411b"
-dependencies = [
- "pem-rfc7468",
- "zeroize",
-]
-
 [[package]]
 name = "deranged"
 version = "0.5.8"
@@ -1341,7 +1312,7 @@ dependencies = [
  "libc",
  "option-ext",
  "redox_users",
- "windows-sys 0.61.2",
+ "windows-sys 0.59.0",
 ]
 
 [[package]]
@@ -1474,12 +1445,6 @@ dependencies = [
  "libm",
 ]
 
-[[package]]
-name = "either"
-version = "1.15.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
-
 [[package]]
 name = "embed-resource"
 version = "3.0.8"
@@ -1560,7 +1525,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
 dependencies = [
  "libc",
- "windows-sys 0.61.2",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
@@ -1660,15 +1625,6 @@ version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
 
-[[package]]
-name = "foreign-types"
-version = "0.3.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
-dependencies = [
- "foreign-types-shared 0.1.1",
-]
-
 [[package]]
 name = "foreign-types"
 version = "0.5.0"
@@ -1676,7 +1632,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d737d9aa519fb7b749cbc3b962edcf310a8dd1f4b67c91c4f83975dbdd17d965"
 dependencies = [
  "foreign-types-macros",
- "foreign-types-shared 0.3.1",
+ "foreign-types-shared",
 ]
 
 [[package]]
@@ -1690,12 +1646,6 @@ dependencies = [
  "syn 2.0.117",
 ]
 
-[[package]]
-name = "foreign-types-shared"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
-
 [[package]]
 name = "foreign-types-shared"
 version = "0.3.1"
@@ -2199,9 +2149,9 @@ dependencies = [
 
 [[package]]
 name = "hashbrown"
-version = "0.17.0"
+version = "0.17.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51"
+checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a"
 
 [[package]]
 name = "heck"
@@ -2251,12 +2201,6 @@ dependencies = [
  "digest 0.10.7",
 ]
 
-[[package]]
-name = "hmac-sha256"
-version = "1.1.14"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ec9d92d097f4749b64e8cc33d924d9f40a2d4eb91402b458014b781f5733d60f"
-
 [[package]]
 name = "html5ever"
 version = "0.29.1"
@@ -2563,7 +2507,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9"
 dependencies = [
  "equivalent",
- "hashbrown 0.17.0",
+ "hashbrown 0.17.1",
  "serde",
  "serde_core",
 ]
@@ -2812,9 +2756,9 @@ dependencies = [
 
 [[package]]
 name = "libc"
-version = "0.2.185"
+version = "0.2.186"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "52ff2c0fe9bc6cb6b14a0592c2ff4fa9ceb83eea9db979b0487cd054946a2b8f"
+checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
 
 [[package]]
 name = "libdbus-sys"
@@ -2896,12 +2840,6 @@ dependencies = [
  "crc",
 ]
 
-[[package]]
-name = "lzma-rust2"
-version = "0.15.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1670343e58806300d87950e3401e820b519b9384281bbabfb15e3636689ffd69"
-
 [[package]]
 name = "lzma-sys"
 version = "0.1.20"
@@ -2988,16 +2926,6 @@ version = "0.8.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3"
 
-[[package]]
-name = "matrixmultiply"
-version = "0.3.10"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a06de3016e9fae57a36fd14dba131fccf49f74b40b7fbdb472f96e361ec71a08"
-dependencies = [
- "autocfg",
- "rawpointer",
-]
-
 [[package]]
 name = "memchr"
 version = "2.8.0"
@@ -3064,40 +2992,7 @@ dependencies = [
  "png 0.18.1",
  "serde",
  "thiserror 2.0.18",
- "windows-sys 0.61.2",
-]
-
-[[package]]
-name = "native-tls"
-version = "0.2.18"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "465500e14ea162429d264d44189adc38b199b62b1c21eea9f69e4b73cb03bbf2"
-dependencies = [
- "libc",
- "log",
- "openssl",
- "openssl-probe",
- "openssl-sys",
- "schannel",
- "security-framework",
- "security-framework-sys",
- "tempfile",
-]
-
-[[package]]
-name = "ndarray"
-version = "0.17.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "520080814a7a6b4a6e9070823bb24b4531daac8c4627e08ba5de8c5ef2f2752d"
-dependencies = [
- "matrixmultiply",
- "num-complex",
- "num-integer",
- "num-traits",
- "portable-atomic",
- "portable-atomic-util",
- "rawpointer",
- "rayon",
+ "windows-sys 0.60.2",
 ]
 
 [[package]]
@@ -3608,50 +3503,12 @@ dependencies = [
  "pathdiff",
 ]
 
-[[package]]
-name = "openssl"
-version = "0.10.77"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bfe4646e360ec77dff7dde40ed3d6c5fee52d156ef4a62f53973d38294dad87f"
-dependencies = [
- "bitflags 2.11.1",
- "cfg-if",
- "foreign-types 0.3.2",
- "libc",
- "once_cell",
- "openssl-macros",
- "openssl-sys",
-]
-
-[[package]]
-name = "openssl-macros"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn 2.0.117",
-]
-
 [[package]]
 name = "openssl-probe"
 version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe"
 
-[[package]]
-name = "openssl-sys"
-version = "0.9.113"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ad2f2c0eba47118757e4c6d2bff2838f3e0523380021356e7875e858372ce644"
-dependencies = [
- "cc",
- "libc",
- "pkg-config",
- "vcpkg",
-]
-
 [[package]]
 name = "option-ext"
 version = "0.2.0"
@@ -3687,30 +3544,6 @@ dependencies = [
  "pin-project-lite",
 ]
 
-[[package]]
-name = "ort"
-version = "2.0.0-rc.12"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d7de3af33d24a745ffb8fab904b13478438d1cd52868e6f17735ef6e1f8bf133"
-dependencies = [
- "ndarray",
- "ort-sys",
- "smallvec",
- "tracing",
- "ureq 3.3.0",
-]
-
-[[package]]
-name = "ort-sys"
-version = "2.0.0-rc.12"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d7b497d21a8b6fbb4b5a544f8fadb77e801a09ae0add9e411d31c6f89e3c1e90"
-dependencies = [
- "hmac-sha256",
- "lzma-rust2",
- "ureq 3.3.0",
-]
-
 [[package]]
 name = "osakit"
 version = "0.3.1"
@@ -3806,15 +3639,6 @@ dependencies = [
  "hmac",
 ]
 
-[[package]]
-name = "pem-rfc7468"
-version = "1.0.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a6305423e0e7738146434843d1694d621cce767262b2a86910beab705e4493d9"
-dependencies = [
- "base64ct",
-]
-
 [[package]]
 name = "percent-encoding"
 version = "2.3.2"
@@ -4087,21 +3911,6 @@ dependencies = [
  "universal-hash",
 ]
 
-[[package]]
-name = "portable-atomic"
-version = "1.13.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49"
-
-[[package]]
-name = "portable-atomic-util"
-version = "0.2.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "091397be61a01d4be58e7841595bd4bfedb15f1cd54977d79b8271e94ed799a3"
-dependencies = [
- "portable-atomic",
-]
-
 [[package]]
 name = "potential_utf"
 version = "0.1.5"
@@ -4457,32 +4266,6 @@ version = "0.6.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "20675572f6f24e9e76ef639bc5552774ed45f1c30e2951e1e99c59888861c539"
 
-[[package]]
-name = "rawpointer"
-version = "0.2.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3"
-
-[[package]]
-name = "rayon"
-version = "1.12.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d"
-dependencies = [
- "either",
- "rayon-core",
-]
-
-[[package]]
-name = "rayon-core"
-version = "1.13.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
-dependencies = [
- "crossbeam-deque",
- "crossbeam-utils",
-]
-
 [[package]]
 name = "realfft"
 version = "3.5.0"
@@ -4777,7 +4560,7 @@ dependencies = [
  "errno",
  "libc",
  "linux-raw-sys",
- "windows-sys 0.61.2",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
@@ -4810,9 +4593,9 @@ dependencies = [
 
 [[package]]
 name = "rustls-pki-types"
-version = "1.14.0"
+version = "1.14.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd"
+checksum = "30a7197ae7eb376e574fe940d068c30fe0462554a3ddbe4eca7838e049c937a9"
 dependencies = [
  "web-time",
  "zeroize",
@@ -4836,7 +4619,7 @@ dependencies = [
  "security-framework",
  "security-framework-sys",
  "webpki-root-certs",
- "windows-sys 0.61.2",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
@@ -5312,7 +5095,7 @@ checksum = "384b71e9586b28493902080ed89e9b4457d2c684521d4c10df62c381b6fdcb51"
 dependencies = [
  "bzip2 0.4.4",
  "tar",
- "ureq 2.12.1",
+ "ureq",
 ]
 
 [[package]]
@@ -5368,18 +5151,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e"
 dependencies = [
  "libc",
- "windows-sys 0.61.2",
-]
-
-[[package]]
-name = "socks"
-version = "0.3.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f0c3dbbd9ae980613c6dd8e28a9407b50509d3803b57624d5dfe8315218cd58b"
-dependencies = [
- "byteorder",
- "libc",
- "winapi",
+ "windows-sys 0.60.2",
 ]
 
 [[package]]
@@ -5446,11 +5218,9 @@ dependencies = [
  "hex",
  "infer",
  "libc",
- "ndarray",
  "nostr 0.36.0",
  "nostr 0.37.0",
  "opus",
- "ort",
  "png 0.18.1",
  "regex",
  "reqwest 0.13.2",
@@ -6369,7 +6139,7 @@ dependencies = [
  "getrandom 0.4.2",
  "once_cell",
  "rustix",
- "windows-sys 0.61.2",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
@@ -6792,7 +6562,7 @@ dependencies = [
  "png 0.18.1",
  "serde",
  "thiserror 2.0.18",
- "windows-sys 0.61.2",
+ "windows-sys 0.60.2",
 ]
 
 [[package]]
@@ -6858,7 +6628,7 @@ checksum = "f2f6fb2847f6742cd76af783a2a2c49e9375d0a111c7bef6f71cd9e738c72d6e"
 dependencies = [
  "memoffset",
  "tempfile",
- "windows-sys 0.61.2",
+ "windows-sys 0.60.2",
 ]
 
 [[package]]
@@ -6967,36 +6737,6 @@ dependencies = [
  "webpki-roots 0.26.11",
 ]
 
-[[package]]
-name = "ureq"
-version = "3.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dea7109cdcd5864d4eeb1b58a1648dc9bf520360d7af16ec26d0a9354bafcfc0"
-dependencies = [
- "base64 0.22.1",
- "der",
- "log",
- "native-tls",
- "percent-encoding",
- "rustls-pki-types",
- "socks",
- "ureq-proto",
- "utf8-zero",
- "webpki-root-certs",
-]
-
-[[package]]
-name = "ureq-proto"
-version = "0.6.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e994ba84b0bd1b1b0cf92878b7ef898a5c1760108fe7b6010327e274917a808c"
-dependencies = [
- "base64 0.22.1",
- "http",
- "httparse",
- "log",
-]
-
 [[package]]
 name = "url"
 version = "2.5.8"
@@ -7028,12 +6768,6 @@ version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
 
-[[package]]
-name = "utf8-zero"
-version = "0.8.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b8c0a043c9540bae7c578c88f91dda8bd82e59ae27c21baca69c8b191aaf5a6e"
-
 [[package]]
 name = "utf8_iter"
 version = "1.0.4"
@@ -7052,12 +6786,6 @@ dependencies = [
  "wasm-bindgen",
 ]
 
-[[package]]
-name = "vcpkg"
-version = "0.2.15"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
-
 [[package]]
 name = "version-compare"
 version = "0.2.1"
@@ -7134,11 +6862,11 @@ checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
 
 [[package]]
 name = "wasip2"
-version = "1.0.2+wasi-0.2.9"
+version = "1.0.3+wasi-0.2.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5"
+checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6"
 dependencies = [
- "wit-bindgen",
+ "wit-bindgen 0.57.1",
 ]
 
 [[package]]
@@ -7147,7 +6875,7 @@ version = "0.4.0+wasi-0.3.0-rc-2026-01-06"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5"
 dependencies = [
- "wit-bindgen",
+ "wit-bindgen 0.51.0",
 ]
 
 [[package]]
@@ -7330,9 +7058,9 @@ dependencies = [
 
 [[package]]
 name = "webpki-root-certs"
-version = "1.0.6"
+version = "1.0.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "804f18a4ac2676ffb4e8b5b5fa9ae38af06df08162314f96a68d2a363e21a8ca"
+checksum = "f31141ce3fc3e300ae89b78c0dd67f9708061d1d2eda54b8209346fd6be9a92c"
 dependencies = [
  "rustls-pki-types",
 ]
@@ -7413,7 +7141,7 @@ version = "0.1.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
 dependencies = [
- "windows-sys 0.61.2",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
@@ -7942,6 +7670,12 @@ dependencies = [
  "wit-bindgen-rust-macro",
 ]
 
+[[package]]
+name = "wit-bindgen"
+version = "0.57.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e"
+
 [[package]]
 name = "wit-bindgen-core"
 version = "0.51.0"
diff --git a/desktop/src-tauri/Cargo.toml b/desktop/src-tauri/Cargo.toml
index 8720843b0..f1d6067c6 100644
--- a/desktop/src-tauri/Cargo.toml
+++ b/desktop/src-tauri/Cargo.toml
@@ -63,8 +63,6 @@ uuid = { version = "1", features = ["v4"] }
 png = "0.18"
 zip = "2"
 sherpa-onnx = "1.12"
-ort = { version = "=2.0.0-rc.12", default-features = false, features = ["std", "ndarray", "tracing", "download-binaries", "tls-native", "copy-dylibs", "api-23", "coreml"] }
-ndarray = { version = "0.17", features = ["rayon"] }
 regex = "1"
 axum = "0.8"
 rodio = "0.22"
diff --git a/desktop/src-tauri/examples/pocket_bench.rs b/desktop/src-tauri/examples/pocket_bench.rs
new file mode 100644
index 000000000..3df111409
--- /dev/null
+++ b/desktop/src-tauri/examples/pocket_bench.rs
@@ -0,0 +1,117 @@
+//! Cold-vs-warm latency bench for Pocket TTS.
+//!
+//! This duplicates the small config-building snippet from `huddle::pocket` so it
+//! doesn't depend on changing module visibility for a one-off dev tool.
+//! Keep in sync with `huddle::pocket::load_text_to_speech`.
+//!
+//! Run with the model files in a directory (defaults to /tmp/pocket-tts-bench):
+//!   cargo run --release --example pocket_bench
+//!   cargo run --release --example pocket_bench /path/to/pocket-tts
+
+use std::path::PathBuf;
+use std::time::Instant;
+
+use sherpa_onnx::{
+    self, GenerationConfig, OfflineTts, OfflineTtsConfig, OfflineTtsModelConfig,
+    OfflineTtsPocketModelConfig, Wave,
+};
+
+const SAMPLE_RATE: u32 = 24_000;
+const TEST_TEXT: &str =
+    "Hello, this is a test of the new Pocket TTS engine running on sherpa-onnx.";
+
+fn main() {
+    let model_dir = std::env::args()
+        .nth(1)
+        .unwrap_or_else(|| "/tmp/pocket-tts-bench".to_string());
+    println!("Model dir: {model_dir}");
+
+    let dir = PathBuf::from(&model_dir);
+    let p = |name: &str| dir.join(name).to_string_lossy().into_owned();
+
+    let t0 = Instant::now();
+    let mut cfg = OfflineTtsConfig::default();
+    cfg.model = OfflineTtsModelConfig {
+        pocket: OfflineTtsPocketModelConfig {
+            lm_main: Some(p("lm_main.int8.onnx")),
+            lm_flow: Some(p("lm_flow.int8.onnx")),
+            encoder: Some(p("encoder.onnx")),
+            decoder: Some(p("decoder.int8.onnx")),
+            text_conditioner: Some(p("text_conditioner.onnx")),
+            vocab_json: Some(p("vocab.json")),
+            token_scores_json: Some(p("token_scores.json")),
+            voice_embedding_cache_capacity: 16,
+        },
+        num_threads: 1,
+        debug: false,
+        ..Default::default()
+    };
+    let engine = OfflineTts::create(&cfg).expect("engine create");
+    let load_ms = t0.elapsed().as_secs_f32() * 1000.0;
+    println!("Engine load:      {load_ms:.1} ms");
+
+    let t0 = Instant::now();
+    let voice_path = dir.join("reference_sample.wav");
+    let wave = Wave::read(voice_path.to_str().unwrap()).expect("voice WAV");
+    let samples = wave.samples().to_vec();
+    let sr = wave.sample_rate();
+    let voice_ms = t0.elapsed().as_secs_f32() * 1000.0;
+    println!("Voice load:       {voice_ms:.1} ms");
+
+    let gen = || GenerationConfig {
+        speed: 1.05,
+        num_steps: 1,
+        silence_scale: 0.0,
+        reference_audio: Some(samples.clone()),
+        reference_sample_rate: sr,
+        ..Default::default()
+    };
+
+    // Cold synth — first generate call after engine creation; timed on its own for comparison with the warm call.
+    let t0 = Instant::now();
+    let cold = engine
+        .generate_with_config(TEST_TEXT, &gen(), None::<fn(&[f32], f32) -> bool>)
+        .expect("cold synth");
+    let cold_ms = t0.elapsed().as_secs_f32() * 1000.0;
+    let cold_audio_ms = (cold.samples().len() as f32 / SAMPLE_RATE as f32) * 1000.0;
+    let cold_rtf_x = cold_audio_ms / cold_ms;
+    println!(
+        "Cold synth:       {cold_ms:.1} ms  → {cold_audio_ms:.1} ms audio  → {cold_rtf_x:.2}× realtime"
+    );
+
+    // Warm synth — second call, same text, fresh config from `gen()`.
+    let t0 = Instant::now();
+    let warm = engine
+        .generate_with_config(TEST_TEXT, &gen(), None::<fn(&[f32], f32) -> bool>)
+        .expect("warm synth");
+    let warm_ms = t0.elapsed().as_secs_f32() * 1000.0;
+    let warm_audio_ms = (warm.samples().len() as f32 / SAMPLE_RATE as f32) * 1000.0;
+    let warm_rtf_x = warm_audio_ms / warm_ms;
+    println!(
+        "Warm synth:       {warm_ms:.1} ms  → {warm_audio_ms:.1} ms audio  → {warm_rtf_x:.2}× realtime"
+    );
+
+    // Write warm output to a fixed /tmp path for listening; the write result is only printed (ok=), not asserted.
+    let out_path = "/tmp/pocket_bench_out.wav";
+    let ok = sherpa_onnx::write(out_path, warm.samples(), SAMPLE_RATE as i32);
+    println!(
+        "Wrote {} ({} samples, ok={ok})",
+        out_path,
+        warm.samples().len()
+    );
+
+    let delta_ms = cold_ms - warm_ms;
+    let delta_pct = (delta_ms / warm_ms) * 100.0;
+    println!();
+    println!("Cold/warm delta:  {delta_ms:+.1} ms  ({delta_pct:+.1}%)");
+    println!(
+        "Decision: warmup {}.",
+        if delta_ms > 200.0 {
+            "RECOMMENDED — significant cold-call penalty"
+        } else if delta_ms > 50.0 {
+            "OPTIONAL — small cold-call penalty"
+        } else {
+            "UNNECESSARY — cold and warm essentially equal"
+        }
+    );
+}
diff --git a/desktop/src-tauri/src/huddle/kokoro.rs b/desktop/src-tauri/src/huddle/kokoro.rs
deleted file mode 100644
index 464407e0d..000000000
--- a/desktop/src-tauri/src/huddle/kokoro.rs
+++ /dev/null
@@ -1,973 +0,0 @@
-//! Kokoro-82M ONNX TTS engine — single-session inference with IPA G2P.
-//!
-//! Mental model:
-//!
-//!   load_text_to_speech(model_dir) → KokoroTTS
-//!   load_voice_style(path)         → VoiceStyle
-//!   tts.synth_chunk(text, lang, &style, steps, speed) → Vec<f32> @ 24 kHz
-//!
-//!   ┌──────────┐   G2P    ┌──────────┐  tokenize  ┌──────────┐
-//!   │ raw text │ ──────→  │ IPA str  │ ─────────→ │ int64[]  │
-//!   └──────────┘ lexicon  └──────────┘  115-char   └────┬─────┘
-//!                                                        │
-//!   ┌──────────┐  style   ┌──────────┐  ONNX      ┌────▼─────┐
-//!   │ .bin file│ ──────→  │ [1, 256] │ ─────────→ │ Vec<f32> │
-//!   └──────────┘ indexed  └──────────┘  session    └──────────┘
-//!               by token count                      24 kHz PCM
-//!
-//! G2P strategy: dictionary lookup (us_gold.json, Apache-2.0 via misaki).
-//! OOV words are spelled letter-by-letter using a static IPA table.
-//! No espeak dependency — fully GPL-free.
-
-use std::collections::HashMap;
-use std::fs;
-use std::path::{Path, PathBuf};
-
-use ndarray::{Array1, Array2};
-use ort::{session::Session, value::Value};
-
-// ── Public constants ──────────────────────────────────────────────────────────
-
-pub const SAMPLE_RATE: u32 = 24_000;
-pub const DEFAULT_VOICE: &str = "af_heart";
-
-// Maximum phoneme tokens before padding (model context = 512, minus 2 pad tokens).
-const MAX_PHONEME_TOKENS: usize = 510;
-
-// ── VoiceStyle ────────────────────────────────────────────────────────────────
-
-/// Raw f32 voice embedding loaded from a `<name>.bin` file.
-///
-/// The binary is a flat array of shape `[-1, 256]` in row-major order.
-/// Row `i` is the style vector for an utterance with `i` phoneme tokens.
-/// This encodes both speaker identity and sequence-length-dependent prosody.
-#[derive(Debug)]
-pub struct VoiceStyle {
-    data: Vec<f32>, // flat: row i = data[i*256 .. (i+1)*256]
-}
-
-impl VoiceStyle {
-    /// Return the 256-dim style vector for a given phoneme token count.
-    /// Clamps to the last available row if `token_count` is out of range.
-    fn get(&self, token_count: usize) -> &[f32] {
-        let max_rows = self.data.len() / 256;
-        let idx = token_count.min(max_rows.saturating_sub(1));
-        &self.data[idx * 256..(idx + 1) * 256]
-    }
-}
-
-/// Load a voice style from a raw little-endian f32 binary file.
-pub fn load_voice_style(path: &Path) -> Result<VoiceStyle, String> {
-    let bytes = fs::read(path).map_err(|e| format!("read voice {}: {e}", path.display()))?;
-    if bytes.len() % 4 != 0 {
-        return Err(format!(
-            "voice file {} has non-multiple-of-4 byte count ({})",
-            path.display(),
-            bytes.len()
-        ));
-    }
-    let data: Vec<f32> = bytes
-        .chunks_exact(4)
-        .map(|b| f32::from_le_bytes(b.try_into().unwrap()))
-        .collect();
-    if data.len() < 256 {
-        return Err(format!(
-            "voice file {} too small ({} floats, need at least 256)",
-            path.display(),
-            data.len()
-        ));
-    }
-    if data.len() % 256 != 0 {
-        return Err(format!(
-            "voice style has {} floats — expected a multiple of 256 (got {} remainder)",
-            data.len(),
-            data.len() % 256,
-        ));
-    }
-    Ok(VoiceStyle { data })
-}
-
-// ── Tokenizer ─────────────────────────────────────────────────────────────────
-
-/// Static 115-entry IPA char → int64 lookup table.
-/// IDs are non-contiguous (0–177); unknown chars are silently dropped.
-/// Pad token '$' = 0 is prepended and appended to every sequence.
-fn build_vocab() -> HashMap<char, i64> {
-    // Source: onnx-community/Kokoro-82M-v1.0-ONNX tokenizer.json
-    #[rustfmt::skip]
-    let entries: &[(char, i64)] = &[
-        ('$', 0),
-        (';', 1), (':', 2), (',', 3), ('.', 4), ('!', 5), ('?', 6),
-        ('—', 9), ('…', 10), ('"', 11), ('(', 12), (')', 13), ('\u{201c}', 14), ('\u{201d}', 15),
-        (' ', 16), ('\u{0303}', 17),
-        ('ʣ', 18), ('ʥ', 19), ('ʦ', 20), ('ʨ', 21), ('ᵝ', 22), ('ꭧ', 23),
-        ('A', 24), ('I', 25), ('O', 31), ('Q', 33), ('S', 35), ('T', 36),
-        ('W', 39), ('Y', 41), ('ᵊ', 42),
-        ('a', 43), ('b', 44), ('c', 45), ('d', 46), ('e', 47), ('f', 48),
-        ('h', 50), ('i', 51), ('j', 52), ('k', 53), ('l', 54), ('m', 55),
-        ('n', 56), ('o', 57), ('p', 58), ('q', 59), ('r', 60), ('s', 61),
-        ('t', 62), ('u', 63), ('v', 64), ('w', 65), ('x', 66), ('y', 67), ('z', 68),
-        ('ɑ', 69), ('ɐ', 70), ('ɒ', 71), ('æ', 72), ('β', 75), ('ɔ', 76),
-        ('ɕ', 77), ('ç', 78), ('ɖ', 80), ('ð', 81), ('ʤ', 82), ('ə', 83),
-        ('ɚ', 85), ('ɛ', 86), ('ɜ', 87), ('ɟ', 90), ('ɡ', 92), ('ɥ', 99),
-        ('ɨ', 101), ('ɪ', 102), ('ʝ', 103), ('ɯ', 110), ('ɰ', 111),
-        ('ŋ', 112), ('ɳ', 113), ('ɲ', 114), ('ɴ', 115), ('ø', 116),
-        ('ɸ', 118), ('θ', 119), ('œ', 120), ('ɹ', 123), ('ɾ', 125),
-        ('ɻ', 126), ('ʁ', 128), ('ɽ', 129), ('ʂ', 130), ('ʃ', 131),
-        ('ʈ', 132), ('ʧ', 133), ('ʊ', 135), ('ʋ', 136), ('ʌ', 138),
-        ('ɣ', 139), ('ɤ', 140), ('χ', 142), ('ʎ', 143), ('ʒ', 147),
-        ('ʔ', 148), ('ˈ', 156), ('ˌ', 157), ('ː', 158), ('ʰ', 162),
-        ('ʲ', 164), ('↓', 169), ('→', 171), ('↗', 172), ('↘', 173), ('ᵻ', 177),
-    ];
-    entries.iter().copied().collect()
-}
-
-/// Convert an IPA phoneme string to a padded int64 token sequence.
-/// Returns `[0, id1, id2, ..., idN, 0]` clamped to MAX_PHONEME_TOKENS+2.
-/// The pre-pad token count (ids.len() - 2) is used to index the style vector.
-fn tokenize(phonemes: &str, vocab: &HashMap<char, i64>) -> Vec<i64> {
-    let mut ids: Vec<i64> = vec![0]; // BOS pad
-    for id in phonemes
-        .chars()
-        .filter_map(|c| vocab.get(&c).copied())
-        .take(MAX_PHONEME_TOKENS)
-    {
-        ids.push(id);
-    }
-    ids.push(0); // EOS pad
-    ids
-}
-
-// ── G2P Lexicon ───────────────────────────────────────────────────────────────
-
-/// Grapheme-to-phoneme engine with a four-tier fallback chain:
-///
-///   1. Misaki gold+silver dicts (183K words, Kokoro-native IPA)
-///   2. CMUdict (135K words, ARPAbet→Kokoro IPA) — covers inflected forms
-///   3. Morphological suffix stripping (-s/-ed/-ing) + retry tiers 1-2
-///   4. Letter-by-letter spelling
-///
-/// All dictionaries are Apache-2.0 or BSD licensed. No GPL.
-struct Lexicon {
-    /// Misaki gold+silver merged dictionary (Kokoro-native IPA).
-    misaki: HashMap<String, String>,
-    /// CMU Pronouncing Dictionary (ARPAbet converted to Kokoro IPA at load time).
-    cmudict: HashMap<String, String>,
-}
-
-/// IPA pronunciations for individual letter names (used for OOV words).
-fn letter_ipa(c: char) -> &'static str {
-    match c {
-        'a' => "ˈeɪ",
-        'b' => "bˈiː",
-        'c' => "sˈiː",
-        'd' => "dˈiː",
-        'e' => "ˈiː",
-        'f' => "ˈɛf",
-        'g' => "dʒˈiː",
-        'h' => "ˈeɪtʃ",
-        'i' => "ˈaɪ",
-        'j' => "dʒˈeɪ",
-        'k' => "kˈeɪ",
-        'l' => "ˈɛl",
-        'm' => "ˈɛm",
-        'n' => "ˈɛn",
-        'o' => "ˈoʊ",
-        'p' => "pˈiː",
-        'q' => "kjˈuː",
-        'r' => "ˈɑːɹ",
-        's' => "ˈɛs",
-        't' => "tˈiː",
-        'u' => "jˈuː",
-        'v' => "vˈiː",
-        'w' => "dˈʌbəljˌuː",
-        'x' => "ˈɛks",
-        'y' => "wˈaɪ",
-        'z' => "zˈiː",
-        _ => "",
-    }
-}
-
-/// Punctuation chars that are valid Kokoro vocab tokens and should pass through.
-fn is_passthrough_punct(c: char) -> bool {
-    matches!(c, ';' | ':' | ',' | '.' | '!' | '?' | '—' | '…' | ' ')
-}
-
-/// Vowels that trigger US English /t/→/ɾ/ flapping (misaki's US_TAUS).
-const US_TAUS: &str = "AIOWYiuæɑəɛɪɹʊʌ";
-
-/// ARPAbet → Kokoro IPA conversion. Stress digit is stripped before lookup.
-fn arpabet_to_ipa(phoneme: &str) -> &'static str {
-    match phoneme {
-        "AA" => "ɑ",
-        "AE" => "æ",
-        "AH" => "ʌ",
-        "AO" => "ɔ",
-        "AW" => "W",
-        "AY" => "I",
-        "EH" => "ɛ",
-        "ER" => "ɜɹ",
-        "EY" => "A",
-        "IH" => "ɪ",
-        "IY" => "i",
-        "OW" => "O",
-        "OY" => "Y",
-        "UH" => "ʊ",
-        "UW" => "u",
-        "B" => "b",
-        "CH" => "ʧ",
-        "D" => "d",
-        "DH" => "ð",
-        "F" => "f",
-        "G" => "ɡ",
-        "HH" => "h",
-        "JH" => "ʤ",
-        "K" => "k",
-        "L" => "l",
-        "M" => "m",
-        "N" => "n",
-        "NG" => "ŋ",
-        "P" => "p",
-        "R" => "ɹ",
-        "S" => "s",
-        "SH" => "ʃ",
-        "T" => "t",
-        "TH" => "θ",
-        "V" => "v",
-        "W" => "w",
-        "Y" => "j",
-        "Z" => "z",
-        "ZH" => "ʒ",
-        _ => "",
-    }
-}
-
-/// Convert a CMUdict ARPAbet pronunciation line to Kokoro IPA.
-/// Input: "K R IY0 EY1 T AH0 D" → Output: "kɹiˈAtəd"
-fn arpabet_line_to_ipa(arpabet: &str) -> String {
-    let mut out = String::new();
-    for token in arpabet.split_whitespace() {
-        // Split phoneme from stress digit (e.g., "EY1" → "EY", Some('1'))
-        let (base, stress) = if token.ends_with(|c: char| c.is_ascii_digit()) {
-            (&token[..token.len() - 1], token.as_bytes().last().copied())
-        } else {
-            (token, None)
-        };
-        // Stress marker goes BEFORE the vowel's IPA
-        match stress {
-            Some(b'1') => out.push('ˈ'), // primary
-            Some(b'2') => out.push('ˌ'), // secondary
-            _ => {}
-        }
-        // AH with stress=0 is schwa (ə), not ʌ
-        if base == "AH" && stress == Some(b'0') {
-            out.push('ə');
-        } else if base == "ER" && stress == Some(b'0') {
-            // Unstressed ER is just əɹ
-            out.push_str("əɹ");
-        } else {
-            out.push_str(arpabet_to_ipa(base));
-        }
-    }
-    out
-}
-
-impl Lexicon {
-    /// Load misaki gold+silver dicts and CMUdict.
-    fn load(gold_path: &Path, silver_path: &Path, cmudict_path: &Path) -> Result<Self, String> {
-        let mut misaki = Self::load_json(silver_path)?;
-        let gold = Self::load_json(gold_path)?;
-        misaki.extend(gold);
-
-        let cmudict = if cmudict_path.exists() {
-            Self::load_cmudict(cmudict_path)?
-        } else {
-            eprintln!(
-                "sprout-desktop: CMUdict not found at {} — inflected forms may be spelled out",
-                cmudict_path.display()
-            );
-            HashMap::new()
-        };
-
-        eprintln!(
-            "sprout-desktop: G2P loaded — misaki: {} words, cmudict: {} words",
-            misaki.len(),
-            cmudict.len()
-        );
-        Ok(Lexicon { misaki, cmudict })
-    }
-
-    fn load_json(path: &Path) -> Result<HashMap<String, String>, String> {
-        let content =
-            fs::read_to_string(path).map_err(|e| format!("read {}: {e}", path.display()))?;
-        let raw: serde_json::Value =
-            serde_json::from_str(&content).map_err(|e| format!("parse {}: {e}", path.display()))?;
-        let obj = raw
-            .as_object()
-            .ok_or_else(|| format!("{}: expected JSON object", path.display()))?;
-        let mut dict = HashMap::with_capacity(obj.len());
-        for (word, val) in obj {
-            let ipa = match val {
-                serde_json::Value::String(s) => s.clone(),
-                serde_json::Value::Object(m) => m
-                    .get("DEFAULT")
-                    .or_else(|| m.values().next())
-                    .and_then(|v| v.as_str())
-                    .unwrap_or("")
-                    .to_string(),
-                _ => continue,
-            };
-            if !ipa.is_empty() {
-                dict.insert(word.to_lowercase(), ipa);
-            }
-        }
-        Ok(dict)
-    }
-
-    /// Load CMUdict and convert ARPAbet → Kokoro IPA at load time.
-    /// Format: "WORD PH1 PH2 PH3\n" (single space between word and phonemes).
-    /// Variant pronunciations like "WORD(2)" are skipped — we take the first.
-    fn load_cmudict(path: &Path) -> Result<HashMap<String, String>, String> {
-        let content = fs::read_to_string(path).map_err(|e| format!("read cmudict: {e}"))?;
-        let mut dict = HashMap::with_capacity(140_000);
-        for line in content.lines() {
-            // Skip comments and blank lines
-            if line.starts_with(";;;") || line.is_empty() {
-                continue;
-            }
-            // Split on first space
-            let (word, phonemes) = match line.find(' ') {
-                Some(i) => (&line[..i], line[i + 1..].trim()),
-                None => continue,
-            };
-            // Skip variant pronunciations like "WORD(2)"
-            if word.contains('(') {
-                continue;
-            }
-            let key = word.to_lowercase();
-            let ipa = arpabet_line_to_ipa(phonemes);
-            if !ipa.is_empty() {
-                dict.entry(key).or_insert(ipa);
-            }
-        }
-        Ok(dict)
-    }
-
-    /// Look up a word across all tiers. Returns None if not found anywhere.
-    fn lookup(&self, word: &str) -> Option<String> {
-        self.misaki
-            .get(word)
-            .cloned()
-            .or_else(|| self.cmudict.get(word).cloned())
-    }
-
-    /// Apply English -s/-es/-ies suffix phoneme rules (misaki's `_s`).
-    fn apply_s(stem_ipa: &str) -> String {
-        let last = stem_ipa.chars().last().unwrap_or(' ');
-        if "ptkfθ".contains(last) {
-            format!("{stem_ipa}s")
-        } else if "szʃʒʧʤ".contains(last) {
-            format!("{stem_ipa}ᵻz")
-        } else {
-            format!("{stem_ipa}z")
-        }
-    }
-
-    /// Apply English -ed suffix phoneme rules (misaki's `_ed`).
-    fn apply_ed(stem_ipa: &str) -> String {
-        let chars: Vec<char> = stem_ipa.chars().collect();
-        let last = *chars.last().unwrap_or(&' ');
-        if "pkfθʃsʧ".contains(last) {
-            format!("{stem_ipa}t")
-        } else if last == 'd' {
-            format!("{stem_ipa}ᵻd")
-        } else if last != 't' {
-            format!("{stem_ipa}d")
-        } else if chars.len() >= 2 && US_TAUS.contains(chars[chars.len() - 2]) {
-            // US flap: "created" → kɹiˈAɾᵻd
-            let mut out: String = chars[..chars.len() - 1].iter().collect();
-            out.push_str("ɾᵻd");
-            out
-        } else {
-            format!("{stem_ipa}ᵻd")
-        }
-    }
-
-    /// Apply English -ing suffix phoneme rules (misaki's `_ing`).
-    fn apply_ing(stem_ipa: &str) -> String {
-        let chars: Vec<char> = stem_ipa.chars().collect();
-        let last = *chars.last().unwrap_or(&' ');
-        if last == 't' && chars.len() >= 2 && US_TAUS.contains(chars[chars.len() - 2]) {
-            // US flap: "creating" → kɹiˈAɾɪŋ
-            let mut out: String = chars[..chars.len() - 1].iter().collect();
-            out.push_str("ɾɪŋ");
-            out
-        } else {
-            format!("{stem_ipa}ɪŋ")
-        }
-    }
-
-    /// Try stripping -s/-ed/-ing suffix, look up the base, and re-apply phonetically.
-    fn try_morphological(&self, word: &str) -> Option<String> {
-        // Try -s / -es / -ies
-        if word.len() >= 3 && word.ends_with('s') {
-            // -ies → base + y
-            if word.len() > 4 && word.ends_with("ies") {
-                if let Some(stem) = self.lookup(&format!("{}y", &word[..word.len() - 3])) {
-                    return Some(Self::apply_s(&stem));
-                }
-            }
-            // -es → base
-            if word.len() > 4 && word.ends_with("es") && !word.ends_with("ies") {
-                if let Some(stem) = self.lookup(&word[..word.len() - 2]) {
-                    return Some(Self::apply_s(&stem));
-                }
-            }
-            // -s → base
-            if !word.ends_with("ss") {
-                if let Some(stem) = self.lookup(&word[..word.len() - 1]) {
-                    return Some(Self::apply_s(&stem));
-                }
-            }
-        }
-        // Try -ed / -d
-        if word.len() >= 4 && word.ends_with('d') {
-            // -ed → base (not -eed)
-            if word.len() > 4 && word.ends_with("ed") && !word.ends_with("eed") {
-                if let Some(stem) = self.lookup(&word[..word.len() - 2]) {
-                    return Some(Self::apply_ed(&stem));
-                }
-                // -ed where base ends in e: "created" → "create"
-                if let Some(stem) = self.lookup(&format!("{}e", &word[..word.len() - 2])) {
-                    return Some(Self::apply_ed(&stem));
-                }
-            }
-            // -d → base (e.g., "discovered" → strip "d" → "discovere" fails,
-            // but "configured" → strip "d" → "configure" works)
-            if !word.ends_with("dd") {
-                if let Some(stem) = self.lookup(&word[..word.len() - 1]) {
-                    return Some(Self::apply_ed(&stem));
-                }
-            }
-        }
-        // Try -ing
-        if word.len() >= 5 && word.ends_with("ing") {
-            let base = &word[..word.len() - 3];
-            // -ing → base (e.g., "running" base = "runn" — won't match, need double-consonant)
-            if let Some(stem) = self.lookup(base) {
-                return Some(Self::apply_ing(&stem));
-            }
-            // -ing + e → base+e (e.g., "creating" → "creat" + "e" = "create")
-            if let Some(stem) = self.lookup(&format!("{base}e")) {
-                return Some(Self::apply_ing(&stem));
-            }
-            // Double consonant: "running" → "run"
-            if base.len() >= 2 {
-                let bytes = base.as_bytes();
-                if bytes[bytes.len() - 1] == bytes[bytes.len() - 2] {
-                    if let Some(stem) = self.lookup(&base[..base.len() - 1]) {
-                        return Some(Self::apply_ing(&stem));
-                    }
-                }
-            }
-        }
-        None
-    }
-
-    /// Convert a single word to IPA using the full fallback chain.
-    fn word_to_ipa(&self, word: &str) -> String {
-        // Compound words: split on hyphens and underscores, process each part
-        // independently. "short-and-natural" → "short" + "and" + "natural",
-        // "parent_event_id" → "parent" + "event" + "id".
-        // Each part gets full dict lookup. Joined with a space (brief TTS pause).
-        if word.contains('-') || word.contains('_') {
-            let parts: Vec<String> = word
-                .split(|c: char| c == '-' || c == '_')
-                .filter(|p| !p.is_empty())
-                .map(|p| self.word_to_ipa(p))
-                .collect();
-            return parts.join(" ");
-        }
-
-        // Normalize curly quotes to straight apostrophes.
-        let normalized = word.replace('\u{2019}', "'").replace('\u{2018}', "'");
-        let stripped: String = normalized
-            .chars()
-            .filter(|c| c.is_alphabetic() || *c == '\'')
-            .collect::<String>()
-            .to_lowercase();
-
-        // Tier 1+2: misaki + CMUdict direct lookup
-        if let Some(ipa) = self.lookup(&stripped) {
-            return ipa;
-        }
-
-        // Contractions: "don't" → "don" + "'t"
-        if let Some(apos_idx) = stripped.find('\'') {
-            let base = &stripped[..apos_idx];
-            let suffix = &stripped[apos_idx..];
-            if let Some(base_ipa) = self.lookup(base) {
-                let suffix_ipa = self.lookup(suffix).unwrap_or_else(|| match suffix {
-                    "'ve" => "v".to_string(),
-                    "'re" => "ɹ".to_string(),
-                    _ => String::new(),
-                });
-                if !suffix_ipa.is_empty() {
-                    return format!("{base_ipa}{suffix_ipa}");
-                }
-            }
-        }
-
-        // Tier 3: morphological suffix stripping
-        if let Some(ipa) = self.try_morphological(&stripped) {
-            return ipa;
-        }
-
-        // Tier 4: letter-by-letter spelling
-        stripped
-            .chars()
-            .filter(|c| c.is_alphabetic())
-            .map(letter_ipa)
-            .collect()
-    }
-
-    /// Convert a full text chunk to an IPA phoneme string.
-    fn text_to_ipa(&self, text: &str) -> String {
-        let mut out = String::new();
-        for token in text.split_whitespace() {
-            if !out.is_empty() {
-                out.push(' ');
-            }
-            let leading: String = token
-                .chars()
-                .take_while(|c| is_passthrough_punct(*c))
-                .collect();
-            let trailing: String = token
-                .chars()
-                .rev()
-                .take_while(|c| is_passthrough_punct(*c))
-                .collect::<String>()
-                .chars()
-                .rev()
-                .collect();
-            let word = &token[leading.len()..token.len() - trailing.len()];
-            out.push_str(&leading);
-            if !word.is_empty() {
-                out.push_str(&self.word_to_ipa(word));
-            }
-            out.push_str(&trailing);
-        }
-        out
-    }
-}
-
-// ── KokoroTTS ─────────────────────────────────────────────────────────────────
-
-pub struct KokoroTTS {
-    session: Session,
-    vocab: HashMap<char, i64>,
-    lexicon: Lexicon,
-    // Retained for potential future use (e.g., hot-reloading voices by path).
-    #[allow(dead_code)]
-    model_dir: PathBuf,
-}
-
-/// Load the Kokoro TTS engine from a model directory.
-///
-/// Expects:
-///   `<model_dir>/model.onnx`  (or model_quantized.onnx — tries both)
-///   `<model_dir>/us_gold.json` (G2P dictionary)
-///
-/// CoreML execution provider is registered with auto-fallback to CPU.
-/// The compiled CoreML model is cached in `<model_dir>/.coreml_cache/`.
-pub fn load_text_to_speech(model_dir: &str) -> Result<KokoroTTS, String> {
-    let model_dir_path = PathBuf::from(model_dir);
-
-    // Try quantized model first for speed, fall back to full-precision.
-    let model_path = ["model_quantized.onnx", "model_q8f16.onnx", "model.onnx"]
-        .iter()
-        .map(|name| model_dir_path.join(name))
-        .find(|p| p.exists())
-        .ok_or_else(|| format!("no model.onnx found in {model_dir}"))?;
-
-    // ── Session threading options ────────────────────────────────────────
-    //
-    // parallel_execution — runs independent graph operators concurrently.
-    //   Kokoro's graph is mostly sequential, so the benefit is modest, but
-    //   it's safe and free to enable.
-    // intra_threads(0) — lets ONNX Runtime use all available CPU cores for
-    //   parallelism within individual operators (e.g., large matmuls).
-    //
-    // NOTE: GraphOptimizationLevel::All is already the ort default — no
-    // need to set it explicitly. Accuracy-altering flags (approximate_gelu,
-    // flush_to_zero) are intentionally omitted — they need A/B audio
-    // validation before enabling for a TTS model. memory_pattern is omitted
-    // because Kokoro input lengths vary per sentence and the ORT docs warn
-    // against it for variable-size inputs.
-
-    // Try CoreML first (zero binary cost — macOS system framework).
-    // If the model has ops CoreML can't handle (common with quantized models),
-    // the EP registers fine but commit_from_file fails. Catch that and retry
-    // with CPU-only. This is the expected path for model_q8f16.onnx.
-    let session = {
-        let mut builder_with_coreml = Session::builder()
-            .map_err(|e| format!("session builder: {e}"))?
-            .with_parallel_execution(true)
-            .map_err(|e| format!("parallel execution: {e}"))?
-            .with_intra_threads(0)
-            .map_err(|e| format!("intra threads: {e}"))?
-            .with_execution_providers([ort::ep::CoreML::default()
-                .with_compute_units(ort::ep::coreml::ComputeUnits::All)
-                .with_model_format(ort::ep::coreml::ModelFormat::MLProgram)
-                .with_model_cache_dir(model_dir_path.join(".coreml_cache").to_string_lossy())
-                .build()])
-            .map_err(|e| format!("execution provider: {e}"))?;
-
-        match builder_with_coreml.commit_from_file(&model_path) {
-            Ok(s) => {
-                eprintln!("sprout-desktop: Kokoro loaded with CoreML acceleration");
-                s
-            }
-            Err(coreml_err) => {
-                eprintln!(
-                    "sprout-desktop: CoreML failed for {}, falling back to CPU: {coreml_err}",
-                    model_path.display()
-                );
-                // Retry without any execution providers — pure CPU.
-                Session::builder()
-                    .map_err(|e| format!("session builder (CPU fallback): {e}"))?
-                    .with_parallel_execution(true)
-                    .map_err(|e| format!("parallel execution (CPU): {e}"))?
-                    .with_intra_threads(0)
-                    .map_err(|e| format!("intra threads (CPU): {e}"))?
-                    .commit_from_file(&model_path)
-                    .map_err(|e| format!("load model {} (CPU): {e}", model_path.display()))?
-            }
-        }
-    };
-
-    let gold_path = model_dir_path.join("us_gold.json");
-    let silver_path = model_dir_path.join("us_silver.json");
-    let cmudict_path = model_dir_path.join("cmudict.dict");
-    let lexicon = Lexicon::load(&gold_path, &silver_path, &cmudict_path)?;
-
-    Ok(KokoroTTS {
-        session,
-        vocab: build_vocab(),
-        lexicon,
-        model_dir: model_dir_path,
-    })
-}
-
-impl KokoroTTS {
-    /// Synthesize a single pre-split text chunk. Caller is responsible for sentence splitting.
-    /// This avoids double-splitting when the TTS pipeline has already split the text.
-    ///
-    /// - `_lang` is accepted for API compatibility but currently unused
-    ///   (Kokoro v1.0 language is selected by voice name prefix, e.g. `af_*`).
-    /// - `_steps` is accepted for API compatibility but currently unused
-    ///   (Kokoro is not diffusion-based).
-    pub fn synth_chunk(
-        &mut self,
-        text: &str,
-        _lang: &str,
-        style: &VoiceStyle,
-        _steps: usize,
-        speed: f32,
-    ) -> Result<Vec<f32>, String> {
-        // G2P: text → IPA phoneme string
-        let ipa = self.lexicon.text_to_ipa(text);
-
-        // Tokenize: IPA → int64 ids with BOS/EOS pad tokens
-        let token_ids = tokenize(&ipa, &self.vocab);
-
-        // Style vector is indexed by phoneme count (excluding the 2 pad tokens).
-        // Shape expected by model: [1, 256] (kokoro-js uses [1, 256], not [1, 1, 256]).
-        let phoneme_count = token_ids.len() - 2;
-        let style_slice = style.get(phoneme_count);
-
-        // Build ONNX input tensors.
-        let seq_len = token_ids.len();
-        let input_ids_arr = Array2::from_shape_vec((1, seq_len), token_ids)
-            .map_err(|e| format!("input_ids shape: {e}"))?;
-        let input_ids_val =
-            Value::from_array(input_ids_arr).map_err(|e| format!("input_ids Value: {e}"))?;
-
-        // Style: [1, 256]. Research notes [1, 1, 256] but kokoro-js uses [1, 256].
-        let style_arr = Array2::from_shape_vec((1, 256), style_slice.to_vec())
-            .map_err(|e| format!("style shape: {e}"))?;
-        let style_val = Value::from_array(style_arr).map_err(|e| format!("style Value: {e}"))?;
-
-        let speed_arr = Array1::from_vec(vec![speed]);
-        let speed_val = Value::from_array(speed_arr).map_err(|e| format!("speed Value: {e}"))?;
-
-        // Run inference. Output[0] = waveform float32[1, N_samples].
-        let outputs = self
-            .session
-            .run(ort::inputs! {
-                "input_ids" => &input_ids_val,
-                "style"     => &style_val,
-                "speed"     => &speed_val,
-            })
-            .map_err(|e| format!("onnx run: {e}"))?;
-
-        let (_, waveform) = outputs[0]
-            .try_extract_tensor::<f32>()
-            .map_err(|e| format!("extract waveform: {e}"))?;
-
-        Ok(waveform.to_vec())
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    // ── Tokenizer ─────────────────────────────────────────────────────────
-
-    #[test]
-    fn tokenize_empty_produces_bos_eos() {
-        let vocab = build_vocab();
-        let ids = tokenize("", &vocab);
-        assert_eq!(ids, vec![0, 0]); // BOS + EOS only
-    }
-
-    #[test]
-    fn tokenize_known_chars() {
-        let vocab = build_vocab();
-        let ids = tokenize("a", &vocab);
-        // 'a' maps to 43 in the vocab
-        assert_eq!(ids, vec![0, 43, 0]);
-    }
-
-    #[test]
-    fn tokenize_unknown_chars_dropped() {
-        let vocab = build_vocab();
-        let ids = tokenize("🎉", &vocab);
-        // Emoji not in vocab — should be dropped, leaving only BOS+EOS
-        assert_eq!(ids, vec![0, 0]);
-    }
-
-    #[test]
-    fn tokenize_respects_max_length() {
-        let vocab = build_vocab();
-        let long_input: String = "a".repeat(600); // exceeds MAX_PHONEME_TOKENS (510)
-        let ids = tokenize(&long_input, &vocab);
-        // Should be clamped: BOS + 510 tokens + EOS = 512
-        assert_eq!(ids.len(), 512);
-        assert_eq!(ids[0], 0); // BOS
-        assert_eq!(*ids.last().unwrap(), 0); // EOS
-    }
-
-    // ── ARPAbet conversion ────────────────────────────────────────────────
-
-    #[test]
-    fn arpabet_simple_word() {
-        // "HH AH0 L OW1" = hello
-        let ipa = arpabet_line_to_ipa("HH AH0 L OW1");
-        assert_eq!(ipa, "həlˈO");
-    }
-
-    #[test]
-    fn arpabet_stress_markers() {
-        // Primary stress before vowel, secondary stress before vowel
-        let ipa = arpabet_line_to_ipa("K R IY0 EY1 T");
-        // IY0 = unstressed 'i', EY1 = primary 'A'
-        assert!(ipa.contains('ˈ'), "should contain primary stress: {ipa}");
-    }
-
-    #[test]
-    fn arpabet_schwa() {
-        // AH0 should produce schwa (ə), not ʌ
-        let ipa = arpabet_line_to_ipa("AH0");
-        assert_eq!(ipa, "ə");
-    }
-
-    #[test]
-    fn arpabet_unstressed_er() {
-        // ER0 should produce əɹ
-        let ipa = arpabet_line_to_ipa("ER0");
-        assert_eq!(ipa, "əɹ");
-    }
-
-    // ── Letter IPA ────────────────────────────────────────────────────────
-
-    #[test]
-    fn letter_ipa_covers_alphabet() {
-        for c in 'a'..='z' {
-            let ipa = letter_ipa(c);
-            assert!(!ipa.is_empty(), "letter_ipa('{c}') returned empty");
-        }
-    }
-
-    #[test]
-    fn letter_ipa_non_alpha_empty() {
-        assert_eq!(letter_ipa('1'), "");
-        assert_eq!(letter_ipa('!'), "");
-    }
-
-    // ── Punctuation passthrough ───────────────────────────────────────────
-
-    #[test]
-    fn passthrough_punct_includes_expected() {
-        assert!(is_passthrough_punct('.'));
-        assert!(is_passthrough_punct('!'));
-        assert!(is_passthrough_punct('?'));
-        assert!(is_passthrough_punct(' '));
-        assert!(is_passthrough_punct(','));
-    }
-
-    #[test]
-    fn passthrough_punct_excludes_alpha() {
-        assert!(!is_passthrough_punct('a'));
-        assert!(!is_passthrough_punct('Z'));
-    }
-
-    // ── VoiceStyle ────────────────────────────────────────────────────────
-
-    #[test]
-    fn voice_style_get_clamps_to_last_row() {
-        // 2 rows of 256 floats
-        let data: Vec<f32> = (0..512).map(|i| i as f32).collect();
-        let style = VoiceStyle { data };
-        // Row 0
-        assert_eq!(style.get(0)[0], 0.0);
-        // Row 1
-        assert_eq!(style.get(1)[0], 256.0);
-        // Row 999 should clamp to row 1 (last available)
-        assert_eq!(style.get(999)[0], 256.0);
-    }
-
-    #[test]
-    fn load_voice_style_rejects_too_small() {
-        use std::io::Write;
-        let dir = std::env::temp_dir().join("kokoro_test_small");
-        let _ = std::fs::create_dir_all(&dir);
-        let path = dir.join("tiny.bin");
-        let mut f = std::fs::File::create(&path).unwrap();
-        // Write only 100 floats (need at least 256)
-        for i in 0..100u32 {
-            f.write_all(&(i as f32).to_le_bytes()).unwrap();
-        }
-        drop(f);
-        let result = load_voice_style(&path);
-        assert!(result.is_err(), "should reject file with < 256 floats");
-        assert!(result.unwrap_err().contains("too small"));
-        let _ = std::fs::remove_dir_all(&dir);
-    }
-
-    #[test]
-    fn load_voice_style_rejects_non_multiple_of_256() {
-        use std::io::Write;
-        let dir = std::env::temp_dir().join("kokoro_test_nonaligned");
-        let _ = std::fs::create_dir_all(&dir);
-        let path = dir.join("nonaligned.bin");
-        let mut f = std::fs::File::create(&path).unwrap();
-        // Write 257 floats — not a multiple of 256 (remainder = 1)
-        for i in 0..257u32 {
-            f.write_all(&(i as f32).to_le_bytes()).unwrap();
-        }
-        drop(f);
-        let result = load_voice_style(&path);
-        assert!(
-            result.is_err(),
-            "should reject file with non-multiple-of-256 floats"
-        );
-        let err = result.unwrap_err();
-        assert!(
-            err.contains("257"),
-            "error should mention float count: {err}"
-        );
-        assert!(
-            err.contains("remainder"),
-            "error should mention remainder: {err}"
-        );
-        let _ = std::fs::remove_dir_all(&dir);
-    }
-
-    // ── Suffix rules ──────────────────────────────────────────────────────
-
-    #[test]
-    fn apply_s_voiceless() {
-        // After voiceless consonants: +s
-        assert!(Lexicon::apply_s("kæt").ends_with('s'));
-    }
-
-    #[test]
-    fn apply_s_sibilant() {
-        // After sibilants: +ᵻz
-        assert!(Lexicon::apply_s("bʌz").ends_with("ᵻz"));
-    }
-
-    #[test]
-    fn apply_s_voiced() {
-        // After voiced consonants: +z
-        assert!(Lexicon::apply_s("dɔɡ").ends_with('z'));
-    }
-
-    #[test]
-    fn apply_ed_voiceless() {
-        // After voiceless: +t
-        assert!(Lexicon::apply_ed("wɔk").ends_with('t'));
-    }
-
-    #[test]
-    fn apply_ed_d_ending() {
-        // After d: +ᵻd
-        assert!(Lexicon::apply_ed("æd").ends_with("ᵻd"));
-    }
-
-    #[test]
-    fn apply_ing_basic() {
-        assert!(Lexicon::apply_ing("rʌn").ends_with("ɪŋ"));
-    }
-
-    #[test]
-    fn hyphenated_word_splits_into_parts() {
-        let lex = Lexicon {
-            misaki: HashMap::new(),
-            cmudict: HashMap::new(),
-        };
-        let result = lex.word_to_ipa("short-and-natural");
-        let space_count = result.matches(' ').count();
-        assert_eq!(
-            space_count, 2,
-            "expected 2 spaces for 3 hyphenated parts, got {space_count}: {result}"
-        );
-    }
-
-    #[test]
-    fn underscored_word_splits_into_parts() {
-        let lex = Lexicon {
-            misaki: HashMap::new(),
-            cmudict: HashMap::new(),
-        };
-        let result = lex.word_to_ipa("parent_event_id");
-        let space_count = result.matches(' ').count();
-        assert_eq!(
-            space_count, 2,
-            "expected 2 spaces for 3 underscored parts, got {space_count}: {result}"
-        );
-    }
-
-    #[test]
-    fn compound_word_with_dict_lookup() {
-        let mut dict = HashMap::new();
-        dict.insert("parent".to_string(), "pɛɹənt".to_string());
-        dict.insert("event".to_string(), "ɪvɛnt".to_string());
-        dict.insert("id".to_string(), "aɪdiː".to_string());
-        let lex = Lexicon {
-            misaki: dict,
-            cmudict: HashMap::new(),
-        };
-        // Underscore compound
-        let result = lex.word_to_ipa("parent_event_id");
-        assert!(result.contains("pɛɹənt"), "parent not resolved: {result}");
-        assert!(result.contains("ɪvɛnt"), "event not resolved: {result}");
-        assert!(result.contains("aɪdiː"), "id not resolved: {result}");
-
-        // Hyphen compound
-        let result = lex.word_to_ipa("short-and-sweet");
-        assert_eq!(result.matches(' ').count(), 2, "hyphen split: {result}");
-    }
-}
diff --git a/desktop/src-tauri/src/huddle/mod.rs b/desktop/src-tauri/src/huddle/mod.rs
index d9bede659..c1136cada 100644
--- a/desktop/src-tauri/src/huddle/mod.rs
+++ b/desktop/src-tauri/src/huddle/mod.rs
@@ -25,9 +25,9 @@
 
 pub mod agents;
 pub mod audio_output;
-pub mod kokoro;
 pub mod models;
 pub mod pipeline;
+pub mod pocket;
 pub mod preprocessing;
 pub mod relay_api;
 pub mod state;
@@ -661,12 +661,12 @@ pub async fn check_pipeline_hotstart(state: State<'_, AppState>) -> Result<(), S
     let stt_ready = models::global_model_manager()
         .map(|m| m.take_stt_ready())
         .unwrap_or(false);
-    let kokoro_ready = models::global_model_manager()
-        .map(|m| m.take_kokoro_ready())
+    let tts_ready = models::global_model_manager()
+        .map(|m| m.take_tts_ready())
         .unwrap_or(false);
 
     // Start TTS first (so STT can capture tts_cancel).
-    if !has_tts && (kokoro_ready || models::is_kokoro_ready()) {
+    if !has_tts && (tts_ready || models::is_tts_ready()) {
         if let Err(e) = maybe_start_tts_pipeline(&state).await {
             eprintln!("sprout-desktop: TTS hotstart failed: {e}");
         }
@@ -751,7 +751,7 @@ pub async fn start_stt_pipeline(state: State<'_, AppState>) -> Result<(), String
     }
 }
 
-/// Trigger a background download of voice models (Parakeet STT + Kokoro TTS).
+/// Trigger a background download of voice models (Parakeet STT + Pocket TTS).
 ///
 /// Returns immediately — downloads run in tokio background tasks.
 /// Poll `get_model_status` to track progress.
@@ -761,7 +761,7 @@ pub async fn download_voice_models(state: State<'_, AppState>) -> Result<(), Str
     let manager = models::global_model_manager()
         .ok_or("model manager unavailable (home directory could not be resolved)")?;
     manager.start_stt_download(state.http_client.clone());
-    manager.start_kokoro_download(state.http_client.clone());
+    manager.start_tts_download(state.http_client.clone());
     Ok(())
 }
 
@@ -772,14 +772,14 @@ pub fn get_model_status(_state: State<'_, AppState>) -> Result<models::VoiceMode
         .ok_or("model manager unavailable (home directory could not be resolved)")?;
     Ok(models::VoiceModelStatus {
         stt: manager.stt_status(),
-        kokoro: manager.kokoro_status(),
+        tts: manager.tts_status(),
     })
 }
 
 /// Enable or disable TTS output.
 ///
 /// When disabled, the TTS pipeline is shut down and audio output stops.
-/// When re-enabled, the pipeline is restarted if Kokoro models are available.
+/// When re-enabled, the pipeline is restarted if TTS models are available.
 ///
 /// Takes the pipeline handle out of the lock before calling shutdown() — the
 /// thread join in Drop can block for ~200 ms (ONNX inference) and we don't
diff --git a/desktop/src-tauri/src/huddle/models.rs b/desktop/src-tauri/src/huddle/models.rs
index f4f255ec0..8e562762c 100644
--- a/desktop/src-tauri/src/huddle/models.rs
+++ b/desktop/src-tauri/src/huddle/models.rs
@@ -1,10 +1,10 @@
-//! Model download manager for STT (Parakeet TDT-CTC 110M) and TTS (Kokoro) models.
+//! Model download manager for STT (Parakeet TDT-CTC 110M) and TTS (Pocket TTS) models.
 //!
 //! Mental model:
 //!   app launch → start_stt_download (background) → ~/.sprout/models/parakeet-tdt-ctc-110m-en/
-//!   app launch → start_kokoro_download (background) → ~/.sprout/models/kokoro/
+//!   app launch → start_tts_download (background) → ~/.sprout/models/pocket-tts/
 //!   STT pipeline → is_stt_ready() → stt_model_dir() → run inference
-//!   TTS pipeline → is_kokoro_ready() → kokoro_model_dir() → run synthesis
+//!   TTS pipeline → is_tts_ready() → tts_model_dir() → run synthesis
 //!
 //! Models are downloaded once and cached. A version manifest (`.sprout-model-manifest`)
 //! is written alongside model files — if the on-disk version doesn't match the
@@ -38,18 +38,31 @@ use sha2::{Digest, Sha256};
 /// Computed from a known-good download. Update when upgrading model versions.
 const STT_ARCHIVE_SHA256: &str = "17f945007b52ccd8b7200ffc7c5652e9e8e961dfdf479cefcabd06cf5703630b";
 
-/// SHA-256 hashes for individual Kokoro model files.
-/// Computed from known-good downloads. Update when upgrading model versions.
+/// HuggingFace base URL for the sherpa-onnx Pocket TTS int8 repackage.
 ///
-/// model.onnx (model_q8f16.onnx, 86 MB):
-///   curl -sL "https://huggingface.co/onnx-community/Kokoro-82M-v1.0-ONNX/resolve/main/onnx/model_q8f16.onnx" | shasum -a 256
+/// Pinned to commit e715955cf50d18d919d37231513c0e914b83661a
+/// (2026-02-10) for reproducible downloads.
+const POCKET_HF_BASE: &str =
+    "https://huggingface.co/csukuangfj2/sherpa-onnx-pocket-tts-int8-2026-01-26/resolve/e715955cf50d18d919d37231513c0e914b83661a";
+
+/// Reference voice WAV from KevinAHM's Pocket TTS ONNX export, pinned to
+/// commit 58a6d00cf13d239b6748cb0769f35c580a8f606c.
+const POCKET_REFERENCE_WAV_URL: &str =
+    "https://huggingface.co/KevinAHM/pocket-tts-onnx/resolve/58a6d00cf13d239b6748cb0769f35c580a8f606c/reference_sample.wav";
+
+/// SHA-256 hashes for individual Pocket TTS model files.
+/// Computed from known-good pinned downloads. Update when upgrading model versions.
 #[rustfmt::skip]
-const KOKORO_FILE_HASHES: &[(&str, &str)] = &[
-    ("model.onnx",    "04c658aec1b6008857c2ad10f8c589d4180d0ec427e7e6118ceb487e215c3cd0"),
-    ("af_heart.bin",  "d583ccff3cdca2f7fae535cb998ac07e9fcb90f09737b9a41fa2734ec44a8f0b"),
-    ("us_gold.json",   "dc414872a49a28ae6c141463d502fd945f3b2fde040484fdc47d00cc4612686f"),
-    ("us_silver.json", "de8f67be911bb6c659187b4a65fd966b6a30e56350e0f790d763210b053ac475"),
-    ("cmudict.dict",   "81917843c7f44ce2b094ac63873c2c7a4cf802040792c455ba3ca406891c3d22"),
+const TTS_FILE_HASHES: &[(&str, &str)] = &[
+    ("decoder.int8.onnx",     "12b0857402d31aead94df19d6783b4350d1f740e811f3a3202c70ad89ae11eea"),
+    ("encoder.onnx",          "e8f2f6d301ffb96e398b138a7dc6d3038622d236044636b73d920bab85890260"),
+    ("lm_flow.int8.onnx",     "8d627d235c44a597da908e1085ebe241cbbe358964c502c5a5063d18851a5529"),
+    ("lm_main.int8.onnx",     "bfc0c7e7e3d72864fa3bb2ee499f62f21ddc1474b885f5f3ca570f8be73e787e"),
+    ("text_conditioner.onnx", "0b84e837d7bfaf2c896627b03e3f080320309f37f4fc7df7698c644f7ba5e6b1"),
+    ("vocab.json",            "6fb646346cf931016f70c4921aab0900ce7a304b893cb02135c74e294abfea01"),
+    ("token_scores.json",     "5be2f278caf9b9800741f0fd82bff677f4943ec764c356f907213434b622d958"),
+    ("LICENSE",               "fe7b4ce83b8381cc5b216bbb4af73c570688d1b819c73bbaed8ca401f4677cd6"),
+    ("reference_sample.wav",  "88fbb0d31ec26674e97e531a71758cabe4e0e4e5b5a18dafa783021a7f5c9366"),
 ];
 
 // ── Model versioning ──────────────────────────────────────────────────────────
@@ -65,8 +78,8 @@ const KOKORO_FILE_HASHES: &[(&str, &str)] = &[
 /// honest (each version tag identifies one specific set of model bytes).
 const STT_MODEL_VERSION: &str = "2";
 
-/// Model manifest version for Kokoro. Increment when upgrading model files.
-const KOKORO_MODEL_VERSION: &str = "1";
+/// Model manifest version for Pocket TTS. Increment when upgrading model files.
+const TTS_MODEL_VERSION: &str = "1";
 
 /// Filename for the version manifest written alongside model files.
 const MANIFEST_FILENAME: &str = ".sprout-model-manifest";
@@ -76,8 +89,8 @@ const MANIFEST_FILENAME: &str = ".sprout-model-manifest";
 /// Maximum expected STT archive size (200 MB — actual is ~100 MB).
 const MAX_STT_DOWNLOAD_BYTES: u64 = 200 * 1024 * 1024;
 
-/// Maximum expected Kokoro file size (200 MB per file — model is 86 MB).
-const MAX_KOKORO_FILE_BYTES: u64 = 200 * 1024 * 1024;
+/// Maximum expected Pocket TTS file size (200 MB per file — largest is ~73 MB).
+const MAX_TTS_FILE_BYTES: u64 = 200 * 1024 * 1024;
 
 /// NVIDIA Parakeet TDT-CTC 110M (English, int8) — packaged for sherpa-onnx by
 /// k2-fsa. Single ONNX file (CTC head) + tokens.txt. Avg WER ~7.5% across
@@ -125,35 +138,49 @@ Provided \"AS IS\", without warranty of any kind, express or implied. See the
 license text for full warranty disclaimer.
 ";
 
-// ── Kokoro TTS model ─────────────────────────────────────────────────────────
+// ── Pocket TTS model ──────────────────────────────────────────────────────────
 
-/// HuggingFace base URL for Kokoro ONNX model files.
-const KOKORO_HF_BASE: &str =
-    "https://huggingface.co/onnx-community/Kokoro-82M-v1.0-ONNX/resolve/main";
+/// Final directory name under `~/.sprout/models/`.
+const TTS_MODEL_DIR_NAME: &str = "pocket-tts";
 
-/// Misaki G2P lexicons — pinned to commit fba1236 for reproducibility.
-/// Gold = curated pronunciations. Silver = broader coverage (93K words).
-/// Both are needed: gold is checked first, silver catches common words gold misses.
-const KOKORO_LEXICON_GOLD_URL: &str =
-    "https://raw.githubusercontent.com/hexgrad/misaki/fba1236/misaki/data/us_gold.json";
-const KOKORO_LEXICON_SILVER_URL: &str =
-    "https://raw.githubusercontent.com/hexgrad/misaki/fba1236/misaki/data/us_silver.json";
+/// Attribution sidecar written next to the Pocket TTS model files.
+const TTS_LICENSE_FILE_NAME: &str = "MODEL_LICENSE.txt";
 
-/// CMU Pronouncing Dictionary — 135K entries including inflected forms.
-/// BSD 2-Clause license (Carnegie Mellon University). Compatible with Apache-2.0.
-const KOKORO_CMUDICT_URL: &str =
-    "https://raw.githubusercontent.com/cmusphinx/cmudict/master/cmudict.dict";
+/// CC-BY-4.0 §3(a)(1) attribution block for Pocket TTS and its ONNX packaging.
+const TTS_LICENSE_TEXT: &str = "\
+Pocket TTS
+© Kyutai.
 
-/// Final directory name under `~/.sprout/models/`.
-const KOKORO_MODEL_DIR_NAME: &str = "kokoro";
-
-/// All files that must be present for Kokoro to be considered ready.
-const KOKORO_EXPECTED_FILES: &[&str] = &[
-    "model.onnx",
-    "af_heart.bin",
-    "us_gold.json",
-    "us_silver.json",
-    "cmudict.dict",
+Licensed under the Creative Commons Attribution 4.0 International License
+(CC-BY-4.0). License text: https://creativecommons.org/licenses/by/4.0/
+
+Original model by Kyutai: https://huggingface.co/kyutai/pocket-tts
+Paper: Charles, Roebel, et al., Pocket TTS (arXiv:2509.06926).
+Mimi neural codec by Kyutai is bundled as part of the model.
+
+ONNX export by KevinAHM: https://huggingface.co/KevinAHM/pocket-tts-onnx
+Sherpa-onnx repackage by csukuangfj / k2-fsa:
+https://huggingface.co/csukuangfj2/sherpa-onnx-pocket-tts-int8-2026-01-26
+
+Sprout ships the ONNX/model artifacts and reference_sample.wav unmodified,
+renamed only by placement in the local model directory.
+
+Provided \"AS IS\", without warranty of any kind, express or implied. See the
+license text for full warranty disclaimer.
+";
+
+/// All files that must be present for Pocket TTS to be considered ready.
+const TTS_EXPECTED_FILES: &[&str] = &[
+    "decoder.int8.onnx",
+    "encoder.onnx",
+    "lm_flow.int8.onnx",
+    "lm_main.int8.onnx",
+    "text_conditioner.onnx",
+    "vocab.json",
+    "token_scores.json",
+    "LICENSE",
+    "reference_sample.wav",
+    TTS_LICENSE_FILE_NAME,
 ];
 
 // ── Status types ──────────────────────────────────────────────────────────────
@@ -176,7 +203,7 @@ pub enum ModelStatus {
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct VoiceModelStatus {
     pub stt: ModelStatus,
-    pub kokoro: ModelStatus,
+    pub tts: ModelStatus,
 }
 
 // ── Safe archive extraction ───────────────────────────────────────────────────
@@ -339,7 +366,7 @@ where
 
 // ── ModelSlot ─────────────────────────────────────────────────────────────────
 
-/// Per-model state + config. `ModelManager` owns two of these (stt, kokoro).
+/// Per-model state + config. `ModelManager` owns two of these (stt, tts).
 #[derive(Clone)]
 struct ModelSlot {
     dir_name: &'static str,                  // subdir under ~/.sprout/models/
@@ -493,7 +520,7 @@ pub struct ModelManager {
     /// `~/.sprout/models/`
     models_dir: PathBuf,
     stt: ModelSlot,
-    kokoro: ModelSlot,
+    tts: ModelSlot,
 }
 
 impl ModelManager {
@@ -505,11 +532,7 @@ impl ModelManager {
         Some(Self {
             models_dir,
             stt: ModelSlot::new(STT_MODEL_DIR_NAME, STT_EXPECTED_FILES, STT_MODEL_VERSION),
-            kokoro: ModelSlot::new(
-                KOKORO_MODEL_DIR_NAME,
-                KOKORO_EXPECTED_FILES,
-                KOKORO_MODEL_VERSION,
-            ),
+            tts: ModelSlot::new(TTS_MODEL_DIR_NAME, TTS_EXPECTED_FILES, TTS_MODEL_VERSION),
         })
     }
 
@@ -532,23 +555,23 @@ impl ModelManager {
         self.stt.take_ready()
     }
 
-    // ── Kokoro accessors ──────────────────────────────────────────────────────
+    // ── TTS accessors ─────────────────────────────────────────────────────────
 
-    /// Path to the Kokoro model directory, or `None` if not ready.
-    pub fn kokoro_model_dir(&self) -> Option<PathBuf> {
-        self.kokoro.dir_if_ready(&self.models_dir)
+    /// Path to the TTS model directory, or `None` if not ready.
+    pub fn tts_model_dir(&self) -> Option<PathBuf> {
+        self.tts.dir_if_ready(&self.models_dir)
     }
-    /// `true` if all Kokoro files are present and the manifest version matches.
-    pub fn is_kokoro_ready(&self) -> bool {
-        self.kokoro.is_ready(&self.models_dir)
+    /// `true` if all TTS files are present and the manifest version matches.
+    pub fn is_tts_ready(&self) -> bool {
+        self.tts.is_ready(&self.models_dir)
     }
-    /// Current Kokoro download status.
-    pub fn kokoro_status(&self) -> ModelStatus {
-        self.kokoro.status()
+    /// Current TTS download status.
+    pub fn tts_status(&self) -> ModelStatus {
+        self.tts.status()
     }
-    /// Returns `true` once when Kokoro just became ready. Resets the flag.
-    pub fn take_kokoro_ready(&self) -> bool {
-        self.kokoro.take_ready()
+    /// Returns `true` once when TTS just became ready. Resets the flag.
+    pub fn take_tts_ready(&self) -> bool {
+        self.tts.take_ready()
     }
 
     // ── Download triggers ─────────────────────────────────────────────────────
@@ -583,14 +606,14 @@ impl ModelManager {
         }
     }
 
-    /// Start a background Kokoro download (~87 MB). No-op if already ready or downloading.
-    pub fn start_kokoro_download(&self, http_client: reqwest::Client) {
+    /// Start a background Pocket TTS download (~189 MB). No-op if already ready or downloading.
+    pub fn start_tts_download(&self, http_client: reqwest::Client) {
         let manager = self.clone();
-        self.kokoro.start_download(
+        self.tts.start_download(
             &self.models_dir,
             http_client,
-            "kokoro",
-            move |client| async move { manager.download_kokoro_model(client).await },
+            "tts",
+            move |client| async move { manager.download_tts_model(client).await },
         );
     }
 
@@ -697,40 +720,42 @@ impl ModelManager {
         Ok(())
     }
 
-    /// Download and verify the Kokoro TTS model files from HuggingFace and GitHub.
+    /// Download and verify the Pocket TTS model files from HuggingFace.
     ///
-    /// Downloads files into `~/.sprout/models/kokoro/`:
-    ///   - `model.onnx`   — Kokoro-82M mixed-precision ONNX (86 MB)
-    ///   - `af_heart.bin` — best-quality American English voice embedding (510 KB)
-    ///   - `us_gold.json` — Misaki G2P lexicon, pinned to commit fba1236 (3 MB)
+    /// Downloads files into `~/.sprout/models/pocket-tts/`:
+    ///   - five ONNX model files (Pocket TTS + Mimi codec)
+    ///   - `vocab.json` / `token_scores.json` for sherpa-onnx text conditioning
+    ///   - upstream `LICENSE` plus Sprout's `MODEL_LICENSE.txt` attribution sidecar
+    ///   - `reference_sample.wav` as the bundled default voice
     ///
     /// Files are written to a temp directory first, then moved atomically.
-    async fn download_kokoro_model(&self, http_client: reqwest::Client) -> Result<(), String> {
+    async fn download_tts_model(&self, http_client: reqwest::Client) -> Result<(), String> {
         tokio::fs::create_dir_all(&self.models_dir)
             .await
             .map_err(|e| format!("create models dir: {e}"))?;
 
-        let temp_dir = self.models_dir.join("kokoro.tmp");
+        let temp_dir = self.models_dir.join("pocket-tts.tmp");
         fresh_temp_dir(&temp_dir).await?;
 
-        // (url, local_filename)
-        let downloads: &[(&str, &str)] = &[
-            (
-                &format!("{KOKORO_HF_BASE}/onnx/model_q8f16.onnx"),
-                "model.onnx",
-            ),
-            (
-                &format!("{KOKORO_HF_BASE}/voices/af_heart.bin"),
-                "af_heart.bin",
-            ),
-            (KOKORO_LEXICON_GOLD_URL, "us_gold.json"),
-            (KOKORO_LEXICON_SILVER_URL, "us_silver.json"),
-            (KOKORO_CMUDICT_URL, "cmudict.dict"),
+        let model_files = [
+            "decoder.int8.onnx",
+            "encoder.onnx",
+            "lm_flow.int8.onnx",
+            "lm_main.int8.onnx",
+            "text_conditioner.onnx",
+            "vocab.json",
+            "token_scores.json",
+            "LICENSE",
         ];
+        let mut downloads: Vec<(String, &'static str)> = model_files
+            .iter()
+            .map(|filename| (format!("{POCKET_HF_BASE}/{filename}"), *filename))
+            .collect();
+        downloads.push((POCKET_REFERENCE_WAV_URL.to_string(), "reference_sample.wav"));
         let total_files = downloads.len() as u32;
 
         for (i, (url, filename)) in downloads.iter().enumerate() {
-            eprintln!("sprout-desktop: downloading Kokoro {filename} from {url}");
+            eprintln!("sprout-desktop: downloading Pocket TTS {filename} from {url}");
 
             let response = fetch_url(&http_client, url, filename).await.map_err(|e| {
                 let _ = std::fs::remove_dir_all(&temp_dir);
@@ -738,12 +763,12 @@ impl ModelManager {
             })?;
 
             let dest = temp_dir.join(filename);
-            let slot = self.kokoro.clone();
+            let slot = self.tts.clone();
             let file_index = i as u32;
             let bytes = download_file(
                 response,
                 &dest,
-                MAX_KOKORO_FILE_BYTES,
+                MAX_TTS_FILE_BYTES,
                 filename,
                 |downloaded, content_length| {
                     if let Some(total) = content_length {
@@ -766,30 +791,36 @@ impl ModelManager {
             })?;
             eprintln!("sprout-desktop: downloaded {bytes} bytes ({filename}), wrote to disk");
 
-            // Verify file integrity against pinned hash.
-            if let Some(&(_, expected)) = KOKORO_FILE_HASHES.iter().find(|(n, _)| *n == *filename) {
-                let actual = sha256_file(&dest).await?;
-                if actual != expected {
-                    let _ = tokio::fs::remove_dir_all(&temp_dir).await;
-                    return Err(format!(
-                        "Kokoro {filename} integrity check failed: expected {expected}, got {actual}"
-                    ));
-                }
+            let expected = TTS_FILE_HASHES
+                .iter()
+                .find(|(n, _)| *n == *filename)
+                .map(|(_, hash)| *hash)
+                .ok_or_else(|| format!("missing expected hash for Pocket TTS file: {filename}"))?;
+            let actual = sha256_file(&dest).await?;
+            if actual != expected {
+                let _ = tokio::fs::remove_dir_all(&temp_dir).await;
+                return Err(format!(
+                    "Pocket TTS {filename} integrity check failed: expected {expected}, got {actual}"
+                ));
             }
 
             // Ensure progress reflects file completion even without content-length.
             let pct = (((i as u32 + 1) * 89) / total_files).min(89) as u8;
-            self.kokoro.set_status(ModelStatus::Downloading {
+            self.tts.set_status(ModelStatus::Downloading {
                 progress_percent: pct,
             });
         }
 
-        self.kokoro.set_status(ModelStatus::Downloading {
+        tokio::fs::write(temp_dir.join(TTS_LICENSE_FILE_NAME), TTS_LICENSE_TEXT)
+            .await
+            .map_err(|e| format!("write TTS model license sidecar: {e}"))?;
+
+        self.tts.set_status(ModelStatus::Downloading {
             progress_percent: 90,
         });
 
         if let Err(e) = self
-            .kokoro
+            .tts
             .verify_and_install(&self.models_dir, &temp_dir, None)
             .await
         {
@@ -798,8 +829,8 @@ impl ModelManager {
         }
 
         eprintln!(
-            "sprout-desktop: Kokoro model ready at {}",
-            self.kokoro.model_dir(&self.models_dir).display()
+            "sprout-desktop: Pocket TTS model ready at {}",
+            self.tts.model_dir(&self.models_dir).display()
         );
         Ok(())
     }
@@ -856,14 +887,37 @@ async fn cleanup_legacy_moonshine_dir(models_dir: &Path) {
     }
 }
 
-/// Path to the Kokoro model directory, or `None` if not ready.
-pub fn kokoro_model_dir() -> Option<PathBuf> {
-    global_model_manager()?.kokoro_model_dir()
+/// Path to the TTS model directory, or `None` if not ready.
+pub fn tts_model_dir() -> Option<PathBuf> {
+    global_model_manager()?.tts_model_dir()
 }
 
-/// `true` if all expected Kokoro model files are present on disk.
-pub fn is_kokoro_ready() -> bool {
+/// `true` if all expected TTS model files are present and the manifest version matches.
+pub fn is_tts_ready() -> bool {
     global_model_manager()
-        .map(|m| m.is_kokoro_ready())
+        .map(|m| m.is_tts_ready())
         .unwrap_or(false)
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn tts_readiness_requires_license_sidecar() {
+        let temp = tempfile::tempdir().expect("tempdir");
+        let slot = ModelSlot::new(TTS_MODEL_DIR_NAME, TTS_EXPECTED_FILES, TTS_MODEL_VERSION);
+        let model_dir = temp.path().join(TTS_MODEL_DIR_NAME);
+        std::fs::create_dir_all(&model_dir).expect("create model dir");
+
+        for file in TTS_EXPECTED_FILES {
+            std::fs::write(model_dir.join(file), b"test").expect("write expected file");
+        }
+        std::fs::write(model_dir.join(MANIFEST_FILENAME), TTS_MODEL_VERSION).expect("manifest");
+
+        assert!(slot.is_ready(temp.path()));
+
+        std::fs::remove_file(model_dir.join(TTS_LICENSE_FILE_NAME)).expect("remove sidecar");
+        assert!(!slot.is_ready(temp.path()));
+    }
+}
diff --git a/desktop/src-tauri/src/huddle/pipeline.rs b/desktop/src-tauri/src/huddle/pipeline.rs
index 4b1c0eb76..5ff484739 100644
--- a/desktop/src-tauri/src/huddle/pipeline.rs
+++ b/desktop/src-tauri/src/huddle/pipeline.rs
@@ -41,7 +41,7 @@ pub(crate) async fn post_connect_setup(
     // Ensure voice models are downloading (idempotent).
     if let Some(mgr) = models::global_model_manager() {
         mgr.start_stt_download(state.http_client.clone());
-        mgr.start_kokoro_download(state.http_client.clone());
+        mgr.start_tts_download(state.http_client.clone());
     }
 
     // Connect audio relay WebSocket (Opus encode/decode pipeline).
@@ -153,7 +153,7 @@ pub(crate) async fn maybe_start_stt_pipeline(
     Ok(true)
 }
 
-/// Attempt to start the TTS pipeline if Kokoro models are present and TTS is enabled.
+/// Attempt to start the TTS pipeline if TTS models are present and TTS is enabled.
 ///
 /// Returns `Ok(true)` if the pipeline was started, `Ok(false)` if preconditions
 /// aren't met (model not ready, pipeline exists, TTS disabled), or `Err` on failure.
@@ -164,11 +164,11 @@ pub(crate) async fn maybe_start_stt_pipeline(
 /// leaks ~200MB of ONNX sessions. The sentinel is set under the lock before
 /// releasing it for the expensive construction step.
 pub(crate) async fn maybe_start_tts_pipeline(state: &AppState) -> Result<bool, String> {
-    if !models::is_kokoro_ready() {
-        return Ok(false); // Kokoro not downloaded yet — TTS unavailable.
+    if !models::is_tts_ready() {
+        return Ok(false); // TTS model not downloaded yet — TTS unavailable.
     }
 
-    let model_dir = match models::kokoro_model_dir() {
+    let model_dir = match models::tts_model_dir() {
         Some(d) => d,
         None => return Ok(false),
     };
diff --git a/desktop/src-tauri/src/huddle/pocket.rs b/desktop/src-tauri/src/huddle/pocket.rs
new file mode 100644
index 000000000..c90dd8719
--- /dev/null
+++ b/desktop/src-tauri/src/huddle/pocket.rs
@@ -0,0 +1,238 @@
+//! Pocket TTS engine wrapper around sherpa-onnx's `OfflineTts`.
+//!
+//! Pocket TTS is a small (~189 MB int8 ONNX) zero-shot voice-cloning TTS
+//! model from Kyutai. It runs quickly on CPU via sherpa-onnx, replacing the
+//! previous Kokoro-82M engine, which was espeak-free but required a
+//! lexicon-heavy G2P pipeline (Misaki + CMUdict).
+//!
+//! ## Attribution
+//!
+//! - **Model**: Kyutai *Pocket TTS* — Charles, Roebel, et al., 2026.
+//!   arXiv:2509.06926. Original repository: <https://huggingface.co/kyutai/pocket-tts>.
+//!   Licensed CC-BY-4.0.
+//! - **Mimi neural codec**: Kyutai, bundled in the same release. CC-BY-4.0.
+//! - **ONNX export**: KevinAHM —
+//!   <https://huggingface.co/KevinAHM/pocket-tts-onnx>. CC-BY-4.0.
+//!   Provides the reference voice WAV (`reference_sample.wav`).
+//! - **sherpa-onnx repackage**: csukuangfj / k2-fsa —
+//!   <https://huggingface.co/csukuangfj2/sherpa-onnx-pocket-tts-int8-2026-01-26>.
+//!   Repackages KevinAHM's export with the file layout sherpa-onnx's
+//!   `OfflineTtsPocketModelConfig` expects. CC-BY-4.0.
+//!
+//! Sprout ships these files unmodified; see the on-disk `MODEL_LICENSE.txt`
+//! sidecar written by `huddle::models` during install for the canonical
+//! CC-BY-4.0 §3(a)(1) attribution block.
+//!
+//! ## Engine-module contract (see `huddle::tts`)
+//!
+//! `pocket.rs` exposes a fixed surface used by `tts.rs`. Mirroring this
+//! contract is what lets the TTS pipeline stay engine-agnostic:
+//!
+//! - `SAMPLE_RATE: u32`             — engine output sample rate in Hz.
+//! - `DEFAULT_VOICE: &str`          — default voice name (without extension).
+//! - `VOICE_FILE_EXT: &str`         — extension for per-voice files on disk.
+//! - `load_text_to_speech(model_dir)`              → `Result<Engine, String>`
+//! - `load_voice_style(path)`                      → `Result<VoiceStyle, String>`
+//! - `Engine::synth_chunk(&self, text, lang, &VoiceStyle, steps, speed)`
+//!                                                 → `Result<Vec<f32>, String>`
+//!
+//! `lang` and `steps` are accepted for API compatibility with the previous
+//! Kokoro engine but are unused — Pocket TTS does its own language ID from
+//! the input text and is not a diffusion model (consistency LM, one step).
+
+use std::path::{Path, PathBuf};
+
+use sherpa_onnx::{GenerationConfig, OfflineTts, OfflineTtsConfig, Wave};
+
+// ── Engine-module contract: public consts ─────────────────────────────────────
+
+/// Pocket TTS emits 24 kHz mono PCM. Matches the previous Kokoro output rate,
+/// so the rodio sink and inter-sentence silence buffer in `tts.rs` remain valid.
+pub const SAMPLE_RATE: u32 = 24_000;
+
+/// Name (without extension) of the bundled reference voice. The model directory
+/// is expected to contain `<DEFAULT_VOICE>.<VOICE_FILE_EXT>` after install.
+pub const DEFAULT_VOICE: &str = "reference_sample";
+
+/// Voice files for Pocket TTS are reference audio (WAV). Distinct from the
+/// Kokoro `.bin` style vectors — the model conditions on raw waveform samples,
+/// not a precomputed embedding, so the extension change is honest.
+pub const VOICE_FILE_EXT: &str = "wav";
+
+// ── Tuning ────────────────────────────────────────────────────────────────────
+
+/// Single-threaded ONNX execution for predictable CPU contention with the STT
+/// pipeline. Matches `STT_NUM_THREADS` in `stt.rs`; raise only if a benchmark
+/// argues for it.
+const TTS_NUM_THREADS: i32 = 1;
+
+/// LRU cache size for cloned voice embeddings inside the sherpa-onnx engine.
+/// We bind to one voice per pipeline today, but the upstream example uses 16
+/// and the cost is negligible — keep room for future multi-voice support.
+const VOICE_EMBEDDING_CACHE_CAPACITY: i32 = 16;
+
+/// Pocket TTS is a consistency-based LM. Generation quality saturates at one
+/// denoising step — the upstream `GenerationConfig` default of 5 multiplies
+/// synthesis time by ~5× with no audible benefit on this model.
+const SYNTH_NUM_STEPS: i32 = 1;
+
+/// Disable the upstream default 200 ms of pre/post silence padding. We splice
+/// `INTER_SENTENCE_SILENCE` in `tts.rs` ourselves and don't want a double
+/// helping of leading silence on every utterance.
+const SYNTH_SILENCE_SCALE: f32 = 0.0;
+
+// ── ONNX file names (five Pocket TTS sessions plus two JSON tables) ───────────
+
+const FILE_LM_MAIN: &str = "lm_main.int8.onnx";
+const FILE_LM_FLOW: &str = "lm_flow.int8.onnx";
+const FILE_ENCODER: &str = "encoder.onnx";
+const FILE_DECODER: &str = "decoder.int8.onnx";
+const FILE_TEXT_COND: &str = "text_conditioner.onnx";
+const FILE_VOCAB: &str = "vocab.json";
+const FILE_TOKEN_SCORES: &str = "token_scores.json";
+
+// ── Voice style ───────────────────────────────────────────────────────────────
+
+/// Loaded reference voice — normalised f32 PCM samples plus their sample rate.
+///
+/// Pocket TTS takes a reference waveform per generation call (not a
+/// precomputed style embedding), so we keep the samples in memory and clone
+/// the small `Vec` into each `GenerationConfig` rather than re-reading the
+/// WAV from disk on every sentence.
+#[derive(Debug, Clone)]
+pub struct VoiceStyle {
+    samples: Vec<f32>,
+    sample_rate: i32,
+}
+
+/// Load a reference voice WAV from disk.
+///
+/// Accepts any sample rate sherpa-onnx's `Wave::read` can decode — Pocket TTS
+/// resamples internally using `reference_sample_rate`. The bundled
+/// `reference_sample.wav` is 16 kHz mono.
+pub fn load_voice_style(path: &Path) -> Result<VoiceStyle, String> {
+    let path_str = path
+        .to_str()
+        .ok_or_else(|| format!("voice path is not valid UTF-8: {}", path.display()))?;
+    let wave = Wave::read(path_str)
+        .ok_or_else(|| format!("could not read voice WAV at {}", path.display()))?;
+    let samples = wave.samples().to_vec();
+    if samples.is_empty() {
+        return Err(format!("voice WAV is empty: {}", path.display()));
+    }
+    Ok(VoiceStyle {
+        samples,
+        sample_rate: wave.sample_rate(),
+    })
+}
+
+// ── Engine ────────────────────────────────────────────────────────────────────
+
+/// Pocket TTS engine handle. Cheap to construct (one `OfflineTts::create`
+/// call). Owned by the TTS worker thread for the lifetime of a huddle session.
+///
+/// `OfflineTts` does not implement `Debug`, so we don't derive it here — the
+/// pipeline only needs to move the engine into the worker thread and call
+/// `synth_chunk` on it, never to print it.
+pub struct PocketTts {
+    inner: OfflineTts,
+}
+
+/// Build the Pocket TTS engine from the model directory installed by
+/// `huddle::models`. Returns `Err` if any expected ONNX or JSON file is
+/// missing — readiness is normally enforced by `is_tts_ready` upstream, but
+/// the check is repeated here so a manually-modified model dir produces a
+/// clear error string instead of an opaque sherpa-onnx `None`.
+pub fn load_text_to_speech(model_dir: &str) -> Result<PocketTts, String> {
+    let dir = PathBuf::from(model_dir);
+    for name in [
+        FILE_LM_MAIN,
+        FILE_LM_FLOW,
+        FILE_ENCODER,
+        FILE_DECODER,
+        FILE_TEXT_COND,
+        FILE_VOCAB,
+        FILE_TOKEN_SCORES,
+    ] {
+        let p = dir.join(name);
+        if !p.is_file() {
+            return Err(format!("missing Pocket TTS file: {}", p.display()));
+        }
+    }
+
+    let to_str = |name: &str| -> String { dir.join(name).to_string_lossy().into_owned() };
+
+    // Build the config by mutating defaults — mirrors `stt.rs` and stays
+    // resilient if sherpa-onnx adds unrelated model-family fields.
+    let mut cfg = OfflineTtsConfig::default();
+    cfg.model.pocket.lm_main = Some(to_str(FILE_LM_MAIN));
+    cfg.model.pocket.lm_flow = Some(to_str(FILE_LM_FLOW));
+    cfg.model.pocket.encoder = Some(to_str(FILE_ENCODER));
+    cfg.model.pocket.decoder = Some(to_str(FILE_DECODER));
+    cfg.model.pocket.text_conditioner = Some(to_str(FILE_TEXT_COND));
+    cfg.model.pocket.vocab_json = Some(to_str(FILE_VOCAB));
+    cfg.model.pocket.token_scores_json = Some(to_str(FILE_TOKEN_SCORES));
+    cfg.model.pocket.voice_embedding_cache_capacity = VOICE_EMBEDDING_CACHE_CAPACITY;
+    cfg.model.num_threads = TTS_NUM_THREADS;
+    // Explicit — defaults are not part of the API contract, and noisy debug
+    // logging in release builds would be expensive on every synthesized chunk.
+    cfg.model.debug = false;
+
+    let inner = OfflineTts::create(&cfg)
+        .ok_or_else(|| "OfflineTts::create returned None for Pocket TTS".to_string())?;
+    Ok(PocketTts { inner })
+}
+
+impl PocketTts {
+    /// Synthesise `text` with the given reference voice.
+    ///
+    /// `_lang` and `_steps` are accepted for API compatibility with the
+    /// previous Kokoro engine. Pocket TTS infers language from the input text
+    /// directly and is a one-step consistency model. Returns an empty buffer
+    /// for whitespace-only input.
+    pub fn synth_chunk(
+        &self,
+        text: &str,
+        _lang: &str,
+        style: &VoiceStyle,
+        _steps: usize,
+        speed: f32,
+    ) -> Result<Vec<f32>, String> {
+        if text.trim().is_empty() {
+            return Ok(Vec::new());
+        }
+
+        let cfg = GenerationConfig {
+            speed,
+            num_steps: SYNTH_NUM_STEPS,
+            silence_scale: SYNTH_SILENCE_SCALE,
+            reference_audio: Some(style.samples.clone()),
+            reference_sample_rate: style.sample_rate,
+            ..Default::default()
+        };
+
+        // No progress callback — synthesis is fast enough that returning the
+        // whole buffer at once keeps the lookahead pipelining in `tts.rs`
+        // simple. `None::<fn(...) -> bool>` pins the callback type for the
+        // `generate_with_config` generic parameter.
+        let audio = self
+            .inner
+            .generate_with_config(text, &cfg, None::<fn(&[f32], f32) -> bool>)
+            .ok_or_else(|| {
+                format!(
+                    "Pocket TTS synthesis failed for text ({} chars)",
+                    text.chars().count()
+                )
+            })?;
+
+        let sample_rate = audio.sample_rate();
+        if sample_rate != SAMPLE_RATE as i32 {
+            eprintln!(
+                "sprout-desktop: Pocket TTS returned unexpected sample rate {sample_rate}Hz \
+                 (expected {SAMPLE_RATE}Hz); playback speed may be wrong"
+            );
+        }
+
+        Ok(audio.samples().to_vec())
+    }
+}
diff --git a/desktop/src-tauri/src/huddle/tts.rs b/desktop/src-tauri/src/huddle/tts.rs
index 6b5b1b97f..843b76ae5 100644
--- a/desktop/src-tauri/src/huddle/tts.rs
+++ b/desktop/src-tauri/src/huddle/tts.rs
@@ -5,7 +5,7 @@
 //! ```text
 //! caller: pipeline.speak("Hello world. How are you?")
 //!   → bounded sync_channel (TEXT_QUEUE_DEPTH = 8)
-//!   → tts_worker thread (owns 1 Kokoro engine)
+//!   → tts_worker thread (owns 1 Pocket TTS engine)
 //!       1. Preprocess text
 //!       2. Split into sentences
 //!       3. Synthesize each sentence individually → f32 PCM
@@ -35,7 +35,7 @@ use std::{
     time::Duration,
 };
 
-use super::kokoro::{load_text_to_speech, load_voice_style, SAMPLE_RATE};
+use super::pocket::{load_text_to_speech, load_voice_style, SAMPLE_RATE, VOICE_FILE_EXT};
 use super::preprocessing::{preprocess_for_tts, split_sentences};
 
 // ── Constants ─────────────────────────────────────────────────────────────────
@@ -48,21 +48,35 @@ const TEXT_QUEUE_DEPTH: usize = 8;
 /// How long the worker waits on the text channel before checking the shutdown flag.
 const RECV_TIMEOUT: Duration = Duration::from_millis(100);
 
-/// Kokoro ignores denoising steps (not a diffusion model). Kept for API compat.
+/// Pocket TTS is a one-step consistency model, not diffusion. Kept for API compat.
 const SYNTH_STEPS: usize = 1;
 
 /// Synthesis speed multiplier. Slightly faster than natural speech.
 const SYNTH_SPEED: f32 = 1.05;
 
-/// Volume boost applied after synthesis — Kokoro output is normalized.
-/// Start at 1.5 and tune empirically.
-const VOLUME_BOOST: f32 = 1.5;
+/// Target peak amplitude after per-sentence loudness normalization, in linear
+/// scale. −6 dBFS = 10^(−6/20) ≈ 0.501. Leaves 6 dB of headroom above the
+/// loudest sample so the subsequent fade-in/out and any system mixer gain
+/// don't have to soft-clip. See `normalize_for_playback`.
+const TARGET_PEAK: f32 = 0.501_187_2; // 10f32.powf(-6.0 / 20.0)
+
+/// Maximum gain applied by `normalize_for_playback`. Caps amplification on
+/// near-silent buffers so a mid-utterance pause or a malformed synth doesn't
+/// get amplified to full scale (which would surface any quantization noise).
+///
+/// Pocket TTS reference-voice output measured ~7.6% peak on a 75-character
+/// utterance (`examples/pocket_bench`); a gain of 0.501/0.076 ≈ 6.6 lands that
+/// sample at the −6 dBFS target, so `8.0` covers normal utterances while
+/// still catching pathological near-silent buffers.
+const MAX_GAIN: f32 = 8.0;
 
 /// Fade in/out length in samples (8ms at 24kHz ≈ 192 samples).
 /// Eliminates clicks/pops at sentence boundaries.
 const FADE_SAMPLES: usize = (SAMPLE_RATE as f64 * 0.008) as usize;
 
-/// Sentence-by-sentence synthesis for lower TTFA (≈200ms vs ≈600ms for 3-sentence batches).
+/// Sentence-by-sentence synthesis — keeps first-sentence latency low and lets
+/// playback of sentence N overlap with synthesis of sentence N+1 (see the
+/// lookahead pipelining note in the module doc-comment above).
 const BATCH_SIZE: usize = 1;
 
 /// Silence inserted between sentences by the TTS pipeline (seconds).
@@ -87,7 +101,7 @@ pub struct TtsPipeline {
     /// Kept alive here so the Arc isn't dropped — the worker holds a clone.
     #[allow(dead_code)]
     cancel: Arc<AtomicBool>,
-    /// Voice name (e.g. "af_heart"). Stored for future voice-switching support.
+    /// Voice name (e.g. "reference_sample"). Stored for future voice-switching support.
     #[allow(dead_code)]
     voice: String,
     /// Worker thread handle — taken on drop to join cleanly.
@@ -97,8 +111,8 @@ pub struct TtsPipeline {
 impl TtsPipeline {
     /// Spawn the TTS pipeline thread using the default voice.
     ///
-    /// `model_dir` must contain the Kokoro model files:
-    ///   `model_quantized.onnx`, `tokenizer.json`, `voices/<name>.bin`
+    /// `model_dir` must contain the Pocket TTS files declared by `huddle::models`
+    /// (the five ONNX sessions, the two JSON tables, and `<voice>.wav`).
     ///
     /// `tts_active` is set to `true` while audio is playing and `false` when idle.
     /// Pass the same `Arc` to the STT pipeline to gate microphone input.
@@ -112,11 +126,13 @@ impl TtsPipeline {
         cancel: Arc<AtomicBool>,
         output_device: Option<String>,
     ) -> Result<Self, String> {
-        use super::kokoro::DEFAULT_VOICE;
+        use super::pocket::DEFAULT_VOICE;
         Self::new_with_voice(model_dir, tts_active, cancel, DEFAULT_VOICE, output_device)
     }
 
-    /// Spawn the TTS pipeline thread with a specific voice name (e.g. `"af_heart"`, `"am_michael"`).
+    /// Spawn the TTS pipeline thread with a specific voice name. Today only the
+    /// bundled default voice (see `pocket::DEFAULT_VOICE`) is shipped; other
+    /// names will surface a clear error from `load_voice_style`.
     pub fn new_with_voice(
         model_dir: PathBuf,
         tts_active: Arc<AtomicBool>,
@@ -204,14 +220,14 @@ fn tts_worker(
     cancel: Arc<AtomicBool>,
     output_device: Option<String>,
 ) {
-    // ── 1. Initialise Kokoro engine ───────────────────────────────────────────
+    // ── 1. Initialise TTS engine ──────────────────────────────────────────────
     let model_dir_str = model_dir.to_string_lossy().to_string();
 
-    let mut engine = match load_text_to_speech(&model_dir_str) {
+    let engine = match load_text_to_speech(&model_dir_str) {
         Ok(e) => e,
         Err(e) => {
             eprintln!(
-                "sprout-desktop: TTS Kokoro init failed (model_dir={}): {e}. TTS disabled.",
+                "sprout-desktop: TTS engine init failed (model_dir={}): {e}. TTS disabled.",
                 model_dir.display()
             );
             drain_until_shutdown(text_rx, &shutdown);
@@ -220,7 +236,7 @@ fn tts_worker(
     };
 
     // ── 2. Load voice style ───────────────────────────────────────────────────
-    let voice_path = model_dir.join(format!("{voice_name}.bin"));
+    let voice_path = model_dir.join(format!("{voice_name}.{VOICE_FILE_EXT}"));
     let style = match load_voice_style(&voice_path) {
         Ok(s) => s,
         Err(e) => {
@@ -234,9 +250,9 @@ fn tts_worker(
 
     // ── 2b. Warmup inference ─────────────────────────────────────────────────
     // The first ONNX inference on any session is significantly slower than
-    // subsequent ones — it triggers JIT compilation, memory pool allocation,
-    // and (on CoreML) lazy model compilation. Run a short dummy synthesis and
-    // discard the output so the first real utterance runs at warm-session speed.
+    // subsequent ones — it can trigger native session initialization, memory
+    // pool allocation, and graph-specific caches. Run a short dummy synthesis
+    // and discard the output so the first real utterance runs at warm-session speed.
     {
         let t = std::time::Instant::now();
         match engine.synth_chunk("warmup", "en", &style, SYNTH_STEPS, SYNTH_SPEED) {
@@ -368,10 +384,7 @@ fn tts_worker(
 
             match engine.synth_chunk(text, "en", &style, SYNTH_STEPS, SYNTH_SPEED) {
                 Ok(samples) if !samples.is_empty() => {
-                    let mut boosted: Vec<f32> = samples
-                        .iter()
-                        .map(|&s| (s * VOLUME_BOOST).clamp(-1.0, 1.0))
-                        .collect();
+                    let mut boosted = normalize_for_playback(samples);
                     apply_fades(&mut boosted);
                     player.append(SamplesBuffer::new(channels, rate, boosted));
                     // Insert inter-sentence silence after each synthesized chunk.
@@ -439,6 +452,28 @@ fn handle_cancel_or_shutdown(
     false
 }
 
+/// Per-sentence peak normalization. Scales the buffer so its loudest sample
+/// lands at `TARGET_PEAK` (−6 dBFS), capped at `MAX_GAIN` to avoid amplifying
+/// near-silent buffers into quantization noise. Returns the input unchanged on
+/// empty input or pure-silence input.
+///
+/// Why normalize per-sentence rather than apply a fixed boost: Pocket TTS
+/// reference-voice output measured ~7.6% peak on one utterance, but per-
+/// sentence peak distribution isn't known in advance and may vary with text
+/// and any future voice swap. A fixed multiplier either clips loud sentences
+/// or under-amplifies quiet ones; normalization adapts.
+fn normalize_for_playback(samples: Vec<f32>) -> Vec<f32> {
+    let peak = samples.iter().fold(0.0_f32, |acc, &s| acc.max(s.abs()));
+    if peak == 0.0 {
+        return samples;
+    }
+    let gain = (TARGET_PEAK / peak).min(MAX_GAIN);
+    samples
+        .into_iter()
+        .map(|s| (s * gain).clamp(-1.0, 1.0))
+        .collect()
+}
+
 /// Apply a short linear fade-in at the start and fade-out at the end of `samples`.
 ///
 /// Uses `FADE_SAMPLES` (8ms) or half the buffer length, whichever is smaller.
@@ -1025,4 +1060,66 @@ mod tests {
         apply_fades(&mut samples);
         assert_eq!(samples[0], 1.0);
     }
+
+    // ── normalize_for_playback tests ──────────────────────────────────────────
+
+    /// A quiet buffer (peak well under TARGET_PEAK) is scaled up to TARGET_PEAK.
+    /// Approximates the bench-measured Pocket TTS peak (~0.076) and asserts the
+    /// loudest sample lands at the −6 dBFS target to within 1e-3.
+    #[test]
+    fn normalize_for_playback_hits_target_on_quiet_buffer() {
+        // ramp tops out at 0.076 × 0.99 ≈ 0.075 ⇒ gain ≈ 6.7 < MAX_GAIN (8.0), so target hit.
+        let input: Vec<f32> = (0..100).map(|i| 0.076 * (i as f32 / 100.0)).collect();
+        let out = normalize_for_playback(input);
+        let peak = out.iter().fold(0.0_f32, |a, &s| a.max(s.abs()));
+        assert!(
+            (peak - TARGET_PEAK).abs() < 1e-3,
+            "expected peak ~{TARGET_PEAK}, got {peak}"
+        );
+    }
+
+    /// A near-silent buffer would need a huge gain to reach TARGET_PEAK;
+    /// `MAX_GAIN` caps the amplification so we don't bring quantization noise
+    /// up to full scale.
+    #[test]
+    fn normalize_for_playback_caps_gain_on_near_silent_buffer() {
+        // peak 0.001 ⇒ ideal gain 501; MAX_GAIN caps it at 8.0, so the
+        // resulting peak is 0.001 × 8.0 = 0.008, NOT TARGET_PEAK.
+        let input = vec![0.001_f32, -0.001, 0.001];
+        let out = normalize_for_playback(input);
+        let peak = out.iter().fold(0.0_f32, |a, &s| a.max(s.abs()));
+        assert!(
+            (peak - 0.008).abs() < 1e-6,
+            "expected peak 0.008 (gain-capped), got {peak}"
+        );
+        assert!(peak < TARGET_PEAK);
+    }
+
+    /// A pure-silence buffer is returned unchanged — no division by zero, no
+    /// amplification of nothing.
+    #[test]
+    fn normalize_for_playback_silence_is_unchanged() {
+        let input = vec![0.0_f32; 16];
+        let out = normalize_for_playback(input);
+        assert!(out.iter().all(|&s| s == 0.0));
+        assert_eq!(out.len(), 16);
+    }
+
+    /// Empty input round-trips to empty output. No panic on `peak == 0` path.
+    #[test]
+    fn normalize_for_playback_empty_buffer() {
+        let out = normalize_for_playback(Vec::new());
+        assert!(out.is_empty());
+    }
+
+    /// Over-full-scale input is pulled back inside ±1.0. Here `gain * peak`
+    /// equals TARGET_PEAK by construction, so normalization alone bounds the
+    /// output and the `.clamp(-1.0, 1.0)` is not reached; the bound is asserted
+    /// in case future changes loosen TARGET_PEAK or MAX_GAIN past full scale.
+    #[test]
+    fn normalize_for_playback_clamps_to_full_scale() {
+        let input = vec![10.0_f32, -10.0]; // already past full scale before gain
+        let out = normalize_for_playback(input);
+        assert!(out.iter().all(|&s| s.abs() <= 1.0));
+    }
 }
diff --git a/desktop/src-tauri/src/lib.rs b/desktop/src-tauri/src/lib.rs
index f0d024772..0443243a0 100644
--- a/desktop/src-tauri/src/lib.rs
+++ b/desktop/src-tauri/src/lib.rs
@@ -395,10 +395,10 @@ pub fn run() {
 
             // Pre-download voice models in the background so they're ready
             // when the user starts their first huddle. Idempotent — no-op if
-            // already downloaded. ~187 MB total (~100 MB Parakeet STT + ~87 MB Kokoro).
+            // already downloaded. ~289 MB total (~100 MB Parakeet STT + ~189 MB Pocket TTS).
             if let Some(mgr) = huddle::models::global_model_manager() {
                 mgr.start_stt_download(state.http_client.clone());
-                mgr.start_kokoro_download(state.http_client.clone());
+                mgr.start_tts_download(state.http_client.clone());
             }
 
             // Register PTT global shortcut (Ctrl+Space).
diff --git a/desktop/src/features/huddle/components/HuddleBar.tsx b/desktop/src/features/huddle/components/HuddleBar.tsx
index e335ea75f..1d0c6456a 100644
--- a/desktop/src/features/huddle/components/HuddleBar.tsx
+++ b/desktop/src/features/huddle/components/HuddleBar.tsx
@@ -66,7 +66,7 @@ export function HuddleBar({ className }: HuddleBarProps) {
   const [agentAddError, setAgentAddError] = React.useState<string | null>(null);
   const [modelStatus, setModelStatus] = React.useState<{
     stt: string;
-    kokoro: string;
+    tts: string;
   } | null>(null);
   // Huddle state: event-driven + 10s fallback poll.
   React.useEffect(() => {
@@ -133,13 +133,13 @@ export function HuddleBar({ className }: HuddleBarProps) {
       try {
         const status = await invoke<{
           stt: unknown;
-          kokoro: unknown;
+          tts: unknown;
         }>("get_model_status");
         if (cancelled) return;
 
         setModelStatus({
           stt: fmt(status.stt),
-          kokoro: fmt(status.kokoro),
+          tts: fmt(status.tts),
         });
       } catch {
         // best-effort
@@ -228,14 +228,14 @@ export function HuddleBar({ className }: HuddleBarProps) {
 
       {/* Model download progress */}
       {modelStatus &&
-        (modelStatus.stt !== "ready" || modelStatus.kokoro !== "ready") && (
+        (modelStatus.stt !== "ready" || modelStatus.tts !== "ready") && (
           <output className="flex items-center gap-1 text-xs text-muted-foreground">
             <span className="animate-pulse">
-              {modelStatus.stt !== "ready" && modelStatus.kokoro !== "ready"
-                ? `Voice models: STT ${modelStatus.stt}, TTS ${modelStatus.kokoro}`
+              {modelStatus.stt !== "ready" && modelStatus.tts !== "ready"
+                ? `Voice models: STT ${modelStatus.stt}, TTS ${modelStatus.tts}`
                 : modelStatus.stt !== "ready"
                   ? `STT model: ${modelStatus.stt}`
-                  : `TTS model: ${modelStatus.kokoro}`}
+                  : `TTS model: ${modelStatus.tts}`}
             </span>
           </output>
         )}
@@ -464,8 +464,8 @@ export function HuddleBar({ className }: HuddleBarProps) {
           modelStatus.stt !== "ready" &&
           `, STT model ${modelStatus.stt}`}
         {modelStatus &&
-          modelStatus.kokoro !== "ready" &&
-          `, TTS model ${modelStatus.kokoro}`}
+          modelStatus.tts !== "ready" &&
+          `, TTS model ${modelStatus.tts}`}
       </output>
     </div>
   );

From 26bb3cf270978ad573347d27d23b694a75687483 Mon Sep 17 00:00:00 2001
From: Tyler Longwell <tlongwell@squareup.com>
Date: Mon, 18 May 2026 09:52:50 -0400
Subject: [PATCH 02/10] huddle(tts): switch default reference voice to 'Mary'
 (VCTK p333)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the anonymous reference WAV bundled with KevinAHM's Pocket TTS
ONNX export with the 'Mary (f, conversation)' preset from the Kyutai
TTS demo (https://kyutai.org/tts), which maps to vctk/p333_023_enhanced.wav
in kyutai/tts-voices.

Source pin:
- repo: huggingface.co/kyutai/tts-voices
- commit: 323332d33f997de8394f24a193e1a76df720e01a
- path: vctk/p333_023_enhanced.wav
- format: 16-bit mono PCM, 32 kHz, 639,084 bytes
- sha256: a35b0468382218e9f37a9a7494d1e4b74deaf18d7ced22265b4e325bb55c183f
- license: CC-BY-4.0 (VCTK base + ai-coustics enhancement)

The on-disk filename remains 'reference_sample.wav' so engine and bench
code stay voice-agnostic. sherpa-onnx resamples internally via
reference_sample_rate, so the 16 kHz → 32 kHz source change is
transparent to the synthesis pipeline (only the load_voice_style doc
comment in pocket.rs needed updating).

Changes:
- models.rs: new POCKET_REFERENCE_WAV_URL pin, hash swap, TTS_MODEL_VERSION
  1→2 (so existing dev installs re-download cleanly without hitting the
  hash-fail-then-refetch transient), expanded TTS_LICENSE_TEXT block with
  VCTK + ai-coustics attribution per CC-BY-4.0 §3(a)(1).
- pocket.rs: module-doc attribution entry + load_voice_style doc comment
  reflect the new 32 kHz source and Mary's provenance.
- check-file-sizes.mjs: models.rs override 930 → 950 (attribution block
  added ~22 lines).

Verified:
- cargo test --lib (huddle::): 48/48 pass, including
  tts_readiness_requires_license_sidecar.
- cargo test --lib (full): 305/305 pass.
- cargo check --lib: clean.
- check-file-sizes.mjs: clean.
- Live URL fetch: SHA-256 of downloaded file matches TTS_FILE_HASHES entry.

Signed-off-by: Tyler Longwell <tlongwell@squareup.com>
---
 desktop/scripts/check-file-sizes.mjs   |  2 +-
 desktop/src-tauri/src/huddle/models.rs | 36 +++++++++++++++++++++-----
 desktop/src-tauri/src/huddle/pocket.rs |  8 ++++--
 3 files changed, 36 insertions(+), 10 deletions(-)

diff --git a/desktop/scripts/check-file-sizes.mjs b/desktop/scripts/check-file-sizes.mjs
index 1d20f3e4f..e05d5c7d5 100644
--- a/desktop/scripts/check-file-sizes.mjs
+++ b/desktop/scripts/check-file-sizes.mjs
@@ -67,7 +67,7 @@ const overrides = new Map([
   ["src/shared/api/types.ts", 620], // ... + RespondToMode + respondTo/respondToAllowlist on ManagedAgent/Create/Update inputs
   ["src-tauri/src/events.rs", 610], // event builders + build_huddle_guidelines (kind:48106) + post_event_raw transport helper + participant p-tag on join/leave + NIP-43 relay admin builders (add/remove/change-role) + check_relay_role + DM/presence/workflow command builders
   ["src-tauri/src/huddle/mod.rs", 1020], // huddle state machine + Tauri commands + sync protocol doc; state/relay/pipeline extracted + emit_huddle_state_changed wiring
-  ["src-tauri/src/huddle/models.rs", 930], // model download manager for Parakeet TDT-CTC STT + Pocket TTS with streaming downloads + SHA-256 verification + Rust-native tar extraction + version manifest + atomic swap + hot-start signaling + MODEL_LICENSE.txt sidecar (fail-closed readiness) + idempotent legacy Moonshine dir cleanup + tts_readiness_requires_license_sidecar test
+  ["src-tauri/src/huddle/models.rs", 950], // model download manager for Parakeet TDT-CTC STT + Pocket TTS with streaming downloads + SHA-256 verification + Rust-native tar extraction + version manifest + atomic swap + hot-start signaling + MODEL_LICENSE.txt sidecar (fail-closed readiness) + idempotent legacy Moonshine dir cleanup + tts_readiness_requires_license_sidecar test + Mary (VCTK p333) reference voice attribution block
   ["src-tauri/src/huddle/stt.rs", 580], // STT pipeline + PTT edge-detection flush + PTT gating (is_speech AND ptt_active) + barge-in for VAD mode + rubato resampler + earshot VAD + sherpa-onnx transcription
   ["src-tauri/src/huddle/preprocessing.rs", 670], // TTS text preprocessing pipeline + unified split_sentences + int_to_words 0-999999 + URL trailing punctuation preservation + 23 unit tests
   ["src-tauri/src/huddle/relay_api.rs", 520], // audio relay recv task + per-peer frame counting for remote human TTS interrupt + NIP-98 channel member query
diff --git a/desktop/src-tauri/src/huddle/models.rs b/desktop/src-tauri/src/huddle/models.rs
index 8e562762c..888eebefa 100644
--- a/desktop/src-tauri/src/huddle/models.rs
+++ b/desktop/src-tauri/src/huddle/models.rs
@@ -45,10 +45,17 @@ const STT_ARCHIVE_SHA256: &str = "17f945007b52ccd8b7200ffc7c5652e9e8e961dfdf479c
 const POCKET_HF_BASE: &str =
     "https://huggingface.co/csukuangfj2/sherpa-onnx-pocket-tts-int8-2026-01-26/resolve/e715955cf50d18d919d37231513c0e914b83661a";
 
-/// Reference voice WAV from KevinAHM's Pocket TTS ONNX export, pinned to
-/// commit 58a6d00cf13d239b6748cb0769f35c580a8f606c.
+/// Reference voice WAV: "Mary (f, conversation)" from the Kyutai TTS demo
+/// voice set — VCTK speaker p333, ai-coustics-enhanced. Pinned to
+/// kyutai/tts-voices commit 323332d33f997de8394f24a193e1a76df720e01a.
+///
+/// Mapping comes from the speaker dropdown on <https://kyutai.org/tts>:
+/// the Pocket TTS preset "Mary (f, conversation)" maps to
+/// `vctk/p333_023_enhanced.wav`. We rename to `reference_sample.wav` on disk
+/// so the rest of the engine code stays voice-agnostic; the friendly label
+/// only matters for attribution and PR-body docs.
 const POCKET_REFERENCE_WAV_URL: &str =
-    "https://huggingface.co/KevinAHM/pocket-tts-onnx/resolve/58a6d00cf13d239b6748cb0769f35c580a8f606c/reference_sample.wav";
+    "https://huggingface.co/kyutai/tts-voices/resolve/323332d33f997de8394f24a193e1a76df720e01a/vctk/p333_023_enhanced.wav";
 
 /// SHA-256 hashes for individual Pocket TTS model files.
 /// Computed from known-good pinned downloads. Update when upgrading model versions.
@@ -62,7 +69,7 @@ const TTS_FILE_HASHES: &[(&str, &str)] = &[
     ("vocab.json",            "6fb646346cf931016f70c4921aab0900ce7a304b893cb02135c74e294abfea01"),
     ("token_scores.json",     "5be2f278caf9b9800741f0fd82bff677f4943ec764c356f907213434b622d958"),
     ("LICENSE",               "fe7b4ce83b8381cc5b216bbb4af73c570688d1b819c73bbaed8ca401f4677cd6"),
-    ("reference_sample.wav",  "88fbb0d31ec26674e97e531a71758cabe4e0e4e5b5a18dafa783021a7f5c9366"),
+    ("reference_sample.wav",  "a35b0468382218e9f37a9a7494d1e4b74deaf18d7ced22265b4e325bb55c183f"),
 ];
 
 // ── Model versioning ──────────────────────────────────────────────────────────
@@ -79,7 +86,12 @@ const TTS_FILE_HASHES: &[(&str, &str)] = &[
 const STT_MODEL_VERSION: &str = "2";
 
 /// Model manifest version for Pocket TTS. Increment when upgrading model files.
-const TTS_MODEL_VERSION: &str = "1";
+/// Bumped "1" → "2" when the bundled reference voice changed from KevinAHM's
+/// anonymous 16 kHz sample to Mary (VCTK p333, 32 kHz, ai-coustics-enhanced)
+/// from kyutai/tts-voices. The hash mismatch on `reference_sample.wav` would
+/// fail readiness on its own, but the manifest bump makes the re-download
+/// reason explicit and skips the failing-then-re-fetching transient state.
+const TTS_MODEL_VERSION: &str = "2";
 
 /// Filename for the version manifest written alongside model files.
 const MANIFEST_FILENAME: &str = ".sprout-model-manifest";
@@ -146,7 +158,8 @@ const TTS_MODEL_DIR_NAME: &str = "pocket-tts";
 /// Attribution sidecar written next to the Pocket TTS model files.
 const TTS_LICENSE_FILE_NAME: &str = "MODEL_LICENSE.txt";
 
-/// CC-BY-4.0 §3(a)(1) attribution block for Pocket TTS and its ONNX packaging.
+/// CC-BY-4.0 §3(a)(1) attribution block for Pocket TTS, its ONNX packaging,
+/// and the bundled reference voice WAV.
 const TTS_LICENSE_TEXT: &str = "\
 Pocket TTS
 © Kyutai.
@@ -162,7 +175,16 @@ ONNX export by KevinAHM: https://huggingface.co/KevinAHM/pocket-tts-onnx
 Sherpa-onnx repackage by csukuangfj / k2-fsa:
 https://huggingface.co/csukuangfj2/sherpa-onnx-pocket-tts-int8-2026-01-26
 
-Sprout ships the ONNX/model artifacts and reference_sample.wav unmodified,
+Bundled reference voice (reference_sample.wav):
+\"Mary (f, conversation)\" preset from the Kyutai TTS demo voice catalogue
+(https://kyutai.org/tts), distributed via
+https://huggingface.co/kyutai/tts-voices as `vctk/p333_023_enhanced.wav`.
+Original recording from the Voice Cloning Toolkit (VCTK) corpus, speaker p333:
+https://datashare.ed.ac.uk/handle/10283/3443 (CC-BY-4.0).
+Recording enhancement (denoise/dereverb) by ai-coustics:
+https://ai-coustics.com/
+
+Sprout ships all ONNX/model artifacts and the reference voice WAV unmodified,
 renamed only by placement in the local model directory.
 
 Provided \"AS IS\", without warranty of any kind, express or implied. See the
diff --git a/desktop/src-tauri/src/huddle/pocket.rs b/desktop/src-tauri/src/huddle/pocket.rs
index c90dd8719..12bf65f0a 100644
--- a/desktop/src-tauri/src/huddle/pocket.rs
+++ b/desktop/src-tauri/src/huddle/pocket.rs
@@ -13,11 +13,15 @@
 //! - **Mimi neural codec**: Kyutai, bundled in the same release. CC-BY-4.0.
 //! - **ONNX export**: KevinAHM —
 //!   <https://huggingface.co/KevinAHM/pocket-tts-onnx>. CC-BY-4.0.
-//!   Provides the reference voice WAV (`reference_sample.wav`).
 //! - **sherpa-onnx repackage**: csukuangfj / k2-fsa —
 //!   <https://huggingface.co/csukuangfj2/sherpa-onnx-pocket-tts-int8-2026-01-26>.
 //!   Repackages KevinAHM's export with the file layout sherpa-onnx's
 //!   `OfflineTtsPocketModelConfig` expects. CC-BY-4.0.
+//! - **Reference voice WAV** (`reference_sample.wav`): the "Mary
+//!   (f, conversation)" preset from the Kyutai TTS demo
+//!   (<https://kyutai.org/tts>), which maps to `vctk/p333_023_enhanced.wav`
+//!   in <https://huggingface.co/kyutai/tts-voices>. CC-BY-4.0, base recording
+//!   from the VCTK corpus, enhanced by ai-coustics.
 //!
 //! Sprout ships these files unmodified; see the on-disk `MODEL_LICENSE.txt`
 //! sidecar written by `huddle::models` during install for the canonical
@@ -109,7 +113,7 @@ pub struct VoiceStyle {
 ///
 /// Accepts any sample rate sherpa-onnx's `Wave::read` can decode — Pocket TTS
 /// resamples internally using `reference_sample_rate`. The bundled
-/// `reference_sample.wav` is 16 kHz mono.
+/// `reference_sample.wav` ("Mary" — VCTK p333, enhanced) is 32 kHz mono.
 pub fn load_voice_style(path: &Path) -> Result<VoiceStyle, String> {
     let path_str = path
         .to_str()

From 773a2a1ddd73e72d7e163dbf04692f1a72bf1dfb Mon Sep 17 00:00:00 2001
From: Tyler Longwell <tlongwell@squareup.com>
Date: Mon, 18 May 2026 10:25:38 -0400
Subject: [PATCH 03/10] huddle(tts): prep short Pocket TTS prompts to prevent
 runaway generation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pocket TTS' autoregressive LM has a stochastic sampler (temp=0.7, random
seed) and a hard 500-frame ceiling (~40s of audio at 12.5Hz Mimi frame
rate). On very short, unpunctuated, or lowercase inputs, the EOS logit
(threshold > -4) sometimes never fires within those 500 frames, so the
model produces tens of seconds of nonsensical "breathing" output.

Tyler hit this on the first 'yep' utterance after the Mary swap landed:
~30s of monster noise, then subsequent utterances were fine. Non-
deterministic by design — the next sampled trajectory escaped the trap.

Sherpa-onnx's C++ Pocket TTS impl does not run the prompt preparation
that upstream kyutai-labs/pocket-tts applies in Python. This commit
mirrors that recipe locally:

  prepare_pocket_prompt():
    1. Collapse interior whitespace.
    2. Capitalize the first letter.
    3. Append '.' if no terminal punctuation.
    4. If <=4 words: prepend 8 spaces; bump frames_after_eos 1 -> 3.
    5. Compute adaptive max_frames from word count (saturates at 500).

  synth_chunk():
    - Routes input through prepare_pocket_prompt before calling
      generate_with_config.
    - Plumbs frames_after_eos + max_frames via GenerationConfig.extra
      (HashMap<String, serde_json::Value>) — exposed by the sherpa-onnx
      1.12 Rust binding as a per-call escape hatch.

Source: pocket_tts.models.tts_model.prepare_text_prompt and
_estimate_max_gen_len in https://github.com/kyutai-labs/pocket-tts.

12 new unit tests cover: empty input, the literal 'yep' case (must
become '        Yep.'), threshold inclusivity at 4 vs 5 words, preserved
existing punctuation, whitespace collapsing, no-double-capitalize,
non-ASCII first letter (Cyrillic д -> Д), max_frames tightness/clamping/
monotonicity.

Test results:
  cargo test --lib huddle::pocket  -> 12/12 pass
  cargo test --lib                 -> 317/317 pass (was 305 + 12 new)
  cargo fmt --check                -> clean
  cargo clippy huddle::pocket      -> clean (preexisting doc-list
                                            overindent warning at line
                                            41 not introduced here)
  node check-file-sizes.mjs        -> clean (pocket.rs override added)

Signed-off-by: Tyler Longwell <tlongwell@squareup.com>
---
 desktop/scripts/check-file-sizes.mjs   |   1 +
 desktop/src-tauri/src/huddle/pocket.rs | 296 ++++++++++++++++++++++++-
 2 files changed, 292 insertions(+), 5 deletions(-)

diff --git a/desktop/scripts/check-file-sizes.mjs b/desktop/scripts/check-file-sizes.mjs
index e05d5c7d5..bc1ebcc55 100644
--- a/desktop/scripts/check-file-sizes.mjs
+++ b/desktop/scripts/check-file-sizes.mjs
@@ -69,6 +69,7 @@ const overrides = new Map([
   ["src-tauri/src/huddle/mod.rs", 1020], // huddle state machine + Tauri commands + sync protocol doc; state/relay/pipeline extracted + emit_huddle_state_changed wiring
   ["src-tauri/src/huddle/models.rs", 950], // model download manager for Parakeet TDT-CTC STT + Pocket TTS with streaming downloads + SHA-256 verification + Rust-native tar extraction + version manifest + atomic swap + hot-start signaling + MODEL_LICENSE.txt sidecar (fail-closed readiness) + idempotent legacy Moonshine dir cleanup + tts_readiness_requires_license_sidecar test + Mary (VCTK p333) reference voice attribution block
   ["src-tauri/src/huddle/stt.rs", 580], // STT pipeline + PTT edge-detection flush + PTT gating (is_speech AND ptt_active) + barge-in for VAD mode + rubato resampler + earshot VAD + sherpa-onnx transcription
+  ["src-tauri/src/huddle/pocket.rs", 560], // Pocket TTS engine wrapper + prepare_pocket_prompt (capitalize/punctuate/pad short inputs, mirror upstream pocket-tts prepare_text_prompt) + estimate_max_frames (adaptive max_frames cap to prevent runaway "monster breathing" generation when EOS logit fails to fire) + 12 unit tests
   ["src-tauri/src/huddle/preprocessing.rs", 670], // TTS text preprocessing pipeline + unified split_sentences + int_to_words 0-999999 + URL trailing punctuation preservation + 23 unit tests
   ["src-tauri/src/huddle/relay_api.rs", 520], // audio relay recv task + per-peer frame counting for remote human TTS interrupt + NIP-98 channel member query
   ["src-tauri/src/huddle/tts.rs", 1130], // TTS pipeline + session warmup + cancel/shutdown handling + apply_fades + normalize_for_playback (per-sentence peak normalization to -6 dBFS with MAX_GAIN cap) + 24 unit tests (18 interrupt + 6 fade/normalize)
diff --git a/desktop/src-tauri/src/huddle/pocket.rs b/desktop/src-tauri/src/huddle/pocket.rs
index 12bf65f0a..fe0dcc743 100644
--- a/desktop/src-tauri/src/huddle/pocket.rs
+++ b/desktop/src-tauri/src/huddle/pocket.rs
@@ -44,6 +44,7 @@
 //! Kokoro engine but are unused — Pocket TTS does its own language ID from
 //! the input text and is not a diffusion model (consistency LM, one step).
 
+use std::collections::HashMap;
 use std::path::{Path, PathBuf};
 
 use sherpa_onnx::{GenerationConfig, OfflineTts, OfflineTtsConfig, Wave};
@@ -85,6 +86,35 @@ const SYNTH_NUM_STEPS: i32 = 1;
 /// helping of leading silence on every utterance.
 const SYNTH_SILENCE_SCALE: f32 = 0.0;
 
+/// Mimi codec frame rate — the LM samples one latent per 80 ms. Used to convert
+/// a token-count estimate into a `max_frames` cap, mirroring upstream
+/// `pocket_tts.models.tts_model._estimate_max_gen_len`.
+const MIMI_FRAME_RATE: f32 = 12.5;
+
+/// Upstream-derived "expected tokens per second of speech" for short inputs.
+/// Used by [`estimate_max_frames`] together with `GEN_SECONDS_PADDING` to cap
+/// runaway generation when the EOS logit fails to fire. Source:
+/// `pocket_tts.models.tts_model.TTSModel._TOKENS_PER_SECOND_ESTIMATE`.
+const TOKENS_PER_SECOND_ESTIMATE: f32 = 3.0;
+
+/// Slack added to the token-derived gen-length estimate, in seconds. Source:
+/// `pocket_tts.models.tts_model.TTSModel._GEN_SECONDS_PADDING`.
+const GEN_SECONDS_PADDING: f32 = 2.0;
+
+/// Hard ceiling on per-chunk generation length, in Mimi frames. Matches the
+/// sherpa-onnx upstream default (`offline-tts-pocket-impl.h:max_frames`) and
+/// is the worst-case bound we'll ever ask for. 500 frames = 40 s of audio.
+const MAX_FRAMES_HARD_CEILING: i32 = 500;
+
+/// Word-count threshold (inclusive) below which we (a) pad the prompt with
+/// leading spaces and (b) ask for `frames_after_eos = 3` instead of 1.
+/// Matches upstream `pocket_tts.models.tts_model.prepare_text_prompt`.
+const SHORT_PROMPT_WORD_THRESHOLD: usize = 4;
+
+/// Number of leading spaces prepended to short prompts. The upstream Python
+/// uses exactly 8 — keep parity rather than tuning blindly.
+const SHORT_PROMPT_PAD_SPACES: usize = 8;
+
 // ── ONNX file names (five Pocket TTS sessions plus two JSON tables) ───────────
 
 const FILE_LM_MAIN: &str = "lm_main.int8.onnx";
@@ -187,6 +217,122 @@ pub fn load_text_to_speech(model_dir: &str) -> Result<PocketTts, String> {
     Ok(PocketTts { inner })
 }
 
+// ── Prompt preparation ────────────────────────────────────────────────────────
+
+/// Result of [`prepare_pocket_prompt`]: a synthesizer-ready prompt plus the
+/// per-call generation hints derived from the original text.
+#[derive(Debug, Clone, PartialEq)]
+pub(crate) struct PreparedPrompt {
+    /// Text to hand to `OfflineTts::generate_with_config`. Capitalized,
+    /// punctuation-terminated, and (for short inputs) left-padded with spaces.
+    pub text: String,
+    /// Value to pass via `GenerationConfig.extra["frames_after_eos"]`.
+    pub frames_after_eos: i32,
+    /// Value to pass via `GenerationConfig.extra["max_frames"]`. Adaptive to
+    /// text length — short prompts get a much tighter cap to prevent runaway
+    /// "monster breathing" generation when the EOS logit fails to fire.
+    pub max_frames: i32,
+}
+
+/// Mirror of upstream `pocket_tts.models.tts_model.prepare_text_prompt` plus
+/// `_estimate_max_gen_len`. Sherpa-onnx's C++ Pocket TTS impl does not run
+/// these preparation steps, so short / unpunctuated / lowercase inputs can
+/// trigger up to 40 s of runaway generation when the EOS logit never crosses
+/// its threshold. We replicate the upstream Python recipe here:
+///
+/// 1. Collapse interior whitespace (already done by `preprocess_for_tts`, but
+///    cheap to re-check after sentence splitting).
+/// 2. Capitalize the first letter.
+/// 3. Append `.` if the text doesn't end in punctuation.
+/// 4. If fewer than five words, prepend `SHORT_PROMPT_PAD_SPACES` spaces and
+///    bump `frames_after_eos` from 1 → 3.
+/// 5. Compute an adaptive `max_frames` from the (post-padding) word count.
+///
+/// Returns `None` only if the input is empty after trimming — caller should
+/// skip synthesis in that case.
+pub(crate) fn prepare_pocket_prompt(input: &str) -> Option<PreparedPrompt> {
+    let trimmed = input.trim();
+    if trimmed.is_empty() {
+        return None;
+    }
+
+    // Collapse stray double-spaces / embedded newlines that may slip past
+    // `preprocess_for_tts` when sentences are spliced back together.
+    let mut cleaned = String::with_capacity(trimmed.len());
+    let mut last_was_space = false;
+    for ch in trimmed.chars() {
+        let is_ws = ch.is_whitespace();
+        if is_ws {
+            if !last_was_space {
+                cleaned.push(' ');
+            }
+            last_was_space = true;
+        } else {
+            cleaned.push(ch);
+            last_was_space = false;
+        }
+    }
+
+    // Capitalize first character. Uses `to_uppercase` (multi-codepoint safe).
+    let first = cleaned.chars().next().expect("cleaned non-empty above");
+    if first.is_lowercase() {
+        let upper: String = first.to_uppercase().collect();
+        let mut iter = cleaned.chars();
+        iter.next();
+        cleaned = upper + iter.as_str();
+    }
+
+    // Ensure terminal punctuation. Anything not in `.!?;:,` gets a period.
+    // The upstream Python only checks `isalnum` → period, but our agent text
+    // may already end in `!` `?` `.` etc. — treat any of those as OK.
+    let last = cleaned
+        .chars()
+        .next_back()
+        .expect("cleaned non-empty above");
+    if !matches!(last, '.' | '!' | '?' | ';' | ':' | ',') {
+        cleaned.push('.');
+    }
+
+    // Word count of the *cleaned but not padded* text — padding is whitespace
+    // only and would just lie to the threshold check below.
+    let word_count = cleaned.split_whitespace().count();
+    let is_short = word_count <= SHORT_PROMPT_WORD_THRESHOLD;
+
+    let final_text = if is_short {
+        let mut padded = String::with_capacity(cleaned.len() + SHORT_PROMPT_PAD_SPACES);
+        for _ in 0..SHORT_PROMPT_PAD_SPACES {
+            padded.push(' ');
+        }
+        padded.push_str(&cleaned);
+        padded
+    } else {
+        cleaned
+    };
+
+    let frames_after_eos = if is_short { 3 } else { 1 };
+    let max_frames = estimate_max_frames(word_count);
+
+    Some(PreparedPrompt {
+        text: final_text,
+        frames_after_eos,
+        max_frames,
+    })
+}
+
+/// Convert a word count into a Mimi-frame cap, matching upstream
+/// `_estimate_max_gen_len`. We use words as a sentencepiece-token proxy: real
+/// SP tokenization runs ~1.2–1.5 tokens/word for English, which the
+/// `GEN_SECONDS_PADDING` slack absorbs. Saturates at
+/// `MAX_FRAMES_HARD_CEILING` so we never *raise* the upstream default.
+fn estimate_max_frames(word_count: usize) -> i32 {
+    // Treat each word as ~1.3 tokens — within the slack envelope but a touch
+    // generous so we don't truncate genuine short utterances.
+    let approx_tokens = word_count as f32 * 1.3;
+    let gen_len_sec = approx_tokens / TOKENS_PER_SECOND_ESTIMATE + GEN_SECONDS_PADDING;
+    let frames = (gen_len_sec * MIMI_FRAME_RATE).ceil() as i32;
+    frames.clamp(1, MAX_FRAMES_HARD_CEILING)
+}
+
 impl PocketTts {
     /// Synthesise `text` with the given reference voice.
     ///
@@ -202,9 +348,29 @@ impl PocketTts {
         _steps: usize,
         speed: f32,
     ) -> Result<Vec<f32>, String> {
-        if text.trim().is_empty() {
-            return Ok(Vec::new());
-        }
+        // Mirror upstream pocket-tts prompt prep — without this short or
+        // unpunctuated inputs can cause the LM's EOS logit to never trip,
+        // producing up to 40 s of "monster breathing" garbage on the first
+        // utterance. See `prepare_pocket_prompt` for the full recipe.
+        let prepared = match prepare_pocket_prompt(text) {
+            Some(p) => p,
+            None => return Ok(Vec::new()),
+        };
+
+        // Per-call generation hints sherpa-onnx forwards to
+        // `offline-tts-pocket-impl.h`. `frames_after_eos` is bumped for short
+        // prompts to give the model trailing room to gracefully decay; the
+        // adaptive `max_frames` is the safety net that bounds runaway
+        // generation when EOS never fires.
+        let mut extra: HashMap<String, serde_json::Value> = HashMap::with_capacity(2);
+        extra.insert(
+            "frames_after_eos".to_string(),
+            serde_json::Value::from(prepared.frames_after_eos),
+        );
+        extra.insert(
+            "max_frames".to_string(),
+            serde_json::Value::from(prepared.max_frames),
+        );
 
         let cfg = GenerationConfig {
             speed,
@@ -212,6 +378,7 @@ impl PocketTts {
             silence_scale: SYNTH_SILENCE_SCALE,
             reference_audio: Some(style.samples.clone()),
             reference_sample_rate: style.sample_rate,
+            extra: Some(extra),
             ..Default::default()
         };
 
@@ -221,11 +388,11 @@ impl PocketTts {
         // `generate_with_config` generic parameter.
         let audio = self
             .inner
-            .generate_with_config(text, &cfg, None::<fn(&[f32], f32) -> bool>)
+            .generate_with_config(&prepared.text, &cfg, None::<fn(&[f32], f32) -> bool>)
             .ok_or_else(|| {
                 format!(
                     "Pocket TTS synthesis failed for text ({} chars)",
-                    text.len()
+                    prepared.text.len()
                 )
             })?;
 
@@ -240,3 +407,122 @@ impl PocketTts {
         Ok(audio.samples().to_vec())
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // ── prepare_pocket_prompt ────────────────────────────────────────────────
+
+    #[test]
+    fn prepare_prompt_returns_none_for_empty_input() {
+        assert!(prepare_pocket_prompt("").is_none());
+        assert!(prepare_pocket_prompt("   ").is_none());
+        assert!(prepare_pocket_prompt("\n\t  ").is_none());
+    }
+
+    #[test]
+    fn prepare_prompt_pads_and_capitalizes_one_word() {
+        // The "yep" case Tyler hit in production — bare lowercase one-word
+        // utterance with no punctuation. Must be padded, capitalized, and
+        // terminated.
+        let out = prepare_pocket_prompt("yep").expect("non-empty");
+        let pad = " ".repeat(SHORT_PROMPT_PAD_SPACES);
+        assert_eq!(out.text, format!("{pad}Yep."));
+        assert_eq!(out.frames_after_eos, 3);
+        // 1 word → very low frame cap (well under the 500 hard ceiling).
+        assert!(out.max_frames < MAX_FRAMES_HARD_CEILING);
+    }
+
+    #[test]
+    fn prepare_prompt_preserves_existing_punctuation() {
+        let out = prepare_pocket_prompt("yes!").expect("non-empty");
+        let pad = " ".repeat(SHORT_PROMPT_PAD_SPACES);
+        assert_eq!(out.text, format!("{pad}Yes!")); // exclamation kept
+        let out = prepare_pocket_prompt("really?").expect("non-empty");
+        assert_eq!(out.text, format!("{pad}Really?"));
+    }
+
+    #[test]
+    fn prepare_prompt_threshold_is_inclusive_at_four_words() {
+        // 4 words = short (padded); 5 words = long (not padded).
+        let four = prepare_pocket_prompt("one two three four").expect("non-empty");
+        assert!(
+            four.text.starts_with(' '),
+            "four-word input should be padded"
+        );
+        assert_eq!(four.frames_after_eos, 3);
+
+        let five = prepare_pocket_prompt("one two three four five").expect("non-empty");
+        assert!(
+            !five.text.starts_with(' '),
+            "five-word input should NOT be padded"
+        );
+        assert_eq!(five.frames_after_eos, 1);
+    }
+
+    #[test]
+    fn prepare_prompt_does_not_pad_long_text() {
+        let long = "This is a longer sentence that the model should handle just fine.";
+        let out = prepare_pocket_prompt(long).expect("non-empty");
+        assert!(!out.text.starts_with(' '));
+        assert_eq!(out.frames_after_eos, 1);
+        assert!(out.text.ends_with('.'));
+    }
+
+    #[test]
+    fn prepare_prompt_collapses_whitespace() {
+        let out = prepare_pocket_prompt("Hello    world\n\nfriend").expect("non-empty");
+        // Padded (3 words → short), with interior whitespace collapsed.
+        let pad = " ".repeat(SHORT_PROMPT_PAD_SPACES);
+        assert_eq!(out.text, format!("{pad}Hello world friend."));
+    }
+
+    #[test]
+    fn prepare_prompt_does_not_double_capitalize_already_uppercase() {
+        let out = prepare_pocket_prompt("HELLO there").expect("non-empty");
+        let pad = " ".repeat(SHORT_PROMPT_PAD_SPACES);
+        assert_eq!(out.text, format!("{pad}HELLO there."));
+    }
+
+    #[test]
+    fn prepare_prompt_handles_non_ascii_first_letter() {
+        // Cyrillic lowercase 'д' → uppercase 'Д'. Must not panic / produce
+        // mojibake.
+        let out = prepare_pocket_prompt("дa").expect("non-empty");
+        assert!(out.text.contains("Дa."));
+    }
+
+    // ── estimate_max_frames ──────────────────────────────────────────────────
+
+    #[test]
+    fn estimate_max_frames_is_tight_for_short_input() {
+        // 1 word: 1 * 1.3 / 3.0 + 2.0 ≈ 2.43s ≈ 31 frames. Well below 500.
+        let frames = estimate_max_frames(1);
+        assert!(frames > 0);
+        assert!(frames < 50, "got {frames}");
+    }
+
+    #[test]
+    fn estimate_max_frames_saturates_at_ceiling() {
+        // 5000 words ≈ a runaway prompt; must clamp at the hard ceiling.
+        assert_eq!(estimate_max_frames(5_000), MAX_FRAMES_HARD_CEILING);
+    }
+
+    #[test]
+    fn estimate_max_frames_grows_with_word_count() {
+        let small = estimate_max_frames(2);
+        let medium = estimate_max_frames(20);
+        let large = estimate_max_frames(100);
+        assert!(small < medium);
+        assert!(medium < large);
+        assert!(large <= MAX_FRAMES_HARD_CEILING);
+    }
+
+    #[test]
+    fn estimate_max_frames_never_zero() {
+        // Sanity: even a 0-word prompt yields ≥1 frame so we never ask the
+        // engine for an impossible cap.
+        assert!(estimate_max_frames(0) >= 1);
+    }
+}

From 1dbfa2c66e8364e48e39efc56f50ec3f6f8ee2dc Mon Sep 17 00:00:00 2001
From: npub1cc3ha7z055mu0rwwu7806t2wt8mj3pvu0uv5mfp2c50dahaqhczshdalg6
 <c6237ef84fa537c78dcee78efd2d4e59f728859c7f194da42ac51ededfa0be05@sprout-oss.stage.blox.sqprod.co>
Date: Mon, 18 May 2026 11:10:35 -0400
Subject: [PATCH 04/10] huddle(tts): stop overriding sherpa-onnx
 frames_after_eos on long prompts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The earlier prep-prompt fix (commit 773a2a1) bounded the runaway 'monster
breathing' bug on short inputs, but did so by forcing
`frames_after_eos = 1` on every prompt with ≥5 words. That's *lower*
than the sherpa-onnx upstream default of 3, and it clipped the leading
audio of multi-clause sentences:

  Input:  'Yep, I can hear you. What can I help with?'
  Output: a quick burst of static where 'Yep, I can hear you.' should be.

The sentence splitter emits 'Yep, I can hear you.' (5 words, comma not a
boundary) as the first chunk, which under 773a2a1 hit the long-input
branch and got frames_after_eos=1 — too few trailing LM frames after EOS
fires for the codec to settle, hence the static-burst.

Re-reading offline-tts-pocket-impl.h confirmed two things:

  1. The upstream default for frames_after_eos is **3**, not 1. My
     773a2a1 docs had this backwards.
  2. The 500-frame max_frames default (~40s) is fine for any reasonable
     prompt. We only need a tighter cap on the short-input path where
     the EOS-never-fires runaway bug originally manifested.

This commit:

  - Removes the per-call frames_after_eos override entirely. sherpa-onnx
    keeps its default of 3 for every prompt, including short ones (which
    upstream pocket_tts.py also bumps to 3 — so the default is right).
  - Keeps the max_frames override, but only on short (≤4-word) padded
    inputs, set to a generous 100 frames (~8s) — enough slack to never
    truncate a legitimate short reply.
  - Removes estimate_max_frames() (no callers; the new short cap is a
    fixed constant, not adaptive).
  - Refactors the extra-HashMap builder into build_generation_extra()
    so we can structurally regression-test it.
  - Adds property test build_extra_never_lowers_frames_after_eos_for_any_word_count
    that sweeps a range of prompt lengths and fails CI if anyone ever
    reintroduces a frames_after_eos override below the upstream default.
  - Adds build_extra_long_prompt_is_none which pins down the specific
    'Yep, I can hear you.' regression from this report.

Tests: cargo test --lib huddle::pocket → 12/12 pass.

Refs commit 773a2a1 (initial prep-prompt fix).
Bumps pocket.rs line-size override 560 → 620 (refactored helper + new
tests).

Signed-off-by: Tyler Longwell <tlongwell@squareup.com>
Signed-off-by: npub1cc3ha7z055mu0rwwu7806t2wt8mj3pvu0uv5mfp2c50dahaqhczshdalg6 <c6237ef84fa537c78dcee78efd2d4e59f728859c7f194da42ac51ededfa0be05@sprout-oss.stage.blox.sqprod.co>
---
 desktop/scripts/check-file-sizes.mjs   |   2 +-
 desktop/src-tauri/src/huddle/pocket.rs | 265 ++++++++++++++++---------
 2 files changed, 168 insertions(+), 99 deletions(-)

diff --git a/desktop/scripts/check-file-sizes.mjs b/desktop/scripts/check-file-sizes.mjs
index bc1ebcc55..8a514ae2b 100644
--- a/desktop/scripts/check-file-sizes.mjs
+++ b/desktop/scripts/check-file-sizes.mjs
@@ -69,7 +69,7 @@ const overrides = new Map([
   ["src-tauri/src/huddle/mod.rs", 1020], // huddle state machine + Tauri commands + sync protocol doc; state/relay/pipeline extracted + emit_huddle_state_changed wiring
   ["src-tauri/src/huddle/models.rs", 950], // model download manager for Parakeet TDT-CTC STT + Pocket TTS with streaming downloads + SHA-256 verification + Rust-native tar extraction + version manifest + atomic swap + hot-start signaling + MODEL_LICENSE.txt sidecar (fail-closed readiness) + idempotent legacy Moonshine dir cleanup + tts_readiness_requires_license_sidecar test + Mary (VCTK p333) reference voice attribution block
   ["src-tauri/src/huddle/stt.rs", 580], // STT pipeline + PTT edge-detection flush + PTT gating (is_speech AND ptt_active) + barge-in for VAD mode + rubato resampler + earshot VAD + sherpa-onnx transcription
-  ["src-tauri/src/huddle/pocket.rs", 560], // Pocket TTS engine wrapper + prepare_pocket_prompt (capitalize/punctuate/pad short inputs, mirror upstream pocket-tts prepare_text_prompt) + estimate_max_frames (adaptive max_frames cap to prevent runaway "monster breathing" generation when EOS logit fails to fire) + 12 unit tests
+  ["src-tauri/src/huddle/pocket.rs", 620], // Pocket TTS engine wrapper + prepare_pocket_prompt (capitalize/punctuate/pad short inputs, mirror upstream pocket-tts prepare_text_prompt) + build_generation_extra (only overrides max_frames, and only for ≤4-word inputs, to bound runaway "monster breathing" generation without clipping multi-clause sentences) + 12 unit tests including regressions for the static-burst-on-multi-clause-sentence bug
   ["src-tauri/src/huddle/preprocessing.rs", 670], // TTS text preprocessing pipeline + unified split_sentences + int_to_words 0-999999 + URL trailing punctuation preservation + 23 unit tests
   ["src-tauri/src/huddle/relay_api.rs", 520], // audio relay recv task + per-peer frame counting for remote human TTS interrupt + NIP-98 channel member query
   ["src-tauri/src/huddle/tts.rs", 1130], // TTS pipeline + session warmup + cancel/shutdown handling + apply_fades + normalize_for_playback (per-sentence peak normalization to -6 dBFS with MAX_GAIN cap) + 24 unit tests (18 interrupt + 6 fade/normalize)
diff --git a/desktop/src-tauri/src/huddle/pocket.rs b/desktop/src-tauri/src/huddle/pocket.rs
index fe0dcc743..73f004beb 100644
--- a/desktop/src-tauri/src/huddle/pocket.rs
+++ b/desktop/src-tauri/src/huddle/pocket.rs
@@ -86,35 +86,43 @@ const SYNTH_NUM_STEPS: i32 = 1;
 /// helping of leading silence on every utterance.
 const SYNTH_SILENCE_SCALE: f32 = 0.0;
 
-/// Mimi codec frame rate — the LM samples one latent per 80 ms. Used to convert
-/// a token-count estimate into a `max_frames` cap, mirroring upstream
-/// `pocket_tts.models.tts_model._estimate_max_gen_len`.
-const MIMI_FRAME_RATE: f32 = 12.5;
-
-/// Upstream-derived "expected tokens per second of speech" for short inputs.
-/// Used by [`estimate_max_frames`] together with `GEN_SECONDS_PADDING` to cap
-/// runaway generation when the EOS logit fails to fire. Source:
-/// `pocket_tts.models.tts_model.TTSModel._TOKENS_PER_SECOND_ESTIMATE`.
-const TOKENS_PER_SECOND_ESTIMATE: f32 = 3.0;
-
-/// Slack added to the token-derived gen-length estimate, in seconds. Source:
-/// `pocket_tts.models.tts_model.TTSModel._GEN_SECONDS_PADDING`.
-const GEN_SECONDS_PADDING: f32 = 2.0;
-
-/// Hard ceiling on per-chunk generation length, in Mimi frames. Matches the
-/// sherpa-onnx upstream default (`offline-tts-pocket-impl.h:max_frames`) and
-/// is the worst-case bound we'll ever ask for. 500 frames = 40 s of audio.
-const MAX_FRAMES_HARD_CEILING: i32 = 500;
-
-/// Word-count threshold (inclusive) below which we (a) pad the prompt with
-/// leading spaces and (b) ask for `frames_after_eos = 3` instead of 1.
-/// Matches upstream `pocket_tts.models.tts_model.prepare_text_prompt`.
+/// sherpa-onnx upstream default for `max_frames` (LM steps), in
+/// `offline-tts-pocket-impl.h:Generate`. 500 steps ≈ 40 s of audio at the
+/// Mimi 12.5 Hz frame rate. Referenced only by the regression test below;
+/// production code path never raises (or even reads) this value — we just
+/// leave sherpa-onnx's own default in place by not setting the override.
+#[cfg(test)]
+const SHERPA_ONNX_MAX_FRAMES_DEFAULT: i32 = 500;
+
+/// Tight `max_frames` we ask for on short, padded prompts to bound the
+/// original "monster breathing" runaway. 100 LM steps ≈ 8 s of audio —
+/// roomy for any one-to-four-word utterance the user is likely to elicit
+/// while still well short of the 40 s upstream default. Chosen with slack so
+/// we never *truncate* a legitimate short reply.
+const SHORT_PROMPT_MAX_FRAMES: i32 = 100;
+
+/// Word-count threshold (inclusive) below which we pad the prompt with
+/// leading spaces and cap `max_frames` tighter than the upstream default.
+/// Matches upstream `pocket_tts.models.tts_model.prepare_text_prompt`. Above
+/// this threshold we leave sherpa-onnx's own defaults in place — overriding
+/// them caused the "first 'yep' is just static" regression seen on
+/// 2026-05-18, where dropping `frames_after_eos` below the upstream default
+/// of 3 clipped the leading audio of multi-clause sentences.
 const SHORT_PROMPT_WORD_THRESHOLD: usize = 4;
 
 /// Number of leading spaces prepended to short prompts. The upstream Python
 /// uses exactly 8 — keep parity rather than tuning blindly.
 const SHORT_PROMPT_PAD_SPACES: usize = 8;
 
+/// sherpa-onnx's documented `frames_after_eos` default. We deliberately do
+/// *not* override this knob — the previous attempt (3 for short inputs, 1
+/// for long inputs) put long prompts below the upstream default of 3,
+/// which clipped the leading audio of multi-clause sentences (the
+/// "first 'yep' is static" regression). The constant exists only for the
+/// regression test below. Source: `offline-tts-pocket-impl.h:Generate`.
+#[cfg(test)]
+const SHERPA_ONNX_FRAMES_AFTER_EOS_DEFAULT: i32 = 3;
+
 // ── ONNX file names (five Pocket TTS sessions plus two JSON tables) ───────────
 
 const FILE_LM_MAIN: &str = "lm_main.int8.onnx";
@@ -220,33 +228,43 @@ pub fn load_text_to_speech(model_dir: &str) -> Result<PocketTts, String> {
 // ── Prompt preparation ────────────────────────────────────────────────────────
 
 /// Result of [`prepare_pocket_prompt`]: a synthesizer-ready prompt plus the
-/// per-call generation hints derived from the original text.
+/// per-call generation overrides derived from the original text.
+///
+/// `None` for either override means "leave sherpa-onnx's documented default
+/// in place". The pipeline only sets `max_frames` (and only for short
+/// padded inputs) so it can bound the original "monster breathing" runaway
+/// without disturbing the rest of the LM sampling envelope.
 #[derive(Debug, Clone, PartialEq)]
 pub(crate) struct PreparedPrompt {
     /// Text to hand to `OfflineTts::generate_with_config`. Capitalized,
     /// punctuation-terminated, and (for short inputs) left-padded with spaces.
     pub text: String,
-    /// Value to pass via `GenerationConfig.extra["frames_after_eos"]`.
-    pub frames_after_eos: i32,
-    /// Value to pass via `GenerationConfig.extra["max_frames"]`. Adaptive to
-    /// text length — short prompts get a much tighter cap to prevent runaway
-    /// "monster breathing" generation when the EOS logit fails to fire.
-    pub max_frames: i32,
+    /// Value to pass via `GenerationConfig.extra["max_frames"]`, or `None` to
+    /// keep the upstream default of 500 LM steps. We only override on short
+    /// padded prompts where we have a tight expectation on output length.
+    pub max_frames: Option<i32>,
 }
 
-/// Mirror of upstream `pocket_tts.models.tts_model.prepare_text_prompt` plus
-/// `_estimate_max_gen_len`. Sherpa-onnx's C++ Pocket TTS impl does not run
-/// these preparation steps, so short / unpunctuated / lowercase inputs can
-/// trigger up to 40 s of runaway generation when the EOS logit never crosses
-/// its threshold. We replicate the upstream Python recipe here:
+/// Mirror of the *text-preparation* half of upstream
+/// `pocket_tts.models.tts_model.prepare_text_prompt`. Sherpa-onnx's C++
+/// Pocket TTS impl does not run these preparation steps, so short /
+/// unpunctuated / lowercase inputs can trigger up to 40 s of runaway
+/// generation when the EOS logit never crosses its threshold. We replicate
+/// the upstream Python recipe here:
 ///
 /// 1. Collapse interior whitespace (already done by `preprocess_for_tts`, but
 ///    cheap to re-check after sentence splitting).
 /// 2. Capitalize the first letter.
 /// 3. Append `.` if the text doesn't end in punctuation.
 /// 4. If fewer than five words, prepend `SHORT_PROMPT_PAD_SPACES` spaces and
-///    bump `frames_after_eos` from 1 → 3.
-/// 5. Compute an adaptive `max_frames` from the (post-padding) word count.
+///    return a tight [`SHORT_PROMPT_MAX_FRAMES`] cap so the LM can't run
+///    away if EOS still doesn't fire.
+///
+/// We do **not** override `frames_after_eos` — sherpa-onnx's default of 3
+/// is what we want. An earlier version set it to 1 on long inputs, which
+/// clipped the leading audio of multi-clause sentences ("first 'yep' is
+/// just static" regression). The `build_extra_*` tests below (notably
+/// `build_extra_never_lowers_frames_after_eos_for_any_word_count`) lock this in.
 ///
 /// Returns `None` only if the input is empty after trimming — caller should
 /// skip synthesis in that case.
@@ -298,39 +316,41 @@ pub(crate) fn prepare_pocket_prompt(input: &str) -> Option<PreparedPrompt> {
     let word_count = cleaned.split_whitespace().count();
     let is_short = word_count <= SHORT_PROMPT_WORD_THRESHOLD;
 
-    let final_text = if is_short {
+    let (final_text, max_frames) = if is_short {
         let mut padded = String::with_capacity(cleaned.len() + SHORT_PROMPT_PAD_SPACES);
         for _ in 0..SHORT_PROMPT_PAD_SPACES {
             padded.push(' ');
         }
         padded.push_str(&cleaned);
-        padded
+        (padded, Some(SHORT_PROMPT_MAX_FRAMES))
     } else {
-        cleaned
+        // For everything ≥5 words, fall back to upstream defaults. Overriding
+        // these is what caused the "first 'yep' is static" regression — the
+        // upstream LM has been tuned for `frames_after_eos = 3` and
+        // `max_frames = 500`, and there's no clear win in second-guessing.
+        (cleaned, None)
     };
 
-    let frames_after_eos = if is_short { 3 } else { 1 };
-    let max_frames = estimate_max_frames(word_count);
-
     Some(PreparedPrompt {
         text: final_text,
-        frames_after_eos,
         max_frames,
     })
 }
 
-/// Convert a word count into a Mimi-frame cap, matching upstream
-/// `_estimate_max_gen_len`. We use words as a sentencepiece-token proxy: real
-/// SP tokenization runs ~1.2–1.5 tokens/word for English, which the
-/// `GEN_SECONDS_PADDING` slack absorbs. Saturates at
-/// `MAX_FRAMES_HARD_CEILING` so we never *raise* the upstream default.
-fn estimate_max_frames(word_count: usize) -> i32 {
-    // Treat each word as ~1.3 tokens — within the slack envelope but a touch
-    // generous so we don't truncate genuine short utterances.
-    let approx_tokens = word_count as f32 * 1.3;
-    let gen_len_sec = approx_tokens / TOKENS_PER_SECOND_ESTIMATE + GEN_SECONDS_PADDING;
-    let frames = (gen_len_sec * MIMI_FRAME_RATE).ceil() as i32;
-    frames.clamp(1, MAX_FRAMES_HARD_CEILING)
+/// Build the `GenerationConfig.extra` HashMap from a [`PreparedPrompt`].
+///
+/// Centralised so the regression test below can assert that we **never**
+/// emit a `frames_after_eos` override — the previous attempt to override
+/// that knob (setting it to 1 for ≥5-word inputs) clipped the leading
+/// audio of multi-clause sentences (the "first 'yep' is static" bug on
+/// 2026-05-18). The upstream sherpa-onnx default of 3 is what we want, and
+/// the right way to keep it is to not set it at all.
+fn build_generation_extra(prepared: &PreparedPrompt) -> Option<HashMap<String, serde_json::Value>> {
+    prepared.max_frames.map(|mf| {
+        let mut h: HashMap<String, serde_json::Value> = HashMap::with_capacity(1);
+        h.insert("max_frames".to_string(), serde_json::Value::from(mf));
+        h
+    })
 }
 
 impl PocketTts {
@@ -358,19 +378,12 @@ impl PocketTts {
         };
 
         // Per-call generation hints sherpa-onnx forwards to
-        // `offline-tts-pocket-impl.h`. `frames_after_eos` is bumped for short
-        // prompts to give the model trailing room to gracefully decay; the
-        // adaptive `max_frames` is the safety net that bounds runaway
-        // generation when EOS never fires.
-        let mut extra: HashMap<String, serde_json::Value> = HashMap::with_capacity(2);
-        extra.insert(
-            "frames_after_eos".to_string(),
-            serde_json::Value::from(prepared.frames_after_eos),
-        );
-        extra.insert(
-            "max_frames".to_string(),
-            serde_json::Value::from(prepared.max_frames),
-        );
+        // `offline-tts-pocket-impl.h`. We only override `max_frames`, and
+        // only for short padded prompts where we have a tight expectation
+        // on output length — that bounds the original runaway without
+        // disturbing the rest of the LM sampling envelope. See
+        // `prepare_pocket_prompt` docs for the regression history.
+        let extra = build_generation_extra(&prepared);
 
         let cfg = GenerationConfig {
             speed,
@@ -378,7 +391,7 @@ impl PocketTts {
             silence_scale: SYNTH_SILENCE_SCALE,
             reference_audio: Some(style.samples.clone()),
             reference_sample_rate: style.sample_rate,
-            extra: Some(extra),
+            extra,
             ..Default::default()
         };
 
@@ -425,13 +438,15 @@ mod tests {
     fn prepare_prompt_pads_and_capitalizes_one_word() {
         // The "yep" case Tyler hit in production — bare lowercase one-word
         // utterance with no punctuation. Must be padded, capitalized, and
-        // terminated.
+        // terminated, with a tight `max_frames` cap to bound runaway gen.
         let out = prepare_pocket_prompt("yep").expect("non-empty");
         let pad = " ".repeat(SHORT_PROMPT_PAD_SPACES);
         assert_eq!(out.text, format!("{pad}Yep."));
-        assert_eq!(out.frames_after_eos, 3);
-        // 1 word → very low frame cap (well under the 500 hard ceiling).
-        assert!(out.max_frames < MAX_FRAMES_HARD_CEILING);
+        assert_eq!(out.max_frames, Some(SHORT_PROMPT_MAX_FRAMES));
+        assert!(
+            SHORT_PROMPT_MAX_FRAMES < SHERPA_ONNX_MAX_FRAMES_DEFAULT,
+            "short cap must be tighter than the upstream default"
+        );
     }
 
     #[test]
@@ -445,20 +460,24 @@ mod tests {
 
     #[test]
     fn prepare_prompt_threshold_is_inclusive_at_four_words() {
-        // 4 words = short (padded); 5 words = long (not padded).
+        // 4 words = short (padded + tight max_frames); 5 words = long
+        // (no padding, no overrides — upstream defaults stand).
         let four = prepare_pocket_prompt("one two three four").expect("non-empty");
         assert!(
             four.text.starts_with(' '),
             "four-word input should be padded"
         );
-        assert_eq!(four.frames_after_eos, 3);
+        assert_eq!(four.max_frames, Some(SHORT_PROMPT_MAX_FRAMES));
 
         let five = prepare_pocket_prompt("one two three four five").expect("non-empty");
         assert!(
             !five.text.starts_with(' '),
             "five-word input should NOT be padded"
         );
-        assert_eq!(five.frames_after_eos, 1);
+        assert_eq!(
+            five.max_frames, None,
+            "long inputs must leave sherpa-onnx's max_frames default in place"
+        );
     }
 
     #[test]
@@ -466,7 +485,7 @@ mod tests {
         let long = "This is a longer sentence that the model should handle just fine.";
         let out = prepare_pocket_prompt(long).expect("non-empty");
         assert!(!out.text.starts_with(' '));
-        assert_eq!(out.frames_after_eos, 1);
+        assert_eq!(out.max_frames, None);
         assert!(out.text.ends_with('.'));
     }
 
@@ -493,36 +512,86 @@ mod tests {
         assert!(out.text.contains("Дa."));
     }
 
-    // ── estimate_max_frames ──────────────────────────────────────────────────
+    // ── build_generation_extra ───────────────────────────────────────────────
+    //
+    // These tests pin down a behaviour we've now regressed twice on:
+    //   1) Not padding/punctuating short inputs → 40 s of "monster breathing"
+    //      (pre-773a2a1).
+    //   2) Setting `frames_after_eos = 1` on long inputs → clipped leading
+    //      audio of multi-clause sentences, e.g. "Yep, I can hear you. …"
+    //      came out as a static burst (the 773a2a1 regression Tyler hit on
+    //      2026-05-18 ~14:30 UTC).
+    //
+    // The contract we enforce going forward: we **only** override
+    // `max_frames`, and only for ≤4-word inputs. Every other knob is left
+    // at sherpa-onnx's documented default (notably `frames_after_eos = 3`).
 
     #[test]
-    fn estimate_max_frames_is_tight_for_short_input() {
-        // 1 word: 1 * 1.3 / 3.0 + 2.0 ≈ 2.43s ≈ 31 frames. Well below 500.
-        let frames = estimate_max_frames(1);
-        assert!(frames > 0);
-        assert!(frames < 50, "got {frames}");
+    fn build_extra_short_prompt_sets_only_max_frames() {
+        let prepared = prepare_pocket_prompt("yep").expect("non-empty");
+        let extra = build_generation_extra(&prepared).expect("short prompts get extra");
+        // Exactly one key — `max_frames` — and nothing else.
+        assert_eq!(extra.len(), 1, "extra has unexpected keys: {extra:?}");
+        assert_eq!(
+            extra.get("max_frames"),
+            Some(&serde_json::Value::from(SHORT_PROMPT_MAX_FRAMES))
+        );
+        assert!(
+            !extra.contains_key("frames_after_eos"),
+            "frames_after_eos must never be set — upstream default of {SHERPA_ONNX_FRAMES_AFTER_EOS_DEFAULT} is what we want"
+        );
     }
 
     #[test]
-    fn estimate_max_frames_saturates_at_ceiling() {
-        // 5000 words ≈ a runaway prompt; must clamp at the hard ceiling.
-        assert_eq!(estimate_max_frames(5_000), MAX_FRAMES_HARD_CEILING);
+    fn build_extra_long_prompt_is_none() {
+        // ≥5 words: no extras at all. This is the key fix for the "first
+        // 'yep' in 'Yep, I can hear you. …' is static" regression — we
+        // were previously forcing `frames_after_eos = 1` on this path.
+        let prepared = prepare_pocket_prompt("Yep, I can hear you.").expect("non-empty");
+        assert_eq!(
+            build_generation_extra(&prepared),
+            None,
+            "long prompts must not override any LM knob"
+        );
     }
 
     #[test]
-    fn estimate_max_frames_grows_with_word_count() {
-        let small = estimate_max_frames(2);
-        let medium = estimate_max_frames(20);
-        let large = estimate_max_frames(100);
-        assert!(small < medium);
-        assert!(medium < large);
-        assert!(large <= MAX_FRAMES_HARD_CEILING);
+    fn build_extra_never_lowers_frames_after_eos_for_any_word_count() {
+        // Sweep a range of prompt lengths and assert the `extra` map (when
+        // present) never carries a `frames_after_eos` override that's lower
+        // than the upstream sherpa-onnx default. Implemented as a structural
+        // check — we just never set the key — but worth a property test in
+        // case someone reintroduces the override in the future.
+        let prompts: &[&str] = &[
+            "hi",
+            "hi there",
+            "yes please",
+            "one two three four",
+            "one two three four five",
+            "a slightly longer reply, hopefully fine",
+            "This is a multi-clause sentence. It has two parts.",
+            "really really really really really long prompt with lots of words just to be sure",
+        ];
+        for &p in prompts {
+            let prepared = prepare_pocket_prompt(p).expect("non-empty");
+            if let Some(extra) = build_generation_extra(&prepared) {
+                if let Some(v) = extra.get("frames_after_eos") {
+                    let n = v.as_i64().expect("frames_after_eos should be int");
+                    assert!(
+                        n >= SHERPA_ONNX_FRAMES_AFTER_EOS_DEFAULT as i64,
+                        "prompt {p:?} set frames_after_eos={n}, below upstream default of {SHERPA_ONNX_FRAMES_AFTER_EOS_DEFAULT}"
+                    );
+                }
+            }
+        }
     }
 
     #[test]
-    fn estimate_max_frames_never_zero() {
-        // Sanity: even a 0-word prompt yields ≥1 frame so we never ask the
-        // engine for an impossible cap.
-        assert!(estimate_max_frames(0) >= 1);
+    fn short_prompt_max_frames_is_below_upstream_default() {
+        // Sanity: the override only ever *lowers* the cap, never raises it.
+        assert!(SHORT_PROMPT_MAX_FRAMES < SHERPA_ONNX_MAX_FRAMES_DEFAULT);
+        // …and is still large enough for a one-to-four-word reply. At Mimi's
+        // 12.5 Hz frame rate, 100 frames = 8 s, which is roomy.
+        assert!(SHORT_PROMPT_MAX_FRAMES >= 50, "would risk truncation");
     }
 }

From f570ec0912ddc661a2eff7d88e2b769484552ad8 Mon Sep 17 00:00:00 2001
From: npub1cc3ha7z055mu0rwwu7806t2wt8mj3pvu0uv5mfp2c50dahaqhczshdalg6
 <c6237ef84fa537c78dcee78efd2d4e59f728859c7f194da42ac51ededfa0be05@sprout-oss.stage.blox.sqprod.co>
Date: Mon, 18 May 2026 11:33:38 -0400
Subject: [PATCH 05/10] huddle(tts): drop leading fade-in to preserve consonant
 onsets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Symptom (reported 2026-05-18): "the first little sound or two in a
sentence is kind of getting skipped over." Affects every sentence,
not just the first one, and is independent of sentence length.

Root cause: `apply_fades` applied an 8 ms linear fade-in (192 samples
at 24 kHz) to the start of every synthesised sentence. Probing Pocket
TTS output across four prompts (Y/H/W/T onsets — see
`examples/pocket_onset_probe.rs`) shows real audio energy inside the
first millisecond:

  prompt                       | samples[0] | peak@1ms  rms@1ms
  ─────────────────────────────────────────────────────────────
  Yep, I can hear you.         |   0.00193  |  0.0331   0.0235
  Hello there friend.          |   0.00185  |  0.0288   0.0191
  What can I help with?        |   0.00180  |  0.0358   0.0242
  Try this experiment now.     |   0.00189  |  0.0197   0.0139

For 'Yep' and 'What' the first-1ms RMS is *equal to or greater than*
the first-5ms RMS — the consonant attack peaks inside the very window
the fade was nuking. A 0→1 linear ramp attenuated those onset samples
by ≥6 dB over the first 4 ms, which is exactly what Tyler heard as
"swallowed sounds".

`samples[0]` ≈ 0.0019 (≈ −54 dBFS) is far below any audible
DC-jump-click threshold, so removing the fade-in does not introduce
clicks. Fade-out is retained because end-of-sentence cuts *do* create
audible clicks when a non-zero waveform terminates abruptly.

Changes (3 files, +247/−25):

- `huddle/tts.rs`:
  - Rename `apply_fades` → `apply_fade_out`. Body removes the leading
    fade loop and operates on `&mut [f32]` instead of `&mut Vec<f32>`.
  - New const `FIRST_APPEND_LEAD_IN_SAMPLES = 480` (20 ms) and a
    single-shot `player.append(zeros)` at the `first_append` site, so
    the OS audio device / rodio mixer gets a quiet ramp-up window
    *without* scaling any real synthesis samples. Applied once per
    utterance — sentence boundaries continue to use
    `INTER_SENTENCE_SILENCE` (100 ms) and don't stack on this cushion.
  - New regression test
    `apply_fade_out_does_not_touch_leading_samples` locks in
    `samples[0..FADE_OUT_SAMPLES]` are byte-equal to input. Will
    fail loudly if anyone ever reintroduces a leading fade.
  - `first_append_lead_in_is_sane` pins the 20 ms × 24 kHz = 480
    constant and documents why that range is reasonable.
  - Existing `apply_fades_*` tests renamed and updated; +2 net tests
    (24 → 26 in tts.rs; 317 → 319 lib-wide).

- `examples/pocket_onset_probe.rs` (new, 148 lines): synthesises the
  four probe prompts, dumps per-prompt onset stats (samples[0],
  peak/RMS @ 1ms/5ms/20ms), and writes raw WAVs to /tmp for offline
  inspection. Documents the measurement that justifies removing the
  fade-in; runs against the same `/tmp/pocket-tts-bench` model
  directory `pocket_bench` uses.

- `desktop/scripts/check-file-sizes.mjs`: bump `tts.rs` override
  1130 → 1210 with updated description.

Verification before push:

- `cargo test --lib` (full) → 319/319 pass.
- `cargo fmt --check` clean.
- `cargo check` (desktop tauri crate) clean.
- `pnpm check` (biome + file-sizes) clean.
- Manual A/B not yet done from the worktree — Tyler will hear the
  result on `cargo run` after pull.

Discussion: thread root c0f5988e in #sprout-desktop-lighter-tts
(initial diagnosis, Max's review of approach, probe data).

Signed-off-by: Tyler Longwell <tlongwell@squareup.com>
Signed-off-by: npub1cc3ha7z055mu0rwwu7806t2wt8mj3pvu0uv5mfp2c50dahaqhczshdalg6 <c6237ef84fa537c78dcee78efd2d4e59f728859c7f194da42ac51ededfa0be05@sprout-oss.stage.blox.sqprod.co>
---
 desktop/scripts/check-file-sizes.mjs          |   2 +-
 .../src-tauri/examples/pocket_onset_probe.rs  | 148 ++++++++++++++++++
 desktop/src-tauri/src/huddle/tts.rs           | 122 ++++++++++++---
 3 files changed, 247 insertions(+), 25 deletions(-)
 create mode 100644 desktop/src-tauri/examples/pocket_onset_probe.rs

diff --git a/desktop/scripts/check-file-sizes.mjs b/desktop/scripts/check-file-sizes.mjs
index 8a514ae2b..40c30e0a7 100644
--- a/desktop/scripts/check-file-sizes.mjs
+++ b/desktop/scripts/check-file-sizes.mjs
@@ -72,7 +72,7 @@ const overrides = new Map([
   ["src-tauri/src/huddle/pocket.rs", 620], // Pocket TTS engine wrapper + prepare_pocket_prompt (capitalize/punctuate/pad short inputs, mirror upstream pocket-tts prepare_text_prompt) + build_generation_extra (only overrides max_frames, and only for ≤4-word inputs, to bound runaway "monster breathing" generation without clipping multi-clause sentences) + 12 unit tests including regressions for the static-burst-on-multi-clause-sentence bug
   ["src-tauri/src/huddle/preprocessing.rs", 670], // TTS text preprocessing pipeline + unified split_sentences + int_to_words 0-999999 + URL trailing punctuation preservation + 23 unit tests
   ["src-tauri/src/huddle/relay_api.rs", 520], // audio relay recv task + per-peer frame counting for remote human TTS interrupt + NIP-98 channel member query
-  ["src-tauri/src/huddle/tts.rs", 1130], // TTS pipeline + session warmup + cancel/shutdown handling + apply_fades + normalize_for_playback (per-sentence peak normalization to -6 dBFS with MAX_GAIN cap) + 24 unit tests (18 interrupt + 6 fade/normalize)
+  ["src-tauri/src/huddle/tts.rs", 1210], // TTS pipeline + session warmup + cancel/shutdown handling + apply_fade_out (fade-out only — leading fade removed 2026-05-18 after onset-attenuation regression measured in examples/pocket_onset_probe.rs) + FIRST_APPEND_LEAD_IN_SAMPLES (20 ms zero cushion at the first-append site to give the OS audio device a quiet ramp-up window without scaling real samples) + normalize_for_playback (per-sentence peak normalization to -6 dBFS with MAX_GAIN cap) + 26 unit tests (18 interrupt + 8 fade-out/first-append-lead-in/normalize)
   ["src-tauri/src/relay.rs", 510], // +4 lines for NIP-OA auth tag injection in profile sync (build_profile_event) + verification test
   ["src-tauri/src/commands/pairing.rs", 600], // NIP-AB pairing actor: 3 Tauri commands + background WS task + NIP-42 auth + NIP-43 probe + event parsing helpers
   ["src-tauri/src/lib.rs", 715], // +4 lines for PairingHandle managed state + 3 pairing command registrations
diff --git a/desktop/src-tauri/examples/pocket_onset_probe.rs b/desktop/src-tauri/examples/pocket_onset_probe.rs
new file mode 100644
index 000000000..348cd8d83
--- /dev/null
+++ b/desktop/src-tauri/examples/pocket_onset_probe.rs
@@ -0,0 +1,148 @@
+//! Onset-attenuation probe for Pocket TTS.
+//!
+//! Synthesises a handful of short sentences and dumps per-sentence onset
+//! statistics (samples[0], 1ms/5ms/20ms peak + RMS) so we can decide whether
+//! an 8 ms leading fade-in (as the former `apply_fades` applied) masks real audio.
+//!
+//! Also writes the raw (un-faded, un-normalised) audio of each sentence to
+//! /tmp so they can be inspected in Audacity / aplay without rodio in the
+//! loop.
+//!
+//! Run with model files in /tmp/pocket-tts-bench (override with arg 1):
+//!   cargo run --release --example pocket_onset_probe
+//!   cargo run --release --example pocket_onset_probe /path/to/pocket-tts
+
+use std::path::PathBuf;
+
+use sherpa_onnx::{
+    self, GenerationConfig, OfflineTts, OfflineTtsConfig, OfflineTtsModelConfig,
+    OfflineTtsPocketModelConfig, Wave,
+};
+
+const SAMPLE_RATE: u32 = 24_000;
+
+/// Test prompts chosen to span different onsets:
+/// - palatal glide 'Y' (soft onset)
+/// - voiceless fricative 'H' (very soft onset)
+/// - labio-velar glide 'W' (medium onset)
+/// - voiceless stop 'T' (hard onset)
+const PROMPTS: &[&str] = &[
+    "Yep, I can hear you.",
+    "Hello there friend.",
+    "What can I help with?",
+    "Try this experiment now.",
+];
+
+fn main() {
+    let model_dir = std::env::args()
+        .nth(1)
+        .unwrap_or_else(|| "/tmp/pocket-tts-bench".to_string());
+    eprintln!("Model dir: {model_dir}");
+
+    let dir = PathBuf::from(&model_dir);
+    let p = |name: &str| dir.join(name).to_string_lossy().into_owned();
+
+    let mut cfg = OfflineTtsConfig::default();
+    cfg.model = OfflineTtsModelConfig {
+        pocket: OfflineTtsPocketModelConfig {
+            lm_main: Some(p("lm_main.int8.onnx")),
+            lm_flow: Some(p("lm_flow.int8.onnx")),
+            encoder: Some(p("encoder.onnx")),
+            decoder: Some(p("decoder.int8.onnx")),
+            text_conditioner: Some(p("text_conditioner.onnx")),
+            vocab_json: Some(p("vocab.json")),
+            token_scores_json: Some(p("token_scores.json")),
+            voice_embedding_cache_capacity: 16,
+        },
+        num_threads: 1,
+        debug: false,
+        ..Default::default()
+    };
+    let engine = OfflineTts::create(&cfg).expect("engine create");
+
+    let voice_path = dir.join("reference_sample.wav");
+    let wave = Wave::read(voice_path.to_str().unwrap()).expect("voice WAV");
+    let voice_samples = wave.samples().to_vec();
+    let voice_sr = wave.sample_rate();
+
+    // Warmup so we're not measuring cold-call jitter.
+    {
+        let cfg = GenerationConfig {
+            speed: 1.05,
+            num_steps: 1,
+            silence_scale: 0.0,
+            reference_audio: Some(voice_samples.clone()),
+            reference_sample_rate: voice_sr,
+            ..Default::default()
+        };
+        let _ = engine.generate_with_config("warmup.", &cfg, None::<fn(&[f32], f32) -> bool>);
+    }
+
+    println!(
+        "{:<28} | {:>10} | {:>10} {:>10} | {:>10} {:>10} | {:>10} {:>10}",
+        "prompt",
+        "samples[0]",
+        "peak@1ms",
+        "rms@1ms",
+        "peak@5ms",
+        "rms@5ms",
+        "peak@20ms",
+        "rms@20ms"
+    );
+    println!("{}", "-".repeat(120));
+
+    for prompt in PROMPTS {
+        // NOTE: production prompt-prep (`prepare_pocket_prompt`) is NOT applied
+        // here. The prompts are already capitalised and punctuated, but the
+        // ≤4-word ones would also get 8 leading pad spaces + a `max_frames` cap.
+        let cfg = GenerationConfig {
+            speed: 1.05,
+            num_steps: 1,
+            silence_scale: 0.0,
+            reference_audio: Some(voice_samples.clone()),
+            reference_sample_rate: voice_sr,
+            ..Default::default()
+        };
+        let out = engine
+            .generate_with_config(prompt, &cfg, None::<fn(&[f32], f32) -> bool>)
+            .expect("synth");
+        let samples = out.samples();
+
+        let n_1ms = (SAMPLE_RATE as f32 * 0.001) as usize;
+        let n_5ms = (SAMPLE_RATE as f32 * 0.005) as usize;
+        let n_20ms = (SAMPLE_RATE as f32 * 0.020) as usize;
+
+        let stats = |range: &[f32]| -> (f32, f32) {
+            if range.is_empty() {
+                return (0.0, 0.0);
+            }
+            let peak = range.iter().fold(0.0_f32, |a, &x| a.max(x.abs()));
+            let sumsq: f32 = range.iter().map(|x| x * x).sum();
+            let rms = (sumsq / range.len() as f32).sqrt();
+            (peak, rms)
+        };
+
+        let first = samples.first().copied().unwrap_or(0.0);
+        let (p1, r1) = stats(&samples[..n_1ms.min(samples.len())]);
+        let (p5, r5) = stats(&samples[..n_5ms.min(samples.len())]);
+        let (p20, r20) = stats(&samples[..n_20ms.min(samples.len())]);
+
+        println!(
+            "{:<28} | {:>10.6} | {:>10.6} {:>10.6} | {:>10.6} {:>10.6} | {:>10.6} {:>10.6}",
+            prompt, first, p1, r1, p5, r5, p20, r20
+        );
+
+        // Dump raw WAV for inspection.
+        let safe: String = prompt
+            .chars()
+            .map(|c| if c.is_ascii_alphanumeric() { c } else { '_' })
+            .collect();
+        let out_path = format!("/tmp/pocket_onset_{}.wav", &safe[..safe.len().min(24)]);
+        let _ = sherpa_onnx::write(&out_path, samples, SAMPLE_RATE as i32);
+        eprintln!(
+            "  → wrote {out_path} ({} samples = {:.3} s)",
+            samples.len(),
+            samples.len() as f32 / SAMPLE_RATE as f32
+        );
+    }
+}
diff --git a/desktop/src-tauri/src/huddle/tts.rs b/desktop/src-tauri/src/huddle/tts.rs
index 843b76ae5..78c6b7975 100644
--- a/desktop/src-tauri/src/huddle/tts.rs
+++ b/desktop/src-tauri/src/huddle/tts.rs
@@ -70,9 +70,24 @@ const TARGET_PEAK: f32 = 0.501_187_2; // 10f32.powf(-6.0 / 20.0)
 /// still catching pathological near-silent buffers.
 const MAX_GAIN: f32 = 8.0;
 
-/// Fade in/out length in samples (8ms at 24kHz ≈ 192 samples).
-/// Eliminates clicks/pops at sentence boundaries.
-const FADE_SAMPLES: usize = (SAMPLE_RATE as f64 * 0.008) as usize;
+/// Fade-out length in samples (8 ms at 24 kHz ≈ 192 samples).
+///
+/// Applied only at the *end* of each synthesised sentence to eliminate the
+/// click that would otherwise occur when a non-zero waveform terminates
+/// abruptly. **No fade-in is applied** — see `apply_fade_out` for the
+/// rationale and `examples/pocket_onset_probe.rs` for the measurement that
+/// motivated removing the leading fade.
+const FADE_OUT_SAMPLES: usize = (SAMPLE_RATE as f64 * 0.008) as usize;
+
+/// Length of the zero-sample cushion prepended to the very first audio
+/// buffer of an utterance, so the OS audio device / rodio mixer has a
+/// fully-quiet ramp-up window before the real onset hits.
+///
+/// Applied at the `first_append` site only — *not* per sentence — so it
+/// doesn't stack on top of `INTER_SENTENCE_SILENCE` at sentence boundaries.
+/// 20 ms ≈ 480 samples is enough to cover a CoreAudio buffer turnover
+/// without being audible as latency.
+const FIRST_APPEND_LEAD_IN_SAMPLES: usize = (SAMPLE_RATE as f64 * 0.020) as usize;
 
 /// Sentence-by-sentence synthesis — keeps first-sentence latency low and lets
 /// playback of sentence N overlap with synthesis of sentence N+1 (see the
@@ -385,7 +400,19 @@ fn tts_worker(
             match engine.synth_chunk(text, "en", &style, SYNTH_STEPS, SYNTH_SPEED) {
                 Ok(samples) if !samples.is_empty() => {
                     let mut boosted = normalize_for_playback(samples);
-                    apply_fades(&mut boosted);
+                    // Fade-out only — fading-in would attenuate the consonant
+                    // onset (see `apply_fade_out` docstring + the
+                    // 2026-05-18 "first little sound is missing" regression).
+                    apply_fade_out(&mut boosted);
+                    if first_append {
+                        // Pre-pad the very first buffer of an utterance with
+                        // a brief silence so the OS audio device / rodio
+                        // mixer has a fully-quiet ramp-up window before the
+                        // first real sample. Applied once per utterance —
+                        // sentence boundaries use INTER_SENTENCE_SILENCE.
+                        let lead_in = vec![0.0f32; FIRST_APPEND_LEAD_IN_SAMPLES];
+                        player.append(SamplesBuffer::new(channels, rate, lead_in));
+                    }
                     player.append(SamplesBuffer::new(channels, rate, boosted));
                     // Insert inter-sentence silence after each synthesized chunk.
                     player.append(SamplesBuffer::new(channels, rate, silence_buf.clone()));
@@ -474,18 +501,31 @@ fn normalize_for_playback(samples: Vec<f32>) -> Vec<f32> {
         .collect()
 }
 
-/// Apply a short linear fade-in at the start and fade-out at the end of `samples`.
+/// Apply a short linear fade-out at the *end* of `samples`.
+///
+/// Uses `FADE_OUT_SAMPLES` (8 ms) or half the buffer length, whichever is
+/// smaller. Eliminates the click that occurs when a non-zero waveform
+/// terminates abruptly at a sentence boundary.
 ///
-/// Uses `FADE_SAMPLES` (8ms) or half the buffer length, whichever is smaller.
-/// Eliminates clicks/pops at sentence boundaries.
-fn apply_fades(samples: &mut Vec<f32>) {
+/// # Why no fade-in
+///
+/// An earlier revision (before 2026-05-18) symmetrically faded *in* over the same
+/// 8 ms window. That swallowed the leading consonant attack on every
+/// sentence — Pocket TTS produces real audio energy inside the first
+/// millisecond (RMS ≈ 0.02, peak ≈ 0.03 measured across four prompts in
+/// `examples/pocket_onset_probe.rs`), and a linear 0→1 ramp over 192 samples
+/// scales those onset samples by ≤50 % for the first ~4 ms. The result was
+/// the "first little sound or two is missing" regression heard on
+/// 2026-05-18.
+///
+/// The first sample of Pocket output measures ≈ 0.0018 (≈ −54 dBFS) — well
+/// below the threshold at which a DC-jump would be audible as a click — so
+/// no fade-in is needed. The OS audio device gets its quiet ramp-up window
+/// from `FIRST_APPEND_LEAD_IN_SAMPLES` instead, applied once per utterance
+/// at the `first_append` site.
+fn apply_fade_out(samples: &mut [f32]) {
     let len = samples.len();
-    let fade = FADE_SAMPLES.min(len / 2);
-    // Fade in: ramp from 0 → 1 over `fade` samples.
-    for i in 0..fade {
-        samples[i] *= i as f32 / fade as f32;
-    }
-    // Fade out: ramp from 1 → 0 over the last `fade` samples.
+    let fade = FADE_OUT_SAMPLES.min(len / 2);
     for i in 0..fade {
         samples[len - 1 - i] *= i as f32 / fade as f32;
     }
@@ -1036,31 +1076,65 @@ mod tests {
         );
     }
 
-    // ── apply_fades tests ─────────────────────────────────────────────────────
+    // ── apply_fade_out tests ──────────────────────────────────────────────────
 
+    /// The fade-out half of the old `apply_fades`: the last sample is silenced
+    /// and sample 5 (mid-buffer) stays above half amplitude.
     #[test]
-    fn apply_fades_short_buffer() {
+    fn apply_fade_out_short_buffer() {
         let mut samples = vec![1.0f32; 10];
-        apply_fades(&mut samples);
-        assert_eq!(samples[0], 0.0);
-        assert_eq!(samples[9], 0.0);
-        assert!(samples[5] > 0.5);
+        apply_fade_out(&mut samples);
+        assert_eq!(samples[9], 0.0, "last sample should be silenced");
+        assert!(samples[5] > 0.5, "mid-buffer should be near-untouched");
     }
 
+    /// REGRESSION (2026-05-18): the *first* samples must NOT be attenuated.
+    /// An earlier `apply_fades` symmetrically faded in over 8 ms which
+    /// swallowed the consonant onset of every sentence.
+    /// Lock in: samples[0..FADE_OUT_SAMPLES] are byte-equal to input.
     #[test]
-    fn apply_fades_empty_buffer() {
+    fn apply_fade_out_does_not_touch_leading_samples() {
+        // Input long enough that fade window doesn't overlap (≫ 2× fade).
+        let n = FADE_OUT_SAMPLES * 4;
+        let input: Vec<f32> = (0..n).map(|i| 0.5 + (i as f32) * 1e-4).collect();
+        let mut samples = input.clone();
+        apply_fade_out(&mut samples);
+        for i in 0..FADE_OUT_SAMPLES {
+            assert_eq!(
+                samples[i], input[i],
+                "leading sample {i} must not be attenuated (was {} → {})",
+                input[i], samples[i]
+            );
+        }
+        // And the trailing fade still works.
+        assert_eq!(samples[n - 1], 0.0);
+    }
+
+    #[test]
+    fn apply_fade_out_empty_buffer() {
         let mut samples: Vec<f32> = vec![];
-        apply_fades(&mut samples);
+        apply_fade_out(&mut samples);
         assert!(samples.is_empty());
     }
 
     #[test]
-    fn apply_fades_single_sample() {
+    fn apply_fade_out_single_sample() {
+        // fade = min(FADE_OUT_SAMPLES, len/2) = 0, so nothing changes.
         let mut samples = vec![1.0f32];
-        apply_fades(&mut samples);
+        apply_fade_out(&mut samples);
         assert_eq!(samples[0], 1.0);
     }
 
+    /// Sanity-check the first-append cushion length: 20 ms at 24 kHz must
+    /// land at exactly 480 samples. This is a const computation, so the
+    /// real value of this test is documenting *why* 20 ms was chosen — it
+    /// covers a typical CoreAudio buffer turnover (256–1024 samples)
+    /// without being audible as user-facing latency.
+    #[test]
+    fn first_append_lead_in_is_sane() {
+        assert_eq!(FIRST_APPEND_LEAD_IN_SAMPLES, 480, "20 ms × 24 kHz");
+    }
+
     // ── normalize_for_playback tests ──────────────────────────────────────────
 
     /// A quiet buffer (peak well under TARGET_PEAK) is scaled up to TARGET_PEAK.

From 93f4d1c75d9efb2d0152e8573db7bfd314935193 Mon Sep 17 00:00:00 2001
From: npub1cc3ha7z055mu0rwwu7806t2wt8mj3pvu0uv5mfp2c50dahaqhczshdalg6
 <c6237ef84fa537c78dcee78efd2d4e59f728859c7f194da42ac51ededfa0be05@sprout-oss.stage.blox.sqprod.co>
Date: Mon, 18 May 2026 11:38:57 -0400
Subject: [PATCH 06/10] huddle(tts): test lead-in pad fires once per utterance
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Follow-up to f570ec0 addressing Max's PR review feedback: the
`FIRST_APPEND_LEAD_IN_SAMPLES` cushion is correctly gated by
`if first_append` in the worker loop, but nothing in the test suite
catches the one failure mode that matters for that pad — a future
refactor that lets the pad fire on every sentence instead of once per
utterance would silently stack 20 ms on top of `INTER_SENTENCE_SILENCE`
at every sentence boundary and audibly slow multi-sentence utterances.

Refactor the append decision into a pure helper and pin the
invariant:

- New `build_sentence_append_plan(first_append, boosted, silence_len)
  -> Vec<Vec<f32>>`: returns [lead_in, audio, inter_silence] on the
  first call (flipping `first_append` to false), or [audio,
  inter_silence] on every subsequent call. The worker loop now calls
  this and iterates the returned buffers, instead of conditionally
  appending inline.

- Three new tests:
  - `lead_in_pad_fires_exactly_once_per_utterance` — pumps 5
    sentences through the plan builder, counts the lead-in buffers,
    asserts exactly 1. The regression test Max specifically asked
    for.
  - `build_sentence_append_plan_flips_first_append` — pins the
    flag-mutation contract.
  - `first_sentence_leading_silence_is_exactly_lead_in` — asserts
    the lead-in is the only leading-silence buffer (no
    inter-sentence silence is emitted before the first audio
    buffer).

The worker-loop call site is now ~7 lines shorter and harder to
break: `was_first` snapshots the flag for the `tts_active.store`
gate, the plan builder owns the rest.

Verification:

- `cargo test --lib` → 322/322 pass (was 319 → +3 new).
- `cargo fmt --check` clean.
- `cargo check` (desktop tauri crate) clean.
- `pnpm check` (biome + file-sizes) clean.

Discussion: thread root c0f5988e in #sprout-desktop-lighter-tts,
specifically Max's messages [11]/[12]/[13] requesting the
once-per-utterance test before merge.

Signed-off-by: Tyler Longwell <tlongwell@squareup.com>
Signed-off-by: npub1cc3ha7z055mu0rwwu7806t2wt8mj3pvu0uv5mfp2c50dahaqhczshdalg6 <c6237ef84fa537c78dcee78efd2d4e59f728859c7f194da42ac51ededfa0be05@sprout-oss.stage.blox.sqprod.co>
---
 desktop/scripts/check-file-sizes.mjs |   2 +-
 desktop/src-tauri/src/huddle/tts.rs  | 151 ++++++++++++++++++++++++---
 2 files changed, 139 insertions(+), 14 deletions(-)

diff --git a/desktop/scripts/check-file-sizes.mjs b/desktop/scripts/check-file-sizes.mjs
index 40c30e0a7..0b33a086a 100644
--- a/desktop/scripts/check-file-sizes.mjs
+++ b/desktop/scripts/check-file-sizes.mjs
@@ -72,7 +72,7 @@ const overrides = new Map([
   ["src-tauri/src/huddle/pocket.rs", 620], // Pocket TTS engine wrapper + prepare_pocket_prompt (capitalize/punctuate/pad short inputs, mirror upstream pocket-tts prepare_text_prompt) + build_generation_extra (only overrides max_frames, and only for ≤4-word inputs, to bound runaway "monster breathing" generation without clipping multi-clause sentences) + 12 unit tests including regressions for the static-burst-on-multi-clause-sentence bug
   ["src-tauri/src/huddle/preprocessing.rs", 670], // TTS text preprocessing pipeline + unified split_sentences + int_to_words 0-999999 + URL trailing punctuation preservation + 23 unit tests
   ["src-tauri/src/huddle/relay_api.rs", 520], // audio relay recv task + per-peer frame counting for remote human TTS interrupt + NIP-98 channel member query
-  ["src-tauri/src/huddle/tts.rs", 1210], // TTS pipeline + session warmup + cancel/shutdown handling + apply_fade_out (fade-out only — leading fade removed 2026-05-18 after onset-attenuation regression measured in examples/pocket_onset_probe.rs) + FIRST_APPEND_LEAD_IN_SAMPLES (20 ms zero cushion at the first-append site to give the OS audio device a quiet ramp-up window without scaling real samples) + normalize_for_playback (per-sentence peak normalization to -6 dBFS with MAX_GAIN cap) + 26 unit tests (18 interrupt + 5 fade-out + 1 first-append-lead-in + 5 normalize)
+  ["src-tauri/src/huddle/tts.rs", 1335], // TTS pipeline + session warmup + cancel/shutdown handling + apply_fade_out (fade-out only — leading fade removed 2026-05-18 after onset-attenuation regression measured in examples/pocket_onset_probe.rs) + FIRST_APPEND_LEAD_IN_SAMPLES + build_sentence_append_plan (pure helper enforcing the lead-in fires exactly once per utterance, not per sentence — see lead_in_pad_fires_exactly_once_per_utterance regression test) + normalize_for_playback (per-sentence peak normalization to -6 dBFS with MAX_GAIN cap) + 29 unit tests (18 interrupt + 5 fade-out + 1 first-append-lead-in + 3 build-sentence-append-plan + 5 normalize)
   ["src-tauri/src/relay.rs", 510], // +4 lines for NIP-OA auth tag injection in profile sync (build_profile_event) + verification test
   ["src-tauri/src/commands/pairing.rs", 600], // NIP-AB pairing actor: 3 Tauri commands + background WS task + NIP-42 auth + NIP-43 probe + event parsing helpers
   ["src-tauri/src/lib.rs", 715], // +4 lines for PairingHandle managed state + 3 pairing command registrations
diff --git a/desktop/src-tauri/src/huddle/tts.rs b/desktop/src-tauri/src/huddle/tts.rs
index 78c6b7975..bcb074018 100644
--- a/desktop/src-tauri/src/huddle/tts.rs
+++ b/desktop/src-tauri/src/huddle/tts.rs
@@ -404,21 +404,21 @@ fn tts_worker(
                     // onset (see `apply_fade_out` docstring + the
                     // 2026-05-18 "first little sound is missing" regression).
                     apply_fade_out(&mut boosted);
-                    if first_append {
-                        // Pre-pad the very first buffer of an utterance with
-                        // a brief silence so the OS audio device / rodio
-                        // mixer has a fully-quiet ramp-up window before the
-                        // first real sample. Applied once per utterance —
-                        // sentence boundaries use INTER_SENTENCE_SILENCE.
-                        let lead_in = vec![0.0f32; FIRST_APPEND_LEAD_IN_SAMPLES];
-                        player.append(SamplesBuffer::new(channels, rate, lead_in));
+
+                    // Decide what to append, including the one-shot lead-in
+                    // pad on the first sentence. Centralised in
+                    // `build_sentence_append_plan` so the "lead-in fires
+                    // exactly once per utterance" invariant is testable
+                    // without a rodio mock. See its docstring + the
+                    // `lead_in_pad_fires_exactly_once_per_utterance` test.
+                    let was_first = first_append;
+                    let plan =
+                        build_sentence_append_plan(&mut first_append, boosted, silence_buf.len());
+                    for buf in plan {
+                        player.append(SamplesBuffer::new(channels, rate, buf));
                     }
-                    player.append(SamplesBuffer::new(channels, rate, boosted));
-                    // Insert inter-sentence silence after each synthesized chunk.
-                    player.append(SamplesBuffer::new(channels, rate, silence_buf.clone()));
-                    if first_append {
+                    if was_first {
                         tts_active.store(true, Ordering::Release);
-                        first_append = false;
                     }
                 }
                 Ok(_) => {}
@@ -531,6 +531,36 @@ fn apply_fade_out(samples: &mut [f32]) {
     }
 }
 
+/// Build the ordered list of buffers to append to the rodio `Player` for one
+/// synthesised sentence, including the one-shot lead-in pad on the *first*
+/// sentence of an utterance.
+///
+/// Returns either three buffers (lead-in pad + audio + inter-sentence
+/// silence) on the first call of an utterance, or two buffers (audio +
+/// inter-sentence silence) on every subsequent call. Flips
+/// `*first_append` from `true` → `false` after producing the lead-in.
+///
+/// Extracted from the worker loop so the "lead-in fires exactly once per
+/// utterance" invariant is testable without mocking rodio. See
+/// `lead_in_pad_fires_exactly_once_per_utterance` for the regression test
+/// that catches the only-bad-version of this pad: accidentally moving it
+/// inside the per-sentence loop and stacking on top of
+/// `INTER_SENTENCE_SILENCE` at every sentence boundary.
+fn build_sentence_append_plan(
+    first_append: &mut bool,
+    boosted: Vec<f32>,
+    silence_buf_len: usize,
+) -> Vec<Vec<f32>> {
+    let mut plan = Vec::with_capacity(3);
+    if *first_append {
+        plan.push(vec![0.0f32; FIRST_APPEND_LEAD_IN_SAMPLES]);
+        *first_append = false;
+    }
+    plan.push(boosted);
+    plan.push(vec![0.0f32; silence_buf_len]);
+    plan
+}
+
 // drain_until_shutdown lives in super (huddle/mod.rs) — shared with stt.rs.
 use super::drain_until_shutdown;
 
@@ -1135,6 +1165,101 @@ mod tests {
         assert_eq!(FIRST_APPEND_LEAD_IN_SAMPLES, 480, "20 ms × 24 kHz");
     }
 
+    // ── build_sentence_append_plan tests ──────────────────────────────────────
+
+    /// REGRESSION (Max, 2026-05-18 review): the lead-in pad must fire
+    /// **exactly once per utterance**, never per sentence. Catches the
+    /// only-bad-version of this pad — accidentally moving the
+    /// `if first_append` check inside the per-sentence loop, which would
+    /// stack 20 ms of silence on top of `INTER_SENTENCE_SILENCE` at every
+    /// sentence boundary and audibly slow down multi-sentence utterances.
+    #[test]
+    fn lead_in_pad_fires_exactly_once_per_utterance() {
+        const SENTENCE_AUDIO_LEN: usize = 1000;
+        const SILENCE_BUF_LEN: usize = 240; // arbitrary; matches a 10 ms inter-sentence buffer
+        const N_SENTENCES: usize = 5;
+
+        let mut first = true;
+        let mut total_lead_in_buffers = 0;
+        let mut total_audio_buffers = 0;
+        let mut total_inter_silence_buffers = 0;
+
+        for i in 0..N_SENTENCES {
+            let plan = build_sentence_append_plan(
+                &mut first,
+                vec![0.5_f32; SENTENCE_AUDIO_LEN],
+                SILENCE_BUF_LEN,
+            );
+
+            if i == 0 {
+                assert_eq!(
+                    plan.len(),
+                    3,
+                    "first sentence emits [lead-in, audio, inter-silence]"
+                );
+                assert_eq!(plan[0].len(), FIRST_APPEND_LEAD_IN_SAMPLES);
+                assert!(
+                    plan[0].iter().all(|&s| s == 0.0),
+                    "lead-in pad must be pure silence"
+                );
+                assert_eq!(plan[1].len(), SENTENCE_AUDIO_LEN);
+                assert_eq!(plan[2].len(), SILENCE_BUF_LEN);
+                total_lead_in_buffers += 1;
+                total_audio_buffers += 1;
+                total_inter_silence_buffers += 1;
+            } else {
+                assert_eq!(
+                    plan.len(),
+                    2,
+                    "subsequent sentences emit [audio, inter-silence] only"
+                );
+                assert_eq!(plan[0].len(), SENTENCE_AUDIO_LEN);
+                assert_eq!(plan[1].len(), SILENCE_BUF_LEN);
+                total_audio_buffers += 1;
+                total_inter_silence_buffers += 1;
+            }
+        }
+
+        assert_eq!(
+            total_lead_in_buffers, 1,
+            "lead-in must fire exactly once per utterance, not {} times",
+            total_lead_in_buffers
+        );
+        assert_eq!(total_audio_buffers, N_SENTENCES);
+        assert_eq!(total_inter_silence_buffers, N_SENTENCES);
+        assert!(!first, "first_append flag must be cleared after first call");
+    }
+
+    /// The plan flips `first_append` from true → false on the very first
+    /// call, so subsequent calls produce no lead-in even if called with the
+    /// same mutable flag.
+    #[test]
+    fn build_sentence_append_plan_flips_first_append() {
+        let mut first = true;
+        let _ = build_sentence_append_plan(&mut first, vec![0.5; 100], 24);
+        assert!(!first, "first call must flip the flag");
+
+        // Subsequent call: no lead-in, flag stays false.
+        let plan = build_sentence_append_plan(&mut first, vec![0.5; 100], 24);
+        assert_eq!(plan.len(), 2);
+        assert!(!first);
+    }
+
+    /// Total leading silence on the first sentence is *exactly* the lead-in
+    /// pad — it does NOT double-count the inter-sentence silence at the
+    /// start of the plan. Inter-sentence silence is emitted *after* audio,
+    /// never before. (Max's [12] concern.)
+    #[test]
+    fn first_sentence_leading_silence_is_exactly_lead_in() {
+        let mut first = true;
+        let plan = build_sentence_append_plan(&mut first, vec![0.5; 100], 240);
+        // First buffer is lead-in pad, exactly FIRST_APPEND_LEAD_IN_SAMPLES.
+        assert_eq!(plan[0].len(), FIRST_APPEND_LEAD_IN_SAMPLES);
+        // Second buffer is the audio (non-silent), so total leading silence
+        // before any audio is heard is exactly the lead-in.
+        assert!(plan[1].iter().any(|&s| s != 0.0));
+    }
+
     // ── normalize_for_playback tests ──────────────────────────────────────────
 
     /// A quiet buffer (peak well under TARGET_PEAK) is scaled up to TARGET_PEAK.

From 61d064d02cdc011b60e495dac26920ea6ac1850e Mon Sep 17 00:00:00 2001
From: npub1cc3ha7z055mu0rwwu7806t2wt8mj3pvu0uv5mfp2c50dahaqhczshdalg6
 <c6237ef84fa537c78dcee78efd2d4e59f728859c7f194da42ac51ededfa0be05@sprout-oss.stage.blox.sqprod.co>
Date: Mon, 18 May 2026 12:57:21 -0400
Subject: [PATCH 07/10] huddle(tts): fix first-phoneme drop with sacrificial
 prefix + trim
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pocket TTS' FlowLM has an autoregressive cold-start: the first 2-3
generation steps run without audio context in the KV cache, occasionally
smearing or dropping the first phoneme of short utterances. Tyler
reproduced this on 'I'm happy.' rendering as 'm happy', and on other
'I'm X' constructions across random seeds. The bug is documented
upstream as kyutai-labs/pocket-tts #91 (8 comments, 2 collaborators
acknowledged) and #70, with collateral discussion at sherpa-onnx #3180.

Earlier commits in this branch reduced but did not eliminate the
failure: 773a2a1 added 8-space padding; 1dbfa2c restored sherpa's
`frames_after_eos` default of 3 (fixing a separate static-burst
regression); f570ec0 dropped the leading fade-in. Empirical study at
production settings (silence_scale=0, frames_after_eos=3 default)
confirmed that temperature, silence_scale, seed, and pad tweaks are
all insufficient — the model's stochastic sampling lands on a bad
trajectory often enough to be perceptible on short prompts.

This commit applies the upstream-documented sacrificial-word workaround
(ikidd in kyutai-labs/pocket-tts #70) with two refinements:

  1. Sacrificial prefix '. . ' (two periods + space) instead of a word.
     The pair was empirically the only variant in our probe that
     produced a usable post-sacrificial silence gap on every random
     seed in the 8-seed × 8-variant matrix (`sacrificial_probe`,
     iterated locally during investigation); a single period failed on
     seed=99999. Periods render as low-amplitude breath rather than
     spoken audio.

  2. Post-synth trim: scan from t=30ms looking for the first run of
     samples below 0.02 lasting >= 50 ms — that's the sacrificial→main
     boundary. `Vec::drain` everything before the gap-end. If no gap
     is found or the boundary lies beyond 1.2 s (production max-drop
     bound), bail out and emit the raw buffer rather than corrupt the
     audio. We don't insert a zero lead-in here because tts.rs's
     existing FIRST_APPEND_LEAD_IN_SAMPLES already provides the
     OS-device warm-up cushion on the first append of an utterance,
     and subsequent sentences are buffered by INTER_SENTENCE_SILENCE.

Both the prefix and the trim are gated on PreparedPrompt::is_short
(<= 4 words after preprocessing, matches upstream's
pad_with_spaces_for_short_inputs predicate). Long prompts pass through
unchanged: the first phoneme of a long utterance has enough downstream
context to avoid the smear, and a natural early pause like the comma in
'Hello, how can I help you?' would otherwise be misdetected by the
trimmer as the sacrificial gap (Max caught this in review — thanks).

Also: bump TARGET_PEAK in tts.rs from -6 dBFS (0.501) to -3 dBFS
(0.708) per Tyler. This is a ceiling on per-sentence loudness
normalization, not a floor — quieter Pocket utterances under MAX_GAIN=8
will still land below the ceiling (bench-typical peak 0.076 lands at
0.608, ~-4.3 dBFS). Comment updated to reflect that nuance.

Probe data (see examples/prod_probe.rs; production GenerationConfig:
silence_scale=0.0, frames_after_eos default 3, max_frames=100 short).
Tested 5 prompts × 5 seeds with the new code path:

  Short prompts ('I'm happy', 'I'm sorry', 'I'm ready', 'Yep',
  'I see you') with sacrificial prefix:
    25/25 produced a >=50ms silence gap in the 30-340ms range.
    Trim drops 47-339ms; final audio 270-748ms.

  Long prompts without sacrificial (regression check):
    'Hello, how can I help you today?' and 'Yes, that works. Let me
    try again.' generate normally; comma pauses preserved.

Tyler ear-confirmed the trimmed short-prompt output:
  > these are much better! I like this!

Max reviewed twice — first flagging a silence_scale mismatch between
probe (silence_scale=1.0) and production (0.0), then flagging the
destructive-edge hazard if trim ran on un-sacrificed long prompts.
Both are addressed: prod_probe mirrors production GenerationConfig
exactly (silence_scale=0.0, no frames_after_eos override per 1dbfa2c),
and the trim is gated on is_short with a 1.2s max-drop bound as
belt-and-suspenders against the destructive edge case.

Tests added (in pocket.rs):
  - prepare_prompt_inserts_sacrificial_prefix_only_for_short:
    pins the exact ordering (pad + '. . ' + cleaned).
  - prepare_prompt_threshold_is_inclusive_at_four_words extended to
    assert is_short and SACRIFICIAL_PREFIX absence on long input.
  - trim_strips_sacrificial_and_keeps_only_speech: feed a synthetic
    sacrificial+gap+speech buffer; assert leading sample is speech.
  - trim_is_noop_when_no_long_silence_gap_exists
  - trim_is_noop_when_gap_is_shorter_than_threshold
  - trim_is_noop_when_gap_is_beyond_max_drop_bound: guards the
    destructive-edge case Max flagged.
  - trim_is_noop_on_buffer_smaller_than_scan_start: no panic.
  - trim_constants_use_sane_units: pins millisecond meanings.

Tests added (in tts.rs):
  - normalize_for_playback_clamps_at_max_gain_below_target: new
    behaviour under the -3 dBFS ceiling for bench-typical peaks.
  - normalize_for_playback_hits_target_on_quiet_buffer updated for
    new MAX_GAIN saturation point (0.0885) on the input side.

All 330 cargo test --lib pass. cargo fmt --check and
desktop/scripts/check-file-sizes.mjs are green. pocket.rs cap 620 →
900, tts.rs cap 1335 → 1380.

Signed-off-by: Tyler Longwell <tlongwell@squareup.com>
Signed-off-by: npub1cc3ha7z055mu0rwwu7806t2wt8mj3pvu0uv5mfp2c50dahaqhczshdalg6 <c6237ef84fa537c78dcee78efd2d4e59f728859c7f194da42ac51ededfa0be05@sprout-oss.stage.blox.sqprod.co>
---
 desktop/scripts/check-file-sizes.mjs     |   4 +-
 desktop/src-tauri/examples/prod_probe.rs | 158 +++++++++++
 desktop/src-tauri/src/huddle/pocket.rs   | 318 +++++++++++++++++++++--
 desktop/src-tauri/src/huddle/tts.rs      |  57 +++-
 4 files changed, 502 insertions(+), 35 deletions(-)
 create mode 100644 desktop/src-tauri/examples/prod_probe.rs

diff --git a/desktop/scripts/check-file-sizes.mjs b/desktop/scripts/check-file-sizes.mjs
index 0b33a086a..54c924c11 100644
--- a/desktop/scripts/check-file-sizes.mjs
+++ b/desktop/scripts/check-file-sizes.mjs
@@ -69,10 +69,10 @@ const overrides = new Map([
   ["src-tauri/src/huddle/mod.rs", 1020], // huddle state machine + Tauri commands + sync protocol doc; state/relay/pipeline extracted + emit_huddle_state_changed wiring
   ["src-tauri/src/huddle/models.rs", 950], // model download manager for Parakeet TDT-CTC STT + Pocket TTS with streaming downloads + SHA-256 verification + Rust-native tar extraction + version manifest + atomic swap + hot-start signaling + MODEL_LICENSE.txt sidecar (fail-closed readiness) + idempotent legacy Moonshine dir cleanup + tts_readiness_requires_license_sidecar test + Mary (VCTK p333) reference voice attribution block
   ["src-tauri/src/huddle/stt.rs", 580], // STT pipeline + PTT edge-detection flush + PTT gating (is_speech AND ptt_active) + barge-in for VAD mode + rubato resampler + earshot VAD + sherpa-onnx transcription
-  ["src-tauri/src/huddle/pocket.rs", 620], // Pocket TTS engine wrapper + prepare_pocket_prompt (capitalize/punctuate/pad short inputs, mirror upstream pocket-tts prepare_text_prompt) + build_generation_extra (only overrides max_frames, and only for ≤4-word inputs, to bound runaway "monster breathing" generation without clipping multi-clause sentences) + 12 unit tests including regressions for the static-burst-on-multi-clause-sentence bug
+  ["src-tauri/src/huddle/pocket.rs", 900], // Pocket TTS engine wrapper + prepare_pocket_prompt (capitalize/punctuate/pad short inputs + insert ". . " sacrificial cold-start prefix, mirror upstream pocket-tts prepare_text_prompt) + build_generation_extra (only overrides max_frames, and only for ≤4-word inputs, to bound runaway "monster breathing" generation without clipping multi-clause sentences) + trim_leading_cold_start (post-synth strip of sacrificial audio on short prompts, workaround for kyutai-labs/pocket-tts #91 first-phoneme drop) + 18 unit tests including regressions for the static-burst-on-multi-clause-sentence bug and the first-phoneme-drop trim hazards
   ["src-tauri/src/huddle/preprocessing.rs", 670], // TTS text preprocessing pipeline + unified split_sentences + int_to_words 0-999999 + URL trailing punctuation preservation + 23 unit tests
   ["src-tauri/src/huddle/relay_api.rs", 520], // audio relay recv task + per-peer frame counting for remote human TTS interrupt + NIP-98 channel member query
-  ["src-tauri/src/huddle/tts.rs", 1335], // TTS pipeline + session warmup + cancel/shutdown handling + apply_fade_out (fade-out only — leading fade removed 2026-05-18 after onset-attenuation regression measured in examples/pocket_onset_probe.rs) + FIRST_APPEND_LEAD_IN_SAMPLES + build_sentence_append_plan (pure helper enforcing the lead-in fires exactly once per utterance, not per sentence — see lead_in_pad_fires_exactly_once_per_utterance regression test) + normalize_for_playback (per-sentence peak normalization to -6 dBFS with MAX_GAIN cap) + 29 unit tests (18 interrupt + 5 fade-out + 1 first-append-lead-in + 3 build-sentence-append-plan + 5 normalize)
+  ["src-tauri/src/huddle/tts.rs", 1380], // TTS pipeline + session warmup + cancel/shutdown handling + apply_fade_out (fade-out only — leading fade removed 2026-05-18 after onset-attenuation regression measured in examples/pocket_onset_probe.rs) + FIRST_APPEND_LEAD_IN_SAMPLES + build_sentence_append_plan (pure helper enforcing the lead-in fires exactly once per utterance, not per sentence — see lead_in_pad_fires_exactly_once_per_utterance regression test) + normalize_for_playback (per-sentence peak normalization to -3 dBFS ceiling with MAX_GAIN cap) + 30 unit tests (18 interrupt + 5 fade-out + 1 first-append-lead-in + 3 build-sentence-append-plan + 6 normalize)
   ["src-tauri/src/relay.rs", 510], // +4 lines for NIP-OA auth tag injection in profile sync (build_profile_event) + verification test
   ["src-tauri/src/commands/pairing.rs", 600], // NIP-AB pairing actor: 3 Tauri commands + background WS task + NIP-42 auth + NIP-43 probe + event parsing helpers
   ["src-tauri/src/lib.rs", 715], // +4 lines for PairingHandle managed state + 3 pairing command registrations
diff --git a/desktop/src-tauri/examples/prod_probe.rs b/desktop/src-tauri/examples/prod_probe.rs
new file mode 100644
index 000000000..9a8798608
--- /dev/null
+++ b/desktop/src-tauri/examples/prod_probe.rs
@@ -0,0 +1,158 @@
+//! Reproduction probe for the Pocket TTS first-phoneme-drop fix.
+//!
+//! Pocket TTS' FlowLM has an autoregressive cold-start that occasionally
+//! smears or drops the first phoneme of short utterances — see
+//! kyutai-labs/pocket-tts #91, #70 and sherpa-onnx #3180. The cure is to
+//! prepend a sacrificial `". . "` cold-start absorber to short prompts
+//! and trim the resulting leading audio. This example reproduces both
+//! variants of generated audio so you can listen-test the fix at the
+//! exact `GenerationConfig` we ship in production (`huddle::pocket`):
+//!
+//!   - silence_scale: 0.0     (production)
+//!   - max_frames:    100     (short) / sherpa default 500 (long)
+//!   - num_steps: 1
+//!   - speed: 1.05
+//!
+//! Note: production does NOT override `frames_after_eos` — sherpa-onnx's
+//! default of 3 is what we want. The previous attempt to override it for
+//! long prompts caused the "first 'yep' is static" regression (commit
+//! 1dbfa2c). This probe mirrors that decision.
+//!
+//! Run:
+//!   cargo run --release --example prod_probe
+//!   cargo run --release --example prod_probe /path/to/pocket-tts
+//!
+//! Output: /tmp/prod_<label>_s<seed>.wav. The "_sac" labels are short
+//! prompts synthesised with the sacrificial prefix (the new path); the
+//! "_nosac" labels are long prompts left unprefixed. Listen with `afplay`.
+
+use std::collections::HashMap;
+use std::path::PathBuf;
+
+use sherpa_onnx::{
+    self, GenerationConfig, OfflineTts, OfflineTtsConfig, OfflineTtsModelConfig,
+    OfflineTtsPocketModelConfig, Wave,
+};
+
+const SAMPLE_RATE: u32 = 24_000;
+const SHORT_PROMPT_MAX_FRAMES: i32 = 100;
+
+// (label, raw_text_before_prep, sacrificial_prefix_to_add_after_pad)
+const TESTS: &[(&str, &str, &str)] = &[
+    // Short, previously-failing — sacrificial applied
+    ("imhappy_sac", "I'm happy.", ". . "),
+    ("imsorry_sac", "I'm sorry.", ". . "),
+    ("imready_sac", "I'm ready.", ". . "),
+    // Short, previously-OK — sacrificial applied
+    ("yep_sac", "Yep.", ". . "),
+    ("isee_sac", "I see you.", ". . "),
+    // Long — sacrificial NOT applied (per design)
+    ("longer_nosac", "Hello, how can I help you today?", ""),
+    ("multi_nosac", "Yes, that works. Let me try again.", ""),
+];
+
+fn main() {
+    let model_dir = std::env::args()
+        .nth(1)
+        .unwrap_or_else(|| "/tmp/pocket-tts-bench".to_string());
+    let dir = PathBuf::from(&model_dir);
+    let p = |name: &str| dir.join(name).to_string_lossy().into_owned();
+
+    let mut cfg = OfflineTtsConfig::default();
+    cfg.model = OfflineTtsModelConfig {
+        pocket: OfflineTtsPocketModelConfig {
+            lm_main: Some(p("lm_main.int8.onnx")),
+            lm_flow: Some(p("lm_flow.int8.onnx")),
+            encoder: Some(p("encoder.onnx")),
+            decoder: Some(p("decoder.int8.onnx")),
+            text_conditioner: Some(p("text_conditioner.onnx")),
+            vocab_json: Some(p("vocab.json")),
+            token_scores_json: Some(p("token_scores.json")),
+            voice_embedding_cache_capacity: 16,
+        },
+        num_threads: 1,
+        debug: false,
+        ..Default::default()
+    };
+    let engine = OfflineTts::create(&cfg).expect("engine create");
+
+    let voice_path = dir.join("reference_sample.wav");
+    let wave = Wave::read(voice_path.to_str().unwrap()).expect("voice WAV");
+    let ref_samples = wave.samples().to_vec();
+    let ref_sr = wave.sample_rate();
+
+    let seeds: &[i32] = &[42, 1337, 99999, 7, 314159];
+
+    println!(
+        "{:18} | {:>6} | {:>7} | gap_search (50ms,0.02) | path",
+        "test", "seed", "len_ms"
+    );
+    println!("{}", "-".repeat(100));
+
+    for (label, raw_text, sacrificial) in TESTS {
+        // Mirror huddle::pocket::prepare_pocket_prompt:
+        //   - TESTS text is already capitalised + punctuated, so only trim it
+        //   - short (≤4 words) → pad + sacrificial (from TESTS); max_frames=100
+        //   - long  → unchanged; no max_frames override
+        let cleaned = raw_text.trim();
+        let word_count = cleaned.split_whitespace().count();
+        let is_short = word_count <= 4;
+        let pad = if is_short { "        " } else { "" };
+        let prompt = format!("{pad}{sacrificial}{cleaned}");
+
+        for seed in seeds {
+            let mut extra: HashMap<String, serde_json::Value> = HashMap::new();
+            extra.insert("seed".to_string(), serde_json::Value::from(*seed));
+            if is_short {
+                extra.insert(
+                    "max_frames".to_string(),
+                    serde_json::Value::from(SHORT_PROMPT_MAX_FRAMES),
+                );
+            }
+            let gen = GenerationConfig {
+                speed: 1.05,
+                num_steps: 1,
+                silence_scale: 0.0, // PRODUCTION SETTING
+                reference_audio: Some(ref_samples.clone()),
+                reference_sample_rate: ref_sr,
+                extra: Some(extra),
+                ..Default::default()
+            };
+            let audio = engine
+                .generate_with_config(&prompt, &gen, None::<fn(&[f32], f32) -> bool>)
+                .expect("synth");
+            let samples = audio.samples();
+            let len_ms = samples.len() as f32 / SAMPLE_RATE as f32 * 1000.0;
+            let gap = find_gap(samples, SAMPLE_RATE, 0.02, 50);
+            let path = format!("/tmp/prod_{}_s{}.wav", label, seed);
+            sherpa_onnx::write(&path, samples, SAMPLE_RATE as i32);
+            println!(
+                "{:18} | {:>6} | {:>5.0}ms | {:>22} | {}",
+                label, seed, len_ms, gap, path
+            );
+        }
+        println!();
+    }
+}
+
+fn find_gap(samples: &[f32], sr: u32, thresh: f32, min_ms: u32) -> String {
+    let scan_start = (sr as usize * 30) / 1000;
+    let min_samples = (sr as usize * min_ms as usize) / 1000;
+    let mut silence_from: Option<usize> = None;
+    for i in scan_start..samples.len() {
+        if samples[i].abs() < thresh {
+            silence_from.get_or_insert(i);
+        } else if let Some(start) = silence_from {
+            if i - start >= min_samples {
+                return format!(
+                    "{:.0}..{:.0}ms ({:.0}ms)",
+                    start as f32 / sr as f32 * 1000.0,
+                    i as f32 / sr as f32 * 1000.0,
+                    (i - start) as f32 / sr as f32 * 1000.0
+                );
+            }
+            silence_from = None;
+        }
+    }
+    "<no gap>".to_string()
+}
diff --git a/desktop/src-tauri/src/huddle/pocket.rs b/desktop/src-tauri/src/huddle/pocket.rs
index 73f004beb..042361b00 100644
--- a/desktop/src-tauri/src/huddle/pocket.rs
+++ b/desktop/src-tauri/src/huddle/pocket.rs
@@ -114,6 +114,59 @@ const SHORT_PROMPT_WORD_THRESHOLD: usize = 4;
 /// uses exactly 8 — keep parity rather than tuning blindly.
 const SHORT_PROMPT_PAD_SPACES: usize = 8;
 
+/// Sacrificial cold-start prefix appended *after* the leading space pad for
+/// short prompts. Pocket TTS' FlowLM autoregressive generation has a 2–3
+/// step "settle" period at the start where the first generated phoneme can
+/// be smeared or dropped entirely (see kyutai-labs/pocket-tts #91, #70 and
+/// sherpa-onnx #3180). For short utterances like "I'm happy." the first
+/// phoneme is most of the first word — losing it produces "m happy".
+///
+/// Two periods separated by a space act as a "phantom utterance" that the
+/// model commits to, absorbing the cold-start. The pair (rather than a
+/// single period) was empirically the only variant in our probe — see
+/// `examples/prod_probe.rs` — that produced a usable post-sacrificial
+/// silence gap on every random seed. The resulting leading sacrificial
+/// audio is then stripped from the output by [`trim_leading_cold_start`]
+/// before the buffer is returned to the synth pipeline.
+///
+/// Long prompts (>4 words) don't need this — the first phoneme already has
+/// enough downstream context to avoid the smear, and an early natural pause
+/// (e.g. the comma in "Hello, how can I help you?") could be misdetected as
+/// the trim boundary.
+const SACRIFICIAL_PREFIX: &str = ". . ";
+
+// ── Leading cold-start trim (post-synth) ──────────────────────────────────────
+
+/// Skip this many samples at the start of the synth buffer before looking
+/// for the sacrificial→main silence gap. The Mimi decoder cold-start
+/// produces ~30 ms of low-amplitude noise that we *don't* want to treat as
+/// the gap. 30 ms × 24 kHz = 720 samples.
+const TRIM_SCAN_START_SAMPLES: usize = (SAMPLE_RATE as usize * 30) / 1000;
+
+/// Amplitude threshold below which a sample is considered "silence" for the
+/// purposes of finding the post-sacrificial gap. Tuned empirically against
+/// production-config probe data — the engine's own `ScaleSilence` uses 0.01,
+/// but our boundary detection wants a looser threshold so that the breath /
+/// aspiration of the rendered periods (which sits around 0.005–0.015) is
+/// treated as silence too.
+const TRIM_SILENCE_THRESHOLD: f32 = 0.02;
+
+/// A silence run must be at least this many samples long to be accepted as
+/// the sacrificial→main word boundary. 50 ms is comfortably longer than
+/// inter-syllable silence within a normal word at this speed (typically
+/// 10–30 ms), so this guards against trimming into the middle of the real
+/// utterance. 50 ms × 24 kHz = 1200 samples.
+const TRIM_MIN_GAP_SAMPLES: usize = (SAMPLE_RATE as usize * 50) / 1000;
+
+/// Hard cap on how much audio we'll trim from the start. Production probe
+/// data (with `silence_scale = 0.0`) shows valid trim boundaries land
+/// between 30 ms and ~450 ms; 1.2 s is a wide safety margin. If the
+/// detector finds a "gap" past this point it's almost certainly an interior
+/// pause inside an unusually long short-prompt utterance — bail out and
+/// emit untrimmed audio rather than corrupt it. 1.2 s × 24 kHz = 28800
+/// samples.
+const TRIM_MAX_DROP_SAMPLES: usize = (SAMPLE_RATE as usize * 1200) / 1000;
+
 /// sherpa-onnx's documented `frames_after_eos` default. We deliberately do
 /// *not* override this knob — the previous attempt to bump it for short
 /// inputs and lower it for long inputs lowered it below the upstream default
@@ -237,12 +290,20 @@ pub fn load_text_to_speech(model_dir: &str) -> Result<PocketTts, String> {
 #[derive(Debug, Clone, PartialEq)]
 pub(crate) struct PreparedPrompt {
     /// Text to hand to `OfflineTts::generate_with_config`. Capitalized,
-    /// punctuation-terminated, and (for short inputs) left-padded with spaces.
+    /// punctuation-terminated, and (for short inputs) left-padded with spaces
+    /// plus a sacrificial `". . "` cold-start prefix.
     pub text: String,
     /// Value to pass via `GenerationConfig.extra["max_frames"]`, or `None` to
     /// keep the upstream default of 500 LM steps. We only override on short
     /// padded prompts where we have a tight expectation on output length.
     pub max_frames: Option<i32>,
+    /// `true` iff this prompt received the short-input treatment (leading
+    /// space pad + sacrificial `". . "` prefix). The synth pipeline uses
+    /// this to decide whether to apply [`trim_leading_cold_start`] to the
+    /// output: long prompts have no sacrificial audio to strip, and
+    /// trimming them risks deleting real speech at a natural early pause
+    /// (e.g. the comma in "Hello, how can I help you?").
+    pub is_short: bool,
 }
 
 /// Mirror of the *text-preparation* half of upstream
@@ -256,9 +317,11 @@ pub(crate) struct PreparedPrompt {
 ///    cheap to re-check after sentence splitting).
 /// 2. Capitalize the first letter.
 /// 3. Append `.` if the text doesn't end in punctuation.
-/// 4. If fewer than five words, prepend `SHORT_PROMPT_PAD_SPACES` spaces and
-///    return a tight [`SHORT_PROMPT_MAX_FRAMES`] cap so the LM can't run
-///    away if EOS still doesn't fire.
+/// 4. If fewer than five words, prepend `SHORT_PROMPT_PAD_SPACES` spaces
+///    followed by [`SACRIFICIAL_PREFIX`] (a `". . "` cold-start absorber —
+///    see its docstring for the bug it works around), and return a tight
+///    [`SHORT_PROMPT_MAX_FRAMES`] cap so the LM can't run away if EOS
+///    still doesn't fire.
 ///
 /// We do **not** override `frames_after_eos` — sherpa-onnx's default of 3
 /// is what we want. An earlier version set it to 1 on long inputs, which
@@ -317,10 +380,13 @@ pub(crate) fn prepare_pocket_prompt(input: &str) -> Option<PreparedPrompt> {
     let is_short = word_count <= SHORT_PROMPT_WORD_THRESHOLD;
 
     let (final_text, max_frames) = if is_short {
-        let mut padded = String::with_capacity(cleaned.len() + SHORT_PROMPT_PAD_SPACES);
+        let mut padded = String::with_capacity(
+            cleaned.len() + SHORT_PROMPT_PAD_SPACES + SACRIFICIAL_PREFIX.len(),
+        );
         for _ in 0..SHORT_PROMPT_PAD_SPACES {
             padded.push(' ');
         }
+        padded.push_str(SACRIFICIAL_PREFIX);
         padded.push_str(&cleaned);
         (padded, Some(SHORT_PROMPT_MAX_FRAMES))
     } else {
@@ -334,9 +400,71 @@ pub(crate) fn prepare_pocket_prompt(input: &str) -> Option<PreparedPrompt> {
     Some(PreparedPrompt {
         text: final_text,
         max_frames,
+        is_short,
     })
 }
 
+/// Strip the leading "sacrificial" audio produced by the `". . "` cold-start
+/// prefix from a short-prompt synthesis result. Only call this when
+/// [`PreparedPrompt::is_short`] is `true` — the trim looks for a long
+/// silence run at the head of the buffer, and an early natural pause inside
+/// a long unsacrificed utterance (e.g. the comma in "Hello, how can I help
+/// you?") would be misclassified as the sacrificial gap.
+///
+/// Algorithm:
+///   1. Skip the first [`TRIM_SCAN_START_SAMPLES`] of the buffer (Mimi
+///      cold-start noise we shouldn't classify as silence).
+///   2. Scan forward for the first run of samples below
+///      [`TRIM_SILENCE_THRESHOLD`] that lasts at least
+///      [`TRIM_MIN_GAP_SAMPLES`] — that's the post-sacrificial boundary.
+///   3. If that boundary lies beyond [`TRIM_MAX_DROP_SAMPLES`], treat it as
+///      "almost certainly an interior pause" and *do not trim* — the safe
+///      fallback is to play the slightly-degraded raw audio rather than
+///      delete real speech.
+///   4. Otherwise, drop the leading samples up to the end of the silence
+///      run. We don't insert a zero lead-in here — `tts.rs` already adds
+///      `FIRST_APPEND_LEAD_IN_SAMPLES` of zeros on the first append of an
+///      utterance, and subsequent sentences are buffered by
+///      `INTER_SENTENCE_SILENCE`.
+///
+/// If the scan never finds a long-enough gap (≈1% of generations in the
+/// production-config probe), the function is a no-op — the model trajectory
+/// missed the expected sacrificial→main structure and we'd rather play the
+/// raw buffer than emit silence.
+fn trim_leading_cold_start(samples: &mut Vec<f32>) {
+    if samples.len() <= TRIM_SCAN_START_SAMPLES {
+        return;
+    }
+
+    let mut silence_run_start: Option<usize> = None;
+    let mut gap_end: Option<usize> = None;
+    for (i, sample) in samples.iter().enumerate().skip(TRIM_SCAN_START_SAMPLES) {
+        if sample.abs() < TRIM_SILENCE_THRESHOLD {
+            silence_run_start.get_or_insert(i);
+        } else if let Some(start) = silence_run_start {
+            if i - start >= TRIM_MIN_GAP_SAMPLES {
+                gap_end = Some(i);
+                break;
+            }
+            silence_run_start = None;
+        }
+    }
+
+    let Some(end) = gap_end else {
+        // No gap found — model didn't produce the expected sacrificial→main
+        // structure. Bail out and let the caller play the raw buffer.
+        return;
+    };
+    if end > TRIM_MAX_DROP_SAMPLES {
+        // Boundary too far into the audio to plausibly be the sacrificial
+        // gap. Almost certainly an interior pause in the real utterance —
+        // leave the audio alone.
+        return;
+    }
+
+    samples.drain(..end);
+}
+
 /// Build the `GenerationConfig.extra` HashMap from a [`PreparedPrompt`].
 ///
 /// Centralised so the regression test below can assert that we **never**
@@ -417,7 +545,16 @@ impl PocketTts {
             );
         }
 
-        Ok(audio.samples().to_vec())
+        let mut samples = audio.samples().to_vec();
+        // For short prompts the prepared text includes a sacrificial ". . "
+        // prefix to absorb FlowLM/Mimi cold-start (see `SACRIFICIAL_PREFIX`).
+        // Strip the leading sacrificial audio before returning. Long prompts
+        // are never trimmed — they have no sacrificial audio, and an early
+        // natural pause could be mis-detected as the trim boundary.
+        if prepared.is_short {
+            trim_leading_cold_start(&mut samples);
+        }
+        Ok(samples)
     }
 }
 
@@ -434,14 +571,24 @@ mod tests {
         assert!(prepare_pocket_prompt("\n\t  ").is_none());
     }
 
+    /// Helper: the exact leading sequence prepended to every short prompt —
+    /// 8 spaces of padding followed by the sacrificial `". . "` cold-start
+    /// absorber. Centralising this keeps the assertions readable.
+    fn short_prefix() -> String {
+        let mut s = " ".repeat(SHORT_PROMPT_PAD_SPACES);
+        s.push_str(SACRIFICIAL_PREFIX);
+        s
+    }
+
     #[test]
     fn prepare_prompt_pads_and_capitalizes_one_word() {
         // The "yep" case Tyler hit in production — bare lowercase one-word
-        // utterance with no punctuation. Must be padded, capitalized, and
-        // terminated, with a tight `max_frames` cap to bound runaway gen.
+        // utterance with no punctuation. Must be padded with the short-prompt
+        // prefix (8 spaces + ". . " sacrificial), capitalized, terminated,
+        // with a tight `max_frames` cap to bound runaway gen.
         let out = prepare_pocket_prompt("yep").expect("non-empty");
-        let pad = " ".repeat(SHORT_PROMPT_PAD_SPACES);
-        assert_eq!(out.text, format!("{pad}Yep."));
+        assert_eq!(out.text, format!("{}Yep.", short_prefix()));
+        assert!(out.is_short, "1-word input is short");
         assert_eq!(out.max_frames, Some(SHORT_PROMPT_MAX_FRAMES));
         assert!(
             SHORT_PROMPT_MAX_FRAMES < SHERPA_ONNX_MAX_FRAMES_DEFAULT,
@@ -452,28 +599,38 @@ mod tests {
     #[test]
     fn prepare_prompt_preserves_existing_punctuation() {
         let out = prepare_pocket_prompt("yes!").expect("non-empty");
-        let pad = " ".repeat(SHORT_PROMPT_PAD_SPACES);
-        assert_eq!(out.text, format!("{pad}Yes!")); // exclamation kept
+        assert_eq!(out.text, format!("{}Yes!", short_prefix())); // exclamation kept
         let out = prepare_pocket_prompt("really?").expect("non-empty");
-        assert_eq!(out.text, format!("{pad}Really?"));
+        assert_eq!(out.text, format!("{}Really?", short_prefix()));
     }
 
     #[test]
     fn prepare_prompt_threshold_is_inclusive_at_four_words() {
-        // 4 words = short (padded + tight max_frames); 5 words = long
-        // (no padding, no overrides — upstream defaults stand).
+        // 4 words = short (padded + sacrificial + tight max_frames); 5 words
+        // = long (no padding, no sacrificial, no overrides — upstream
+        // defaults stand).
         let four = prepare_pocket_prompt("one two three four").expect("non-empty");
+        assert!(four.is_short, "four-word input should be short");
         assert!(
             four.text.starts_with(' '),
-            "four-word input should be padded"
+            "four-word input should start with the space pad"
+        );
+        assert!(
+            four.text.contains(SACRIFICIAL_PREFIX),
+            "four-word input should contain the sacrificial prefix"
         );
         assert_eq!(four.max_frames, Some(SHORT_PROMPT_MAX_FRAMES));
 
         let five = prepare_pocket_prompt("one two three four five").expect("non-empty");
+        assert!(!five.is_short, "five-word input should NOT be short");
         assert!(
             !five.text.starts_with(' '),
             "five-word input should NOT be padded"
         );
+        assert!(
+            !five.text.contains(SACRIFICIAL_PREFIX),
+            "five-word input must not receive the sacrificial prefix"
+        );
         assert_eq!(
             five.max_frames, None,
             "long inputs must leave sherpa-onnx's max_frames default in place"
@@ -484,7 +641,9 @@ mod tests {
     fn prepare_prompt_does_not_pad_long_text() {
         let long = "This is a longer sentence that the model should handle just fine.";
         let out = prepare_pocket_prompt(long).expect("non-empty");
+        assert!(!out.is_short);
         assert!(!out.text.starts_with(' '));
+        assert!(!out.text.contains(SACRIFICIAL_PREFIX));
         assert_eq!(out.max_frames, None);
         assert!(out.text.ends_with('.'));
     }
@@ -492,16 +651,15 @@ mod tests {
     #[test]
     fn prepare_prompt_collapses_whitespace() {
         let out = prepare_pocket_prompt("Hello    world\n\nfriend").expect("non-empty");
-        // No padding (3 words → short → padded), but interior is collapsed.
-        let pad = " ".repeat(SHORT_PROMPT_PAD_SPACES);
-        assert_eq!(out.text, format!("{pad}Hello world friend."));
+        // 3 words → short → padded + sacrificial. Interior whitespace
+        // collapsed.
+        assert_eq!(out.text, format!("{}Hello world friend.", short_prefix()));
     }
 
     #[test]
     fn prepare_prompt_does_not_double_capitalize_already_uppercase() {
         let out = prepare_pocket_prompt("HELLO there").expect("non-empty");
-        let pad = " ".repeat(SHORT_PROMPT_PAD_SPACES);
-        assert_eq!(out.text, format!("{pad}HELLO there."));
+        assert_eq!(out.text, format!("{}HELLO there.", short_prefix()));
     }
 
     #[test]
@@ -512,6 +670,18 @@ mod tests {
         assert!(out.text.contains("Дa."));
     }
 
+    #[test]
+    fn prepare_prompt_inserts_sacrificial_prefix_only_for_short() {
+        // Pinning the exact ordering: pad, then ". . ", then cleaned text.
+        // If this ever flips, the trim algorithm's calibration breaks.
+        let out = prepare_pocket_prompt("I'm happy.").expect("non-empty");
+        assert!(out.is_short);
+        let pad = " ".repeat(SHORT_PROMPT_PAD_SPACES);
+        let expected = format!("{pad}. . I'm happy.");
+        assert_eq!(out.text, expected);
+        assert_eq!(SACRIFICIAL_PREFIX, ". . ");
+    }
+
     // ── build_generation_extra ───────────────────────────────────────────────
     //
     // These tests pin down a behaviour we've now regressed twice on:
@@ -594,4 +764,110 @@ mod tests {
         // 12.5 Hz frame rate, 100 frames = 8 s, which is roomy.
         assert!(SHORT_PROMPT_MAX_FRAMES >= 50, "would risk truncation");
     }
+
+    // ── trim_leading_cold_start ──────────────────────────────────────────────
+
+    /// Build a synthetic buffer shaped like a sacrificial-prefixed Pocket TTS
+    /// output: a bit of "sacrificial" energy at the head (modelled as
+    /// alternating ±0.05 — above [`TRIM_SILENCE_THRESHOLD`] so it isn't
+    /// classified as silence, matching what real probe WAVs look like in the
+    /// 0–50 ms window), then a flat silence of `gap_ms` ms, then `tail_ms`
+    /// of "real speech" at peak `tail_peak`.
+    fn synth_buffer(sacrificial_ms: u32, gap_ms: u32, tail_ms: u32, tail_peak: f32) -> Vec<f32> {
+        let sr = SAMPLE_RATE as usize;
+        let mut v = Vec::new();
+        for i in 0..(sr * sacrificial_ms as usize / 1000) {
+            v.push(if i % 2 == 0 { 0.05 } else { -0.05 });
+        }
+        // Silence gap (true zeros — below TRIM_SILENCE_THRESHOLD).
+        v.extend(std::iter::repeat_n(0.0_f32, sr * gap_ms as usize / 1000));
+        // Real speech.
+        for i in 0..(sr * tail_ms as usize / 1000) {
+            v.push(if i % 2 == 0 { tail_peak } else { -tail_peak });
+        }
+        v
+    }
+
+    #[test]
+    fn trim_strips_sacrificial_and_keeps_only_speech() {
+        // 60 ms sacrificial + 100 ms gap + 500 ms speech at peak 0.3.
+        // After trim, the output is just the speech tail.
+        let mut v = synth_buffer(60, 100, 500, 0.3);
+        trim_leading_cold_start(&mut v);
+
+        // First sample should be speech (|s| ≥ 0.2). No zero lead-in here
+        // because tts.rs's `first_append` lead-in handles the device cushion.
+        assert!(
+            v[0].abs() > 0.2,
+            "first sample after trim should be speech, got {}",
+            v[0]
+        );
+        let actual_ms = (v.len() as f32 / SAMPLE_RATE as f32) * 1000.0;
+        assert!(
+            (actual_ms - 500.0).abs() < 5.0,
+            "expected ~500 ms of trimmed audio, got {actual_ms} ms"
+        );
+    }
+
+    #[test]
+    fn trim_is_noop_when_no_long_silence_gap_exists() {
+        // Pure speech: every sample is real (no gap >= 50 ms). Trimmer must
+        // leave the buffer untouched so we don't truncate the utterance.
+        let mut v = synth_buffer(0, 0, 600, 0.3);
+        let before = v.clone();
+        trim_leading_cold_start(&mut v);
+        assert_eq!(v, before, "no gap → no trim");
+    }
+
+    #[test]
+    fn trim_is_noop_when_gap_is_shorter_than_threshold() {
+        // 40 ms gap is below TRIM_MIN_GAP_SAMPLES (50 ms). Must not trigger.
+        let mut v = synth_buffer(60, 40, 600, 0.3);
+        let before = v.clone();
+        trim_leading_cold_start(&mut v);
+        assert_eq!(v, before, "sub-threshold gap → no trim");
+    }
+
+    #[test]
+    fn trim_is_noop_when_gap_is_beyond_max_drop_bound() {
+        // Gap starts at 1500 ms (past TRIM_MAX_DROP_SAMPLES = 1200 ms).
+        // This represents an interior pause inside an unusually long
+        // utterance that slipped past the short-prompt predicate; we must
+        // not chop the first 1.5 s of real audio.
+        let mut v = synth_buffer(1500, 200, 400, 0.3);
+        let before = v.clone();
+        trim_leading_cold_start(&mut v);
+        assert_eq!(v, before, "gap past max-drop bound → no trim");
+    }
+
+    #[test]
+    fn trim_is_noop_on_buffer_smaller_than_scan_start() {
+        // 20 ms buffer is smaller than TRIM_SCAN_START_SAMPLES (30 ms).
+        // Trimmer must early-return without panicking.
+        let mut v = vec![0.5f32; (SAMPLE_RATE as usize * 20) / 1000];
+        let before = v.clone();
+        trim_leading_cold_start(&mut v);
+        assert_eq!(v, before);
+    }
+
+    #[test]
+    fn trim_constants_use_sane_units() {
+        // Pin the constants in milliseconds so anyone tuning later can see
+        // at a glance what they're changing.
+        assert_eq!(
+            TRIM_SCAN_START_SAMPLES,
+            (SAMPLE_RATE as usize * 30) / 1000,
+            "scan-start should be 30 ms"
+        );
+        assert_eq!(
+            TRIM_MIN_GAP_SAMPLES,
+            (SAMPLE_RATE as usize * 50) / 1000,
+            "min-gap should be 50 ms"
+        );
+        assert_eq!(
+            TRIM_MAX_DROP_SAMPLES,
+            (SAMPLE_RATE as usize * 1200) / 1000,
+            "max-drop should be 1.2 s"
+        );
+    }
 }
diff --git a/desktop/src-tauri/src/huddle/tts.rs b/desktop/src-tauri/src/huddle/tts.rs
index bcb074018..50d960811 100644
--- a/desktop/src-tauri/src/huddle/tts.rs
+++ b/desktop/src-tauri/src/huddle/tts.rs
@@ -54,11 +54,18 @@ const SYNTH_STEPS: usize = 1;
 /// Synthesis speed multiplier. Slightly faster than natural speech.
 const SYNTH_SPEED: f32 = 1.05;
 
-/// Target peak amplitude after per-sentence loudness normalization, in linear
-/// scale. −6 dBFS = 10^(−6/20) ≈ 0.501. Leaves 6 dB of headroom above the
-/// loudest sample so the subsequent fade-in/out and any system mixer gain
-/// don't have to soft-clip. See `normalize_for_playback`.
-const TARGET_PEAK: f32 = 0.501_187_2; // 10f32.powf(-6.0 / 20.0)
+/// Upper bound on per-sentence loudness normalization, in linear scale.
+/// −3 dBFS = 10^(−3/20) ≈ 0.708. This is a *ceiling*, not a floor: any
+/// sentence whose computed gain would push its peak above this is held at
+/// the ceiling, and quieter utterances under the [`MAX_GAIN`] cap may land
+/// below it. The ceiling leaves 3 dB of headroom above the loudest sample
+/// so the subsequent fade-out and any system mixer gain don't soft-clip.
+///
+/// Tyler bumped this from the previous −6 dBFS (0.501) to give Pocket TTS
+/// output some perceived loudness — the model's reference voice is quieter
+/// than Kokoro's was, and the prior target landed normal utterances around
+/// −9 dBFS once interior dynamics are accounted for.
+const TARGET_PEAK: f32 = 0.707_945_8; // 10f32.powf(-3.0 / 20.0)
 
 /// Maximum gain applied by `normalize_for_playback`. Caps amplification on
 /// near-silent buffers so a mid-utterance pause or a malformed synth doesn't
@@ -66,8 +73,10 @@ const TARGET_PEAK: f32 = 0.501_187_2; // 10f32.powf(-6.0 / 20.0)
 ///
 /// Pocket TTS reference-voice output measured ~7.6% peak on a 75-character
 /// utterance (`examples/pocket_bench`); a gain of 1/0.076 ≈ 6.6 lands that
-/// sample at the −6 dBFS target, so `8.0` covers normal utterances while
-/// still catching pathological near-silent buffers.
+/// sample at −6 dBFS and ≈9.3 at −3 dBFS. At the new ceiling `8.0` therefore
+/// clamps that bench-typical utterance (and anything quieter, including
+/// pathological near-silent buffers) to peak ≤ 0.076 × 8 ≈ 0.61 — below the
+/// ceiling, which is the intended behaviour.
 const MAX_GAIN: f32 = 8.0;
 
 /// Fade-out length in samples (8 ms at 24 kHz ≈ 192 samples).
@@ -1262,13 +1271,16 @@ mod tests {
 
     // ── normalize_for_playback tests ──────────────────────────────────────────
 
-    /// A quiet buffer (peak well under TARGET_PEAK) is scaled up to TARGET_PEAK.
-    /// Reproduces the bench-measured Pocket TTS peak (~0.076) and asserts the
-    /// loudest sample lands at exactly the −6 dBFS target.
+    /// A moderately quiet buffer (peak still within MAX_GAIN's reach of
+    /// TARGET_PEAK) is scaled up to exactly TARGET_PEAK. With TARGET_PEAK at
+    /// −3 dBFS (0.708) and MAX_GAIN at 8.0, the gain saturation point is
+    /// peak = 0.708 / 8.0 ≈ 0.0885 — so a 0.1-peak buffer is in the linear
+    /// region and gets normalized cleanly.
     #[test]
     fn normalize_for_playback_hits_target_on_quiet_buffer() {
-        // peak 0.076 ⇒ gain ≈ 6.6, well under MAX_GAIN (8.0), so target hit.
-        let input: Vec<f32> = (0..100).map(|i| 0.076 * (i as f32 / 100.0)).collect();
+        // peak 0.1 ⇒ ideal gain 7.08, under MAX_GAIN (8.0), so target hit.
+        let mut input: Vec<f32> = (0..100).map(|i| 0.1 * (i as f32 / 100.0)).collect();
+        input.push(0.1); // ensure exact peak
         let out = normalize_for_playback(input);
         let peak = out.iter().fold(0.0_f32, |a, &s| a.max(s.abs()));
         assert!(
@@ -1277,6 +1289,27 @@ mod tests {
         );
     }
 
+    /// A buffer whose ideal gain exceeds MAX_GAIN gets clamped — the peak
+    /// lands at input_peak × MAX_GAIN, below TARGET_PEAK. With the new
+    /// −3 dBFS target this captures the Pocket-bench-typical case
+    /// (input peak 0.076 → ideal gain ≈ 9.3 > MAX_GAIN, so we clamp).
+    #[test]
+    fn normalize_for_playback_clamps_at_max_gain_below_target() {
+        let mut input: Vec<f32> = (0..100).map(|i| 0.076 * (i as f32 / 100.0)).collect();
+        input.push(0.076);
+        let out = normalize_for_playback(input);
+        let peak = out.iter().fold(0.0_f32, |a, &s| a.max(s.abs()));
+        let expected = 0.076 * MAX_GAIN;
+        assert!(
+            (peak - expected).abs() < 1e-3,
+            "expected peak ~{expected}, got {peak}"
+        );
+        assert!(
+            peak < TARGET_PEAK,
+            "clamped peak {peak} should be below TARGET_PEAK {TARGET_PEAK}"
+        );
+    }
+
     /// A near-silent buffer would need a huge gain to reach TARGET_PEAK;
     /// `MAX_GAIN` caps the amplification so we don't bring quantization noise
     /// up to full scale.

From 89b27d8fa59d87199494ddac9a8850eb5e12e6c1 Mon Sep 17 00:00:00 2001
From: npub1cc3ha7z055mu0rwwu7806t2wt8mj3pvu0uv5mfp2c50dahaqhczshdalg6
 <c6237ef84fa537c78dcee78efd2d4e59f728859c7f194da42ac51ededfa0be05@sprout-oss.stage.blox.sqprod.co>
Date: Mon, 18 May 2026 13:34:58 -0400
Subject: [PATCH 08/10] huddle(tts): prod_probe emits both raw and trimmed WAVs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Max noted that prod_probe's header advertised _sac variants as 'the new
path' but the WAVs written were actually the raw engine output, not the
post-trim audio that synth_chunk returns to tts.rs. Anyone listening to
those files would have heard the sacrificial breath at the start —
misleading for ear-testing the fix.

Mirror trim_leading_cold_start (and its constants) inline in the probe.
For short prompts with a sacrificial prefix, write both files:

  /tmp/prod_<label>_s<seed>_raw.wav      — raw engine output
  /tmp/prod_<label>_s<seed>_trimmed.wav  — what production actually plays

Long prompts (no sacrificial, no trim) only get the _raw variant since
that's what synth_chunk returns for them in production.

Header rewritten to match. Sample data after the change:

  imhappy_sac_s99999:  raw 472ms (gap 50..144ms) → trim 328ms
  yep_sac_s42:         raw 270ms (gap 30..141ms) → trim 129ms
  imhappy_sac_s314159: raw 730ms (gap 43..339ms) → trim 391ms

(Trim length == raw_len - gap_end_ms, matching expectations.)

The inline trim is a deliberate copy of the one in huddle::pocket — the
example sits in desktop/src-tauri/examples, which can't reach into the
private huddle module. A comment at the top of the constants block flags
the 'keep in sync' contract.

All 330 cargo test --lib still pass; file-sizes still green.

Non-blocking cleanup from Max's review of 61d064d.

Signed-off-by: Tyler Longwell <tlongwell@squareup.com>
Signed-off-by: npub1cc3ha7z055mu0rwwu7806t2wt8mj3pvu0uv5mfp2c50dahaqhczshdalg6 <c6237ef84fa537c78dcee78efd2d4e59f728859c7f194da42ac51ededfa0be05@sprout-oss.stage.blox.sqprod.co>
---
 desktop/src-tauri/examples/prod_probe.rs | 86 ++++++++++++++++++++----
 1 file changed, 73 insertions(+), 13 deletions(-)

diff --git a/desktop/src-tauri/examples/prod_probe.rs b/desktop/src-tauri/examples/prod_probe.rs
index 9a8798608..cd79657d6 100644
--- a/desktop/src-tauri/examples/prod_probe.rs
+++ b/desktop/src-tauri/examples/prod_probe.rs
@@ -22,9 +22,15 @@
 //!   cargo run --release --example prod_probe
 //!   cargo run --release --example prod_probe /path/to/pocket-tts
 //!
-//! Output: /tmp/prod_<label>_s<seed>.wav. The "no sacrificial" variants
-//! show what production produced before the fix; the "_sac" variants show
-//! the new path. Listen back with `afplay`.
+//! Output (per (label, seed) pair):
+//!   /tmp/prod_<label>_s<seed>_raw.wav      — raw engine output
+//!   /tmp/prod_<label>_s<seed>_trimmed.wav  — post-trim (what production
+//!                                            ships, for `_sac` labels)
+//!
+//! The "no sacrificial" variants have no trim applied (just _raw); they
+//! show what production produces for long prompts. The "_sac" variants
+//! write both files; _trimmed is what `huddle::pocket::synth_chunk`
+//! returns for short prompts. Listen back with `afplay`.
 
 use std::collections::HashMap;
 use std::path::PathBuf;
@@ -37,6 +43,40 @@ use sherpa_onnx::{
 const SAMPLE_RATE: u32 = 24_000;
 const SHORT_PROMPT_MAX_FRAMES: i32 = 100;
 
+// Mirror of huddle::pocket trim constants so the probe stays in sync with
+// production. If you change either side, change both.
+const TRIM_SCAN_START_SAMPLES: usize = (SAMPLE_RATE as usize * 30) / 1000;
+const TRIM_SILENCE_THRESHOLD: f32 = 0.02;
+const TRIM_MIN_GAP_SAMPLES: usize = (SAMPLE_RATE as usize * 50) / 1000;
+const TRIM_MAX_DROP_SAMPLES: usize = (SAMPLE_RATE as usize * 1200) / 1000;
+
+/// Mirror of `huddle::pocket::trim_leading_cold_start` — keep in sync.
+fn trim_leading_cold_start(samples: &mut Vec<f32>) {
+    if samples.len() <= TRIM_SCAN_START_SAMPLES {
+        return;
+    }
+    let mut silence_run_start: Option<usize> = None;
+    let mut gap_end: Option<usize> = None;
+    for (i, sample) in samples.iter().enumerate().skip(TRIM_SCAN_START_SAMPLES) {
+        if sample.abs() < TRIM_SILENCE_THRESHOLD {
+            silence_run_start.get_or_insert(i);
+        } else if let Some(start) = silence_run_start {
+            if i - start >= TRIM_MIN_GAP_SAMPLES {
+                gap_end = Some(i);
+                break;
+            }
+            silence_run_start = None;
+        }
+    }
+    let Some(end) = gap_end else {
+        return;
+    };
+    if end > TRIM_MAX_DROP_SAMPLES {
+        return;
+    }
+    samples.drain(..end);
+}
+
 // (label, raw_text_before_prep, sacrificial_prefix_to_add_after_pad)
 const TESTS: &[(&str, &str, &str)] = &[
     // Short, previously-failing — sacrificial applied
@@ -84,10 +124,10 @@ fn main() {
     let seeds: &[i32] = &[42, 1337, 99999, 7, 314159];
 
     println!(
-        "{:18} | {:>6} | {:>7} | gap_search (50ms,0.02) | path",
-        "test", "seed", "len_ms"
+        "{:18} | {:>6} | {:>5} | {:>7} | gap_search (50ms,0.02) | path",
+        "test", "seed", "kind", "len_ms"
     );
-    println!("{}", "-".repeat(100));
+    println!("{}", "-".repeat(110));
 
     for (label, raw_text, sacrificial) in TESTS {
         // Mirror huddle::pocket::prepare_pocket_prompt:
@@ -99,6 +139,9 @@ fn main() {
         let is_short = word_count <= 4;
         let pad = if is_short { "        " } else { "" };
         let prompt = format!("{pad}{sacrificial}{cleaned}");
+        // Only short prompts get the post-synth trim in production; long
+        // prompts pass through unmodified.
+        let trim_in_production = is_short && !sacrificial.is_empty();
 
         for seed in seeds {
             let mut extra: HashMap<String, serde_json::Value> = HashMap::new();
@@ -121,15 +164,32 @@ fn main() {
             let audio = engine
                 .generate_with_config(&prompt, &gen, None::<fn(&[f32], f32) -> bool>)
                 .expect("synth");
-            let samples = audio.samples();
-            let len_ms = samples.len() as f32 / SAMPLE_RATE as f32 * 1000.0;
-            let gap = find_gap(samples, SAMPLE_RATE, 0.02, 50);
-            let path = format!("/tmp/prod_{}_s{}.wav", label, seed);
-            sherpa_onnx::write(&path, samples, SAMPLE_RATE as i32);
+
+            // Raw output (what the engine returned).
+            let raw_samples = audio.samples().to_vec();
+            let raw_ms = raw_samples.len() as f32 / SAMPLE_RATE as f32 * 1000.0;
+            let raw_gap = find_gap(&raw_samples, SAMPLE_RATE, 0.02, 50);
+            let raw_path = format!("/tmp/prod_{}_s{}_raw.wav", label, seed);
+            sherpa_onnx::write(&raw_path, &raw_samples, SAMPLE_RATE as i32);
             println!(
-                "{:18} | {:>6} | {:>5.0}ms | {:>22} | {}",
-                label, seed, len_ms, gap, path
+                "{:18} | {:>6} | {:>5} | {:>5.0}ms | {:>22} | {}",
+                label, seed, "raw", raw_ms, raw_gap, raw_path
             );
+
+            // Trimmed output — what synth_chunk actually returns to tts.rs
+            // for short, sacrificial-prefixed prompts. Other prompts are
+            // returned untrimmed, so we skip writing a separate file.
+            if trim_in_production {
+                let mut trimmed = raw_samples;
+                trim_leading_cold_start(&mut trimmed);
+                let trimmed_ms = trimmed.len() as f32 / SAMPLE_RATE as f32 * 1000.0;
+                let trimmed_path = format!("/tmp/prod_{}_s{}_trimmed.wav", label, seed);
+                sherpa_onnx::write(&trimmed_path, &trimmed, SAMPLE_RATE as i32);
+                println!(
+                    "{:18} | {:>6} | {:>5} | {:>5.0}ms | {:>22} | {}",
+                    label, seed, "trim", trimmed_ms, "(post-trim)", trimmed_path
+                );
+            }
         }
         println!();
     }

From f077571ab019256de5f0e2462691589782148f7b Mon Sep 17 00:00:00 2001
From: Tyler Longwell <tlongwell@squareup.com>
Date: Mon, 18 May 2026 13:53:28 -0400
Subject: [PATCH 09/10] huddle(tts): cushion every sentence onset

Signed-off-by: Tyler Longwell <tlongwell@squareup.com>
---
 desktop/src-tauri/src/huddle/pocket.rs |   7 +-
 desktop/src-tauri/src/huddle/tts.rs    | 176 ++++++++++++++-----------
 2 files changed, 101 insertions(+), 82 deletions(-)

diff --git a/desktop/src-tauri/src/huddle/pocket.rs b/desktop/src-tauri/src/huddle/pocket.rs
index 042361b00..9aa040b64 100644
--- a/desktop/src-tauri/src/huddle/pocket.rs
+++ b/desktop/src-tauri/src/huddle/pocket.rs
@@ -422,10 +422,9 @@ pub(crate) fn prepare_pocket_prompt(input: &str) -> Option<PreparedPrompt> {
 ///      fallback is to play the slightly-degraded raw audio rather than
 ///      delete real speech.
 ///   4. Otherwise, drop the leading samples up to the end of the silence
-///      run. We don't insert a zero lead-in here — `tts.rs` already adds
-///      `FIRST_APPEND_LEAD_IN_SAMPLES` of zeros on the first append of an
-///      utterance, and subsequent sentences are buffered by
-///      `INTER_SENTENCE_SILENCE`.
+///      run. We don't insert a zero lead-in here — `tts.rs` owns playback
+///      cushioning by prepending `SENTENCE_LEAD_IN_SAMPLES` of zeros before
+///      each appended sentence chunk.
 ///
 /// If the scan never finds a long-enough gap (≈1% of generations in the
 /// production-config probe), the function is a no-op — the model trajectory
diff --git a/desktop/src-tauri/src/huddle/tts.rs b/desktop/src-tauri/src/huddle/tts.rs
index 50d960811..fd57c18b0 100644
--- a/desktop/src-tauri/src/huddle/tts.rs
+++ b/desktop/src-tauri/src/huddle/tts.rs
@@ -88,15 +88,18 @@ const MAX_GAIN: f32 = 8.0;
 /// motivated removing the leading fade.
 const FADE_OUT_SAMPLES: usize = (SAMPLE_RATE as f64 * 0.008) as usize;
 
-/// Length of the zero-sample cushion prepended to the very first audio
-/// buffer of an utterance, so the OS audio device / rodio mixer has a
-/// fully-quiet ramp-up window before the real onset hits.
+/// Length of the zero-sample cushion prepended before each synthesized
+/// sentence chunk, so the OS audio device / rodio mixer has a fully-quiet
+/// ramp-up window before the real onset hits.
 ///
-/// Applied at the `first_append` site only — *not* per sentence — so it
-/// doesn't stack on top of `INTER_SENTENCE_SILENCE` at sentence boundaries.
-/// 20 ms ≈ 480 samples is enough to cover a CoreAudio buffer turnover
-/// without being audible as latency.
-const FIRST_APPEND_LEAD_IN_SAMPLES: usize = (SAMPLE_RATE as f64 * 0.020) as usize;
+/// This used to be applied only before the first sentence of a whole response.
+/// That still left later sentence chunks vulnerable to first-syllable clipping
+/// when their first phoneme was soft (notably `I'm` / `I've`) and rodio crossed
+/// from an explicit silence buffer straight into non-zero speech. 20 ms ≈ 480
+/// samples is enough to cover a CoreAudio buffer turnover without being audible
+/// as latency. At sentence boundaries this lead-in is budgeted out of the
+/// existing inter-sentence pause, so it does not lengthen multi-sentence gaps.
+const SENTENCE_LEAD_IN_SAMPLES: usize = (SAMPLE_RATE as f64 * 0.020) as usize;
 
 /// Sentence-by-sentence synthesis — keeps first-sentence latency low and lets
 /// playback of sentence N overlap with synthesis of sentence N+1 (see the
@@ -414,12 +417,11 @@ fn tts_worker(
                     // 2026-05-18 "first little sound is missing" regression).
                     apply_fade_out(&mut boosted);
 
-                    // Decide what to append, including the one-shot lead-in
-                    // pad on the first sentence. Centralised in
-                    // `build_sentence_append_plan` so the "lead-in fires
-                    // exactly once per utterance" invariant is testable
-                    // without a rodio mock. See its docstring + the
-                    // `lead_in_pad_fires_exactly_once_per_utterance` test.
+                    // Decide what to append, including the short lead-in pad
+                    // before this sentence chunk. Centralised in
+                    // `build_sentence_append_plan` so the "each chunk gets a
+                    // device cushion" invariant is testable without a rodio
+                    // mock. See its docstring and regression tests below.
                     let was_first = first_append;
                     let plan =
                         build_sentence_append_plan(&mut first_append, boosted, silence_buf.len());
@@ -530,8 +532,8 @@ fn normalize_for_playback(samples: Vec<f32>) -> Vec<f32> {
 /// The first sample of Pocket output measures ≈ 0.0018 (≈ −54 dBFS) — well
 /// below the threshold at which a DC-jump would be audible as a click — so
 /// no fade-in is needed. The OS audio device gets its quiet ramp-up window
-/// from `FIRST_APPEND_LEAD_IN_SAMPLES` instead, applied once per utterance
-/// at the `first_append` site.
+/// from `SENTENCE_LEAD_IN_SAMPLES` instead, inserted as pure silence before
+/// each sentence buffer.
 fn apply_fade_out(samples: &mut [f32]) {
     let len = samples.len();
     let fade = FADE_OUT_SAMPLES.min(len / 2);
@@ -541,32 +543,35 @@ fn apply_fade_out(samples: &mut [f32]) {
 }
 
 /// Build the ordered list of buffers to append to the rodio `Player` for one
-/// synthesised sentence, including the one-shot lead-in pad on the *first*
-/// sentence of an utterance.
+/// synthesised sentence.
 ///
-/// Returns either three buffers (lead-in pad + audio + inter-sentence
-/// silence) on the first call of an utterance, or two buffers (audio +
-/// inter-sentence silence) on every subsequent call. Flips
-/// `*first_append` from `true` → `false` after producing the lead-in.
+/// Every sentence chunk gets a short lead-in pad immediately before its audio.
+/// This matters for chunks that start with soft first phonemes (`I'm`, `I've`):
+/// the sacrificial-prefix trim intentionally returns audio whose first sample is
+/// already speech, so the playback layer must provide the device/mixer cushion.
+/// To keep the audible gap unchanged, the trailing silence after this chunk is
+/// shortened by the same amount (`silence_buf_len - SENTENCE_LEAD_IN_SAMPLES`):
+/// sentence N contributes 80 ms of post-speech silence and sentence N+1
+/// contributes the remaining 20 ms of pre-speech cushion.
 ///
-/// Extracted from the worker loop so the "lead-in fires exactly once per
-/// utterance" invariant is testable without mocking rodio. See
-/// `lead_in_pad_fires_exactly_once_per_utterance` for the regression test
-/// that catches the only-bad-version of this pad: accidentally moving it
-/// inside the per-sentence loop and stacking on top of
-/// `INTER_SENTENCE_SILENCE` at every sentence boundary.
+/// `first_append` is still accepted and flipped on the first call because the
+/// worker uses it to decide when actual playback has been queued and when it is
+/// safe to set `tts_active` for echo gating.
 fn build_sentence_append_plan(
     first_append: &mut bool,
     boosted: Vec<f32>,
     silence_buf_len: usize,
 ) -> Vec<Vec<f32>> {
     let mut plan = Vec::with_capacity(3);
+    plan.push(vec![0.0f32; SENTENCE_LEAD_IN_SAMPLES]);
     if *first_append {
-        plan.push(vec![0.0f32; FIRST_APPEND_LEAD_IN_SAMPLES]);
         *first_append = false;
     }
     plan.push(boosted);
-    plan.push(vec![0.0f32; silence_buf_len]);
+    plan.push(vec![
+        0.0f32;
+        silence_buf_len.saturating_sub(SENTENCE_LEAD_IN_SAMPLES)
+    ]);
     plan
 }
 
@@ -1164,28 +1169,27 @@ mod tests {
         assert_eq!(samples[0], 1.0);
     }
 
-    /// Sanity-check the first-append cushion length: 20 ms at 24 kHz must
+    /// Sanity-check the per-sentence cushion length: 20 ms at 24 kHz must
     /// land at exactly 480 samples. This is a const computation, so the
     /// real value of this test is documenting *why* 20 ms was chosen — it
     /// covers a typical CoreAudio buffer turnover (256–1024 samples)
     /// without being audible as user-facing latency.
     #[test]
-    fn first_append_lead_in_is_sane() {
-        assert_eq!(FIRST_APPEND_LEAD_IN_SAMPLES, 480, "20 ms × 24 kHz");
+    fn sentence_lead_in_is_sane() {
+        assert_eq!(SENTENCE_LEAD_IN_SAMPLES, 480, "20 ms × 24 kHz");
     }
 
     // ── build_sentence_append_plan tests ──────────────────────────────────────
 
-    /// REGRESSION (Max, 2026-05-18 review): the lead-in pad must fire
-    /// **exactly once per utterance**, never per sentence. Catches the
-    /// only-bad-version of this pad — accidentally moving the
-    /// `if first_append` check inside the per-sentence loop, which would
-    /// stack 20 ms of silence on top of `INTER_SENTENCE_SILENCE` at every
-    /// sentence boundary and audibly slow down multi-sentence utterances.
+    /// REGRESSION (Tyler, 2026-05-18): a response can contain later sentence
+    /// chunks that begin with soft first phonemes like `I'm` / `I've`. The
+    /// sacrificial-prefix trimmer returns those chunks with speech at sample 0,
+    /// so every appended sentence needs its own playback cushion, not just the
+    /// first sentence of the response.
     #[test]
-    fn lead_in_pad_fires_exactly_once_per_utterance() {
+    fn lead_in_pad_fires_for_every_sentence_chunk() {
         const SENTENCE_AUDIO_LEN: usize = 1000;
-        const SILENCE_BUF_LEN: usize = 240; // arbitrary; matches a 10 ms inter-sentence buffer
+        const SILENCE_BUF_LEN: usize = 2400; // 100 ms at 24 kHz, like production
         const N_SENTENCES: usize = 5;
 
         let mut first = true;
@@ -1193,45 +1197,37 @@ mod tests {
         let mut total_audio_buffers = 0;
         let mut total_inter_silence_buffers = 0;
 
-        for i in 0..N_SENTENCES {
+        for _ in 0..N_SENTENCES {
             let plan = build_sentence_append_plan(
                 &mut first,
                 vec![0.5_f32; SENTENCE_AUDIO_LEN],
                 SILENCE_BUF_LEN,
             );
 
-            if i == 0 {
-                assert_eq!(
-                    plan.len(),
-                    3,
-                    "first sentence emits [lead-in, audio, inter-silence]"
-                );
-                assert_eq!(plan[0].len(), FIRST_APPEND_LEAD_IN_SAMPLES);
-                assert!(
-                    plan[0].iter().all(|&s| s == 0.0),
-                    "lead-in pad must be pure silence"
-                );
-                assert_eq!(plan[1].len(), SENTENCE_AUDIO_LEN);
-                assert_eq!(plan[2].len(), SILENCE_BUF_LEN);
-                total_lead_in_buffers += 1;
-                total_audio_buffers += 1;
-                total_inter_silence_buffers += 1;
-            } else {
-                assert_eq!(
-                    plan.len(),
-                    2,
-                    "subsequent sentences emit [audio, inter-silence] only"
-                );
-                assert_eq!(plan[0].len(), SENTENCE_AUDIO_LEN);
-                assert_eq!(plan[1].len(), SILENCE_BUF_LEN);
-                total_audio_buffers += 1;
-                total_inter_silence_buffers += 1;
-            }
+            assert_eq!(
+                plan.len(),
+                3,
+                "every sentence emits [lead-in, audio, inter-silence]"
+            );
+            assert_eq!(plan[0].len(), SENTENCE_LEAD_IN_SAMPLES);
+            assert!(
+                plan[0].iter().all(|&s| s == 0.0),
+                "lead-in pad must be pure silence"
+            );
+            assert_eq!(plan[1].len(), SENTENCE_AUDIO_LEN);
+            assert_eq!(
+                plan[2].len(),
+                SILENCE_BUF_LEN - SENTENCE_LEAD_IN_SAMPLES,
+                "trailing silence is shortened so lead-in + tail keeps the existing sentence gap"
+            );
+            total_lead_in_buffers += 1;
+            total_audio_buffers += 1;
+            total_inter_silence_buffers += 1;
         }
 
         assert_eq!(
-            total_lead_in_buffers, 1,
-            "lead-in must fire exactly once per utterance, not {} times",
+            total_lead_in_buffers, N_SENTENCES,
+            "lead-in must fire for every sentence chunk, not {} times",
             total_lead_in_buffers
         );
         assert_eq!(total_audio_buffers, N_SENTENCES);
@@ -1240,17 +1236,18 @@ mod tests {
     }
 
     /// The plan flips `first_append` from true → false on the very first
-    /// call, so subsequent calls produce no lead-in even if called with the
-    /// same mutable flag.
+    /// call so the worker still knows exactly when it has queued the first
+    /// playable audio and can set `tts_active` for echo gating.
     #[test]
     fn build_sentence_append_plan_flips_first_append() {
         let mut first = true;
         let _ = build_sentence_append_plan(&mut first, vec![0.5; 100], 24);
         assert!(!first, "first call must flip the flag");
 
-        // Subsequent call: no lead-in, flag stays false.
+        // Subsequent call: still has a per-sentence lead-in, flag stays false.
         let plan = build_sentence_append_plan(&mut first, vec![0.5; 100], 24);
-        assert_eq!(plan.len(), 2);
+        assert_eq!(plan.len(), 3);
+        assert_eq!(plan[0].len(), SENTENCE_LEAD_IN_SAMPLES);
         assert!(!first);
     }
 
@@ -1261,14 +1258,37 @@ mod tests {
     #[test]
     fn first_sentence_leading_silence_is_exactly_lead_in() {
         let mut first = true;
-        let plan = build_sentence_append_plan(&mut first, vec![0.5; 100], 240);
-        // First buffer is lead-in pad, exactly FIRST_APPEND_LEAD_IN_SAMPLES.
-        assert_eq!(plan[0].len(), FIRST_APPEND_LEAD_IN_SAMPLES);
+        let plan = build_sentence_append_plan(&mut first, vec![0.5; 100], 2400);
+        // First buffer is lead-in pad, exactly SENTENCE_LEAD_IN_SAMPLES.
+        assert_eq!(plan[0].len(), SENTENCE_LEAD_IN_SAMPLES);
         // Second buffer is the audio (non-silent), so total leading silence
         // before any audio is heard is exactly the lead-in.
         assert!(plan[1].iter().any(|&s| s != 0.0));
     }
 
+    /// The per-sentence lead-in is budgeted out of the existing trailing
+    /// silence, so `audio + tail silence + next lead-in + next audio` keeps
+    /// the same 100 ms inter-sentence pause while still cushioning the next
+    /// onset.
+    #[test]
+    fn sentence_gap_budget_is_preserved() {
+        let mut first = true;
+        let silence_buf_len = 2400;
+        let first_plan = build_sentence_append_plan(&mut first, vec![0.5; 100], silence_buf_len);
+        let second_plan = build_sentence_append_plan(&mut first, vec![0.5; 100], silence_buf_len);
+
+        assert_eq!(
+            first_plan[2].len(),
+            silence_buf_len - SENTENCE_LEAD_IN_SAMPLES
+        );
+        assert_eq!(second_plan[0].len(), SENTENCE_LEAD_IN_SAMPLES);
+        assert_eq!(
+            first_plan[2].len() + second_plan[0].len(),
+            silence_buf_len,
+            "post-speech tail plus next lead-in should preserve total sentence gap"
+        );
+    }
+
     // ── normalize_for_playback tests ──────────────────────────────────────────
 
     /// A moderately quiet buffer (peak comfortably under MAX_GAIN reach of

From f00421cd7991626bf47c8b3c7dace7e83778eb2e Mon Sep 17 00:00:00 2001
From: Tyler Longwell <tlongwell@squareup.com>
Date: Mon, 18 May 2026 14:06:06 -0400
Subject: [PATCH 10/10] huddle(tts): keep sentence cushion in one source

Signed-off-by: Tyler Longwell <tlongwell@squareup.com>
---
 desktop/src-tauri/src/huddle/tts.rs | 162 +++++++++++++---------------
 1 file changed, 74 insertions(+), 88 deletions(-)

diff --git a/desktop/src-tauri/src/huddle/tts.rs b/desktop/src-tauri/src/huddle/tts.rs
index fd57c18b0..5a7f06c64 100644
--- a/desktop/src-tauri/src/huddle/tts.rs
+++ b/desktop/src-tauri/src/huddle/tts.rs
@@ -417,17 +417,15 @@ fn tts_worker(
                     // 2026-05-18 "first little sound is missing" regression).
                     apply_fade_out(&mut boosted);
 
-                    // Decide what to append, including the short lead-in pad
-                    // before this sentence chunk. Centralised in
-                    // `build_sentence_append_plan` so the "each chunk gets a
-                    // device cushion" invariant is testable without a rodio
-                    // mock. See its docstring and regression tests below.
+                    // Build one contiguous buffer per synthesized sentence:
+                    // lead-in cushion + audio + trailing gap. Keeping this as
+                    // a single rodio source preserves the original queue/drain
+                    // semantics (one append per sentence) while still giving
+                    // every chunk a quiet device warm-up window.
                     let was_first = first_append;
-                    let plan =
-                        build_sentence_append_plan(&mut first_append, boosted, silence_buf.len());
-                    for buf in plan {
-                        player.append(SamplesBuffer::new(channels, rate, buf));
-                    }
+                    let buf =
+                        build_sentence_append_buffer(&mut first_append, boosted, silence_buf.len());
+                    player.append(SamplesBuffer::new(channels, rate, buf));
                     if was_first {
                         tts_active.store(true, Ordering::Release);
                     }
@@ -542,8 +540,8 @@ fn apply_fade_out(samples: &mut [f32]) {
     }
 }
 
-/// Build the ordered list of buffers to append to the rodio `Player` for one
-/// synthesised sentence.
+/// Build the single buffer appended to the rodio `Player` for one synthesised
+/// sentence.
 ///
 /// Every sentence chunk gets a short lead-in pad immediately before its audio.
 /// This matters for chunks that start with soft first phonemes (`I'm`, `I've`):
@@ -554,25 +552,30 @@ fn apply_fade_out(samples: &mut [f32]) {
 /// sentence N contributes 80 ms of post-speech silence and sentence N+1
 /// contributes the remaining 20 ms of pre-speech cushion.
 ///
+/// The lead-in, audio, and trailing silence are concatenated into one
+/// `SamplesBuffer` before appending. This keeps rodio's queue shape at one
+/// tracked source per synthesized sentence, avoiding source-boundary/drain
+/// regressions from enqueueing the lead-in, audio, and tail as separate sounds.
+///
 /// `first_append` is still accepted and flipped on the first call because the
 /// worker uses it to decide when actual playback has been queued and when it is
 /// safe to set `tts_active` for echo gating.
-fn build_sentence_append_plan(
+fn build_sentence_append_buffer(
     first_append: &mut bool,
     boosted: Vec<f32>,
     silence_buf_len: usize,
-) -> Vec<Vec<f32>> {
-    let mut plan = Vec::with_capacity(3);
-    plan.push(vec![0.0f32; SENTENCE_LEAD_IN_SAMPLES]);
+) -> Vec<f32> {
     if *first_append {
         *first_append = false;
     }
-    plan.push(boosted);
-    plan.push(vec![
-        0.0f32;
-        silence_buf_len.saturating_sub(SENTENCE_LEAD_IN_SAMPLES)
-    ]);
-    plan
+
+    let trailing_silence_len = silence_buf_len.saturating_sub(SENTENCE_LEAD_IN_SAMPLES);
+    let mut buf =
+        Vec::with_capacity(SENTENCE_LEAD_IN_SAMPLES + boosted.len() + trailing_silence_len);
+    buf.extend(std::iter::repeat_n(0.0_f32, SENTENCE_LEAD_IN_SAMPLES));
+    buf.extend(boosted);
+    buf.extend(std::iter::repeat_n(0.0_f32, trailing_silence_len));
+    buf
 }
 
 // drain_until_shutdown lives in super (huddle/mod.rs) — shared with stt.rs.
@@ -1179,113 +1182,96 @@ mod tests {
         assert_eq!(SENTENCE_LEAD_IN_SAMPLES, 480, "20 ms × 24 kHz");
     }
 
-    // ── build_sentence_append_plan tests ──────────────────────────────────────
+    // ── build_sentence_append_buffer tests ───────────────────────────────────
 
-    /// REGRESSION (Tyler, 2026-05-18): a response can contain later sentence
-    /// chunks that begin with soft first phonemes like `I'm` / `I've`. The
-    /// sacrificial-prefix trimmer returns those chunks with speech at sample 0,
-    /// so every appended sentence needs its own playback cushion, not just the
-    /// first sentence of the response.
+    /// REGRESSION: every chunk needs an onset cushion; trimmed short chunks
+    /// can start with speech at sample 0.
     #[test]
-    fn lead_in_pad_fires_for_every_sentence_chunk() {
+    fn lead_in_pad_is_present_for_every_sentence_chunk() {
         const SENTENCE_AUDIO_LEN: usize = 1000;
         const SILENCE_BUF_LEN: usize = 2400; // 100 ms at 24 kHz, like production
         const N_SENTENCES: usize = 5;
 
         let mut first = true;
-        let mut total_lead_in_buffers = 0;
-        let mut total_audio_buffers = 0;
-        let mut total_inter_silence_buffers = 0;
 
         for _ in 0..N_SENTENCES {
-            let plan = build_sentence_append_plan(
+            let buf = build_sentence_append_buffer(
                 &mut first,
                 vec![0.5_f32; SENTENCE_AUDIO_LEN],
                 SILENCE_BUF_LEN,
             );
 
-            assert_eq!(
-                plan.len(),
-                3,
-                "every sentence emits [lead-in, audio, inter-silence]"
-            );
-            assert_eq!(plan[0].len(), SENTENCE_LEAD_IN_SAMPLES);
+            assert_eq!(buf.len(), SENTENCE_AUDIO_LEN + SILENCE_BUF_LEN);
             assert!(
-                plan[0].iter().all(|&s| s == 0.0),
+                buf[..SENTENCE_LEAD_IN_SAMPLES].iter().all(|&s| s == 0.0),
                 "lead-in pad must be pure silence"
             );
-            assert_eq!(plan[1].len(), SENTENCE_AUDIO_LEN);
-            assert_eq!(
-                plan[2].len(),
-                SILENCE_BUF_LEN - SENTENCE_LEAD_IN_SAMPLES,
-                "trailing silence is shortened so lead-in + tail keeps the existing sentence gap"
+            assert!(
+                buf[SENTENCE_LEAD_IN_SAMPLES..SENTENCE_LEAD_IN_SAMPLES + SENTENCE_AUDIO_LEN]
+                    .iter()
+                    .all(|&s| s == 0.5),
+                "sentence audio must immediately follow the lead-in"
+            );
+            assert!(
+                buf[SENTENCE_LEAD_IN_SAMPLES + SENTENCE_AUDIO_LEN..]
+                    .iter()
+                    .all(|&s| s == 0.0),
+                "trailing gap must be pure silence"
             );
-            total_lead_in_buffers += 1;
-            total_audio_buffers += 1;
-            total_inter_silence_buffers += 1;
         }
 
-        assert_eq!(
-            total_lead_in_buffers, N_SENTENCES,
-            "lead-in must fire for every sentence chunk, not {} times",
-            total_lead_in_buffers
-        );
-        assert_eq!(total_audio_buffers, N_SENTENCES);
-        assert_eq!(total_inter_silence_buffers, N_SENTENCES);
         assert!(!first, "first_append flag must be cleared after first call");
     }
 
-    /// The plan flips `first_append` from true → false on the very first
-    /// call so the worker still knows exactly when it has queued the first
-    /// playable audio and can set `tts_active` for echo gating.
+    /// `first_append` still flips on the first call for `tts_active` gating.
     #[test]
-    fn build_sentence_append_plan_flips_first_append() {
+    fn build_sentence_append_buffer_flips_first_append() {
         let mut first = true;
-        let _ = build_sentence_append_plan(&mut first, vec![0.5; 100], 24);
+        let _ = build_sentence_append_buffer(&mut first, vec![0.5; 100], 2400);
         assert!(!first, "first call must flip the flag");
 
         // Subsequent call: still has a per-sentence lead-in, flag stays false.
-        let plan = build_sentence_append_plan(&mut first, vec![0.5; 100], 24);
-        assert_eq!(plan.len(), 3);
-        assert_eq!(plan[0].len(), SENTENCE_LEAD_IN_SAMPLES);
+        let buf = build_sentence_append_buffer(&mut first, vec![0.5; 100], 2400);
+        assert!(buf[..SENTENCE_LEAD_IN_SAMPLES].iter().all(|&s| s == 0.0));
         assert!(!first);
     }
 
-    /// Total leading silence on the first sentence is *exactly* the lead-in
-    /// pad — it does NOT double-count the inter-sentence silence at the
-    /// start of the plan. Inter-sentence silence is emitted *after* audio,
-    /// never before. (Max's [12] concern.)
+    /// Leading silence is exactly the lead-in; no pre-audio gap is double-counted.
     #[test]
     fn first_sentence_leading_silence_is_exactly_lead_in() {
         let mut first = true;
-        let plan = build_sentence_append_plan(&mut first, vec![0.5; 100], 2400);
-        // First buffer is lead-in pad, exactly SENTENCE_LEAD_IN_SAMPLES.
-        assert_eq!(plan[0].len(), SENTENCE_LEAD_IN_SAMPLES);
-        // Second buffer is the audio (non-silent), so total leading silence
-        // before any audio is heard is exactly the lead-in.
-        assert!(plan[1].iter().any(|&s| s != 0.0));
+        let buf = build_sentence_append_buffer(&mut first, vec![0.5; 100], 2400);
+        assert!(buf[..SENTENCE_LEAD_IN_SAMPLES].iter().all(|&s| s == 0.0));
+        assert_eq!(buf[SENTENCE_LEAD_IN_SAMPLES], 0.5);
     }
 
-    /// The per-sentence lead-in is budgeted out of the existing trailing
-    /// silence, so `audio + tail silence + next lead-in + next audio` keeps
-    /// the same 100 ms inter-sentence pause while still cushioning the next
-    /// onset.
+    /// Tail silence plus the next lead-in preserves the 100 ms sentence gap.
     #[test]
     fn sentence_gap_budget_is_preserved() {
         let mut first = true;
         let silence_buf_len = 2400;
-        let first_plan = build_sentence_append_plan(&mut first, vec![0.5; 100], silence_buf_len);
-        let second_plan = build_sentence_append_plan(&mut first, vec![0.5; 100], silence_buf_len);
+        let first_buf = build_sentence_append_buffer(&mut first, vec![0.5; 100], silence_buf_len);
+        let second_buf = build_sentence_append_buffer(&mut first, vec![0.5; 100], silence_buf_len);
+
+        let first_tail = &first_buf[SENTENCE_LEAD_IN_SAMPLES + 100..];
+        let second_lead = &second_buf[..SENTENCE_LEAD_IN_SAMPLES];
+        assert_eq!(first_tail.len(), silence_buf_len - SENTENCE_LEAD_IN_SAMPLES);
+        assert_eq!(second_lead.len(), SENTENCE_LEAD_IN_SAMPLES);
+        assert_eq!(first_tail.len() + second_lead.len(), silence_buf_len);
+    }
 
-        assert_eq!(
-            first_plan[2].len(),
-            silence_buf_len - SENTENCE_LEAD_IN_SAMPLES
-        );
-        assert_eq!(second_plan[0].len(), SENTENCE_LEAD_IN_SAMPLES);
-        assert_eq!(
-            first_plan[2].len() + second_plan[0].len(),
-            silence_buf_len,
-            "post-speech tail plus next lead-in should preserve total sentence gap"
+    /// Regression guard: one contiguous rodio source per synthesized sentence.
+    #[test]
+    fn sentence_append_buffer_is_one_contiguous_source() {
+        let mut first = true;
+        let buf = build_sentence_append_buffer(&mut first, vec![0.5; 100], 2400);
+
+        assert_eq!(buf.len(), 2400 + 100);
+        assert!(buf[..SENTENCE_LEAD_IN_SAMPLES].iter().all(|&s| s == 0.0));
+        assert!(
+            buf[SENTENCE_LEAD_IN_SAMPLES..SENTENCE_LEAD_IN_SAMPLES + 100]
+                .iter()
+                .all(|&s| s == 0.5)
         );
     }