diff --git a/README.md b/README.md
index d8cca13..c72eb94 100644
--- a/README.md
+++ b/README.md
@@ -37,14 +37,14 @@ matching chat template).
 | OLMoE 1B-7B Instruct (MoE) | [allenai](https://huggingface.co/allenai/OLMoE-1B-7B-0924-Instruct-GGUF) | 4 GB | CPU | 21.6 | 55.7 | 64 experts / 8 active; per-channel QK-norm; `norm_topk_prob=false` |
 | OLMoE 1B-7B Instruct (MoE) | (same) | 4 GB | Vulkan `-g -1` | 18.9 | **121.2** | 16 layers all on VRAM; greedy on this prompt is unstable across backends — use `--temp 0.6 --top-p 0.95` for usable output |
 | OLMoE 1B-7B Instruct (MoE) | (same) | 4 GB | **CUDA** `-g -1` | **117.4** | **111.7** | Same; greedy varies, sampling coherent |
-| Qwen3-Coder 30B-A3B (MoE) | [Qwen](https://huggingface.co/Qwen/Qwen3-Coder-30B-A3B-Instruct-GGUF) | 17 GB | CPU | 15.1 | 21.2 | 128 experts / 8 active |
-| Qwen3-Coder 30B-A3B (MoE) | (same) | 17 GB | CPU `--tq` | 12.0 | 21.1 | 3-bit KV. FastScan K + V kernels (issue #34) keep attention cost bounded as context grows: **15.5 t/s decode @ 3.2K ctx** (27 % slowdown for ~27× context growth); without FastScan the per-block K+V path would drop this to ~13 t/s |
-| Qwen3-Coder 30B-A3B (MoE) | (same) | 17 GB | Vulkan `-g -1` (hybrid) | 1.0 | 5.8 | 29 GPU + 19 CPU layers, SLRU expert slot cache |
-| Qwen3-Coder 30B-A3B (MoE) | (same) | 17 GB | **CUDA** `-g -1` (hybrid) | **13.9** | **22.7** | 29 GPU + 19 CPU layers (auto), ~2.2× Vulkan decode |
-| Llama-4 Scout 17B-16E (MoE) | [meta-llama](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct) | 61 GB | CPU | 1.9 | 3.9 | 48 layers, 17B active params; split GGUF (Q4_K_M) |
-| Llama-4 Scout 17B-16E (MoE) | (same) | 61 GB | CUDA `-g -1` (hybrid) | 0.9 | 2.1 | 7 GPU + 41 CPU layers — model dwarfs the 12 GB card, PCIe cost > GPU speedup so CPU-only wins here |
-| Qwen3.6-35B-A3B (GDN+MoE) | [unsloth](https://huggingface.co/unsloth/Qwen3.6-35B-A3B-GGUF) | 22 GB | CPU | 4.3 | 7.8 | hybrid GDN/attn, 256 experts / 8 active |
-| Qwen3.6-35B-A3B (GDN+MoE) | (same) | 22 GB | **CUDA** `-g -1` (hybrid) | **11.2** | **23.8** | 10 attn + 30 GDN on GPU; MoE auto-routed to CPU, batched-expert dispatch (8 experts × 3 ops into 2 Parallel.For sweeps), shared expert kept on GPU and overlapped with the CPU routed loop |
+| Qwen3-Coder 30B-A3B (MoE) | [Qwen](https://huggingface.co/Qwen/Qwen3-Coder-30B-A3B-Instruct-GGUF) | 17 GB | CPU | 13.3 | 21.1 | 128 experts / 8 active |
+| Qwen3-Coder 30B-A3B (MoE) | (same) | 17 GB | CPU `--tq` | 13.7 | 21.0 | 3-bit KV. FastScan K + V kernels (issue #34) keep attention cost bounded as context grows: **15.5 t/s decode @ 3.2K ctx** (27 % slowdown for ~27× context growth); without FastScan the per-block K+V path would drop this to ~13 t/s |
+| Qwen3-Coder 30B-A3B (MoE) | (same) | 17 GB | Vulkan `-g -1` (hybrid) | 1.1 | 5.3 | 29 GPU + 19 CPU layers, SLRU expert slot cache. Next-layer predictive prefetch (PR #77 / issue #50) on by default; no-op until the cache is under pressure — disable with `--no-moe-predict-prefetch` |
+| Qwen3-Coder 30B-A3B (MoE) | (same) | 17 GB | **CUDA** `-g -1` (hybrid) | 10.6 | 22.2 | 29 GPU + 19 CPU layers; routed experts stream through `CudaExpertSlotManager` SLRU (2220 / 3712 slots) instead of the prior eager whole-layer upload (PR #77 / issue #72). Decode is at parity with the eager baseline (22.7 → 22.2 within noise); prefill drops 13.9 → 10.6 because the first pass loads experts on demand. The cache budget is sized to what actually fits in VRAM (it counts the buffer pool's power-of-two round-up — the earlier 3043-slot plan would have hit a `cudaMalloc` failure once the working set grew past ~2200 unique experts). Set `SHARPI_EXPERT_STATS=path` to inspect per-layer hit rates |
+| Llama-4 Scout 17B-16E (MoE) | [meta-llama](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct) | 61 GB | CPU | 2.1 | 4.3 | 48 layers, 17B active params; split GGUF (Q4_K_M) |
+| Llama-4 Scout 17B-16E (MoE) | (same) | 61 GB | CUDA `-g -1` (hybrid) | 1.2 | 2.6 | 7 GPU + 41 CPU layers — model still dwarfs the 12 GB card so CPU-only wins, but per-expert SLRU streaming (PR #77 / issue #72) lifts decode 2.1 → 2.6 (+24 %) and prefill 0.9 → 1.2 (+33 %) over the prior eager whole-layer upload |
+| Qwen3.6-35B-A3B (GDN+MoE) | [unsloth](https://huggingface.co/unsloth/Qwen3.6-35B-A3B-GGUF) | 22 GB | CPU | 6.7 | 8.5 | hybrid GDN/attn, 256 experts / 8 active |
+| Qwen3.6-35B-A3B (GDN+MoE) | (same) | 22 GB | **CUDA** `-g -1` (hybrid) | **14.7** | **23.2** | 10 attn + 30 GDN on GPU; MoE auto-routed to CPU, batched-expert dispatch (8 experts × 3 ops into 2 Parallel.For sweeps), shared expert kept on GPU and overlapped with the CPU routed loop |
 | Qwen3.6-27B-MTP (GDN) | [unsloth](https://huggingface.co/unsloth/Qwen3.6-27B-MTP-GGUF) | 16 GB | CPU `--no-thinking` | 2.8 | **3.8** | dense 27B, hybrid GDN/attn, native MTP head; auto-engages MTP self-spec (issue #25) at greedy + `--no-thinking`. 95% draft acceptance (38/40); batched N=2 verify (#30) + fused Q6_K·Q8_K 2-input dot (#42) lift decode from 2.7 (sequential N=1) to 3.8 — 1.4× over MTP-off baseline |
 | Qwen3.6-27B-MTP (GDN) | (same) | 16 GB | **CUDA** `-g -1 --no-thinking` (hybrid) | **5.7** | **10.7** | 20/64 dense FFN layers on GPU (3.3 GB) + GDN + attn KV resident; 44/64 FFN layers on CPU mmap. 95% draft acceptance; batched verify lifts decode from 6.2 to 10.7 (1.73× over MTP-off baseline). The CPU FFN majority batches via `CpuDenseFfn2` and the on-GPU FFN layers now batch via `MatMulN2` (issue #43 — one weight read per row, two outputs). Direct-pinned `Download/UploadInto` (#48) and async `_lastHidden` overlap (#49) shave per-layer host stall on the MoE-MTP/dense-FFN-MTP hot path |
 | Qwen3.6-27B-MTP (GDN) | (same) | 19 GB | CPU `--no-thinking` `Q5_K_M` | 2.5 | **3.5** | Q5_K_M variant, ~10% slower than Q4_K_M as expected from weight bandwidth. 100% draft acceptance (40/40) on this prompt; batched verify lifts decode from 2.4 to 3.5 (1.46×) |
@@ -55,6 +55,13 @@ matching chat template).
 `--backend auto` (default) picks CUDA when available, sizing the GPU/CPU split from
 VRAM via TierPlanner; falls through to Vulkan only when CUDA isn't present.
 
+MoE expert-cache knobs (`--moe-warmpin`, `--moe-warmpin-after`,
+`--no-moe-predict-prefetch`, `--expert-stats`) are CLI-only on the `run`
+command; under `SharpInference.Server` the same behaviour is reachable
+via the env vars `SHARPI_MOE_WARMPIN`, `SHARPI_MOE_WARMPIN_AFTER`,
+`SHARPI_MOE_PREDICT_PREFETCH=0`, `SHARPI_EXPERT_STATS=<path>` set in the
+process environment before the server starts.
+
 SnapKV prefill-time KV eviction (issue #51) ships on every backend: CPU
 `ForwardPass` (#57), CUDA hybrid GDN `CudaHybridGdnForwardPass` (#58),
 dense CUDA `CudaForwardPass` (#63), and Vulkan `GpuForwardPass` (#64).
diff --git a/docs/moe-expert-offloading-research.md b/docs/moe-expert-offloading-research.md
new file mode 100644
index 0000000..928fdd5
--- /dev/null
+++ b/docs/moe-expert-offloading-research.md
@@ -0,0 +1,340 @@
+# MoE & Expert Offloading: State of the Art vs. SharpInference
+
+> Research note, 2026-05. Surveys the current literature on Mixture-of-Experts
+> (MoE) inference and expert offloading, audits what SharpInference does today
+> (with file references), and identifies concrete gaps and opportunities tailored
+> to our target deployment (single-user desktop, ~12 GB VRAM e.g. RTX 4070 Ti,
+> NativeAOT, GGUF).
+
+> **Status update (PR #77, 2026-05-29).** The sections below were written before
+> the implementation work in this PR landed. Several items they describe as open
+> — in §1 (TL;DR), §2.2 (CUDA non-GDN hybrid), §4 (gap analysis), and §5
+> (recommendations) — are now closed by what ships:
+>
+> - **§5 P0 — non-GDN CUDA per-expert SLRU.** `CudaHybridForwardPass` now streams
+>   routed experts through `CudaExpertSlotManager.GetOrLoad`. The eager
+>   `_gpuWGateExps`/`_gpuWUpExps`/`_gpuWDownExps` arrays referenced in §2.2 no
+>   longer exist (commit `2e48a29`).
+> - **§5 P1 (eviction half) — activation-aware caching.** `SlruCache` now picks
+>   victims by access frequency (`SlruCache.SelectProbationaryVictim`) and the
+>   slot managers support opt-in warm-pinning via `SHARPI_MOE_WARMPIN`
+>   (commit `b6b2763`).
+> - **§4 gap on predictive prefetch (Vulkan).** `ExpertRoutePredictor` records
+>   each layer's selection and prefetches the next GPU MoE layer's likely experts
+>   a layer ahead (commit `757976d`).
+> - **Cache sizing.** `MoeCacheSizing.Plan` is the routing-locality-aware sizer
+>   recommended in §5 P2 (commit `7b70c76`).
+>
+> Still open: the §5 P1 Q5_K → F32 dequant on the Vulkan `ExpertSlotManager`
+> upload path; the §5 P2 CPU-peer / KTransformers-AMX direction; the predictive
+> prefetch on the CUDA hybrid path (Vulkan-only today).
+
+---
+
+## 1. TL;DR
+
+SharpInference already has a **solid offloading skeleton** that maps onto several
+SOTA ideas — an SLRU expert cache, an async prefetcher, CPU-fallback compute on
+cache miss (the core "Fiddler" trick), and per-expert access profiling. That
+puts us ahead of naive "swap on demand" systems.
+
+The implementation is uneven across paths. The audit found:
+
+1. **Per-expert SLRU offloading exists on two of three hybrid paths, but not the
+   third.** The **GDN CUDA path** (`CudaHybridGdnForwardPass`, used for
+   qwen35moe) and the **Vulkan path** (`HybridForwardPass`) both stream experts
+   through the SLRU cache (`CudaExpertSlotManager.GetOrLoad` /
+   `ExpertSlotManager.TryGetCached`) with CPU-fallback compute on miss. But the
+   **non-GDN CUDA hybrid path** (`CudaHybridForwardPass`, used for Mixtral /
+   Qwen3-30B-A3B / Qwen3-Coder when they don't fit VRAM) does **whole-layer**
+   offload only: every expert of a GPU-tier layer is uploaded resident, and its
+   `_expertSlotManager`/`_prefetcher` fields are declared but never assigned (dead
+   code). So for the big non-GDN MoEs, a "GPU layer" must hold its *entire* expert
+   set in VRAM — there is no per-expert streaming, only the coarse CPU-layer /
+   GPU-layer split that `TierPlanner` decides.
+2. **Cached experts are quantized — except Q5_K in two spots.** Good news first:
+   experts are cached in native quant (Q4_K/Q6_K everywhere; Q5_K too on
+   `CudaExpertSlotManager`), so we are *not* generally paying an F32 premium. But
+   the **Vulkan `ExpertSlotManager`** and the non-GDN CUDA resident path
+   **dequantize Q5_K to F32** (`ExpertSlotManager.cs:156`,
+   `CudaHybridForwardPass.cs:1428`). qwen35moe stores `ffn_down_exps` as Q5_K, so
+   on Vulkan every cached down-projection expert is 4 B/element — ~5.8× its 5.5-bit source.
+   `CudaExpertSlotManager` already keeps Q5_K raw (`UploadRaw`); the other two
+   paths just need to mirror it.
+3. **Prefetching is reactive, not predictive.** The Vulkan path re-enqueues the
+   experts the router *just* selected, betting the next token reuses them at the
+   same layer (1-token, same-layer temporal locality). Every SOTA system instead
+   predicts the *next layer's* or *next token's* experts ahead of time. *(Already
+   tracked as issue #50 — pre-gated / PreScope-style predictive prefetch.)*
+4. **Caching is recency-only; the profiler is diagnostic-only.** The SLRU evicts
+   by recency. `ExpertAccessProfiler` tracks per-expert hit/miss and prints stats
+   (`CudaHybridGdnForwardPass` dump), but nothing feeds hotness back into eviction
+   priority or warm-pins hot experts at load. `TierPlanner` places layers by
+   footprint, not access frequency.
+
+The highest-leverage, lowest-risk wins for our use case are: **(a)** bring the
+non-GDN CUDA hybrid path to per-expert SLRU parity with the GDN path (so big
+non-GDN MoEs fit in less VRAM), **(b)** stop dequantizing Q5_K experts on the
+Vulkan/resident paths, **(c)** add next-layer expert *prediction* to drive the
+prefetcher (#50), **(d)** make eviction/placement activation-aware using the
+profiler we already built, and **(e)** a fast CPU expert GEMM (KTransformers-style,
+related to #54). Details and priorities in §5.
+
+---
+
+## 2. What SharpInference does today
+
+Verified against the source on this branch.
+
+### 2.1 MoE model support
+- Architecture/hparam detection in `src/SharpInference.Core/ModelGraph.cs` and
+  `ModelHyperparams.cs` (`IsMoE`, `NumExperts`, `NumActiveExperts`,
+  `ExpertIntermediateDim`, `HasSharedExpert`, `NormalizeMoeTopKWeights`,
+  `UseSigmoidGating`).
+- Covers Mixtral (top-2), Qwen3-MoE (128 experts, top-8), qwen35moe (256 experts, top-8,
+  shared expert, GDN-hybrid), OLMoE (64 experts, top-8), DeepSeek-V2 family, Llama4-style MoE.
+- GGUF stores experts as packed per-layer tensors
+  (`blk.{L}.ffn_{gate,up,down}_exps.weight`) plus optional shared-expert and
+  router (`ffn_gate_inp`) tensors; loaded zero-copy via mmap.
+
+### 2.2 Forward-pass MoE
+- **CPU** (`ForwardPass.cs`): router GEMV → softmax/sigmoid → `SelectTopK` →
+  optional shared expert → sparse routed experts via pointer-sliced mmap weights,
+  SIMD `MatVec`. Solid, correct, the reference path.
+- **Vulkan hybrid** (`HybridForwardPass.GpuMoeFfn`, ~line 1293): router on GPU,
+  top-k on CPU, then **per-selected-expert SLRU cache lookup**
+  (`_expertSlotManager.TryGetCached`). **Cache miss → compute that expert on the
+  CPU while the GPU is idle** (`GpuMoeFfnCpuFallback`), accumulate, upload, GPU
+  `AddInPlace`. This is the Fiddler idea (compute on CPU rather than block on a
+  transfer) and is genuinely good.
+- **CUDA GDN hybrid** (`CudaHybridGdnForwardPass.cs`, for qwen35moe): the most
+  developed path. Experts served by `CudaExpertSlotManager` SLRU
+  (`GetOrLoad`, line ~2257), keeps Q4_K/Q5_K/Q6_K quantized, has a CPU-MoE mode
+  (`SHARPI_CPU_MOE=1`), and dumps `ExpertAccessProfiler` stats on dispose.
+- **CUDA non-GDN hybrid** (`CudaHybridForwardPass.cs`, for Mixtral / Qwen3-MoE /
+  Qwen3-Coder too big for VRAM): GPU-tier layers upload **all** experts to VRAM as
+  `Tensor[][] _gpuWGateExps/...` (line ~297) and index them directly (line ~1348);
+  CPU-tier layers compute on CPU (`CpuMoeFfn`). Offload granularity is the whole
+  layer (`TierPlanner` split) — there is **no per-expert SLRU streaming** here. The
+  `_expertSlotManager`/`_prefetcher` fields (lines 108–109) are declared and
+  disposed but **never assigned** → the dynamic cache path is dead on this path.
+  Experts are kept in native quant for Q4_K/Q6_K (line ~1418) but Q5_K is
+  dequantized to F32 (line ~1428).
+
+### 2.3 Offloading infrastructure (`SharpInference.Pipeline` + Engine)
+- `SlruCache<K,V>` — segmented LRU, 25% probationary / 75% protected, evicts
+  probationary tail. `ExpertCache<T>` wraps it keyed by `(layer, expertId)`.
+- `ExpertSlotManager` / `CudaExpertSlotManager` — VRAM expert slot cache;
+  `TryGetCached`/`GetOrLoad`, `Preload`, eviction callback frees GPU tensors.
+  Keeps experts in native quant — **except** the Vulkan `ExpertSlotManager`
+  dequantizes Q5_K (and exotic dtypes) to F32 (`ExpertSlotManager.cs:156`), while
+  `CudaExpertSlotManager` keeps Q4_K/Q5_K/Q6_K all raw (`UploadRaw`, line ~162).
+- `MoEPrefetcher` — bounded channel + background worker calling
+  `slotManager.Preload`. Drops oldest when full. Wired **only** in the Vulkan
+  path, and only with `EnqueuePrefetch(layer, selectedExperts)` — i.e. the
+  experts already selected for the current layer/token.
+- `ExpertAccessProfiler` — per-`(layer,expert)` hit/miss counters, `OverallHitRate`,
+  `GetTopExperts`. Diagnostic only; not consumed by placement or eviction.
+- `TierPlanner` — greedy layer placement by **footprint** + KV budget. Not
+  access-aware.
+- `MemoryHierarchy` — 3-tier (VRAM → pinned RAM → NVMe) design; L3/NVMe +
+  io_uring is a stub (`NotImplementedYet`).
+
+---
+
+## 3. State of the art (2024–2026)
+
+Expert offloading exists because MoE activates only k-of-N experts per token, so
+most expert weights can live in slow memory (host RAM / SSD / CPU) and only the
+active few need to be in fast memory (VRAM). The whole game is **hiding the cost
+of getting the right experts into fast memory in time**, or avoiding the move
+entirely. The literature attacks this along six axes.
+
+### Axis A — Static placement / partitioning
+Decide once, offline, what lives where.
+- **KTransformers** (SOSP'25) — partition by *arithmetic intensity*: attention
+  and frequently-used experts on GPU, the rest computed on CPU with
+  highly-optimized kernels (AMX / AVX-512, llamafile-style sgemm). Reports
+  1.25–1.93× over llama.cpp, much more on quantized models; runs DeepSeek-R1/V3
+  (671B) on a single 24 GB GPU + big DRAM. Key lesson: **CPU expert compute is a
+  first-class path, not just a fallback** — with good kernels you don't move the
+  weights at all.
+- **llama.cpp** `--cpu-moe` / `--n-cpu-moe` / `-ot "exps=CPU"` — the practical
+  baseline: keep attention + shared experts (always active) on GPU, routed
+  experts on CPU. This is essentially what our CUDA path does statically.
+- **Local Routing Consistency** ("Not All Models Suit Expert Offloading", 2505.16056)
+  — *which* models even benefit from caching. Metrics SRP and SCH over 20 MoE
+  LLMs: models that put MoE on **every** layer and use **no shared expert** have
+  the highest locality (best cache hit rates); shared experts and dense-then-MoE
+  layouts hurt locality. Most models do well with a cache ≈ **2× the active
+  expert count**. Directly relevant to choosing cache sizes per model.
+
+### Axis B — Caching policy
+What to keep resident and what to evict.
+- LRU/LFU/SLRU (what we have) vs. **activation-aware** caches.
+- **MoE-Infinity** (2401.14361) — sequence-level activation *tracing* to capture
+  temporal locality, then prioritize caching experts by predicted activation
+  ratio; 4–20× latency reduction vs. baselines.
+- **HybriMoE** (2504.05897) — *score-based* caching + dynamic intra-layer
+  CPU/GPU scheduling, built on KTransformers; handles expert-activation
+  instability across tokens.
+
+### Axis C — Predictive prefetching (the big one)
+Move experts *before* you need them, overlapping I/O with compute.
+- **Pre-gated MoE** (ISCA'24) — add a "pre-gate" so layer L computes layer L+1's
+  expert selection, giving a full layer of prefetch lead time. Algorithm+system
+  co-design.
+- **Cross-Layer Gate / "Fate"** (2502.12224) — predict future-layer experts from
+  *current* layer's gate inputs; offloading system with prefetch + caching +
+  quantization, tuned for edge/memory budgets.
+- **ProMoE** (2410.22134) — proactive caching that predicts and preloads expert
+  usage to cut cache misses, separating prefill/decode behavior.
+- **AdapMoE** (2408.10284), **fMoE** (2502.05370), **ExpertFlow** (2410.17954) —
+  sensitivity-based gating, fine-grained prefetch+cache, and predictive
+  routing-path offload with token reordering (up to 93% VRAM savings, 2–10×).
+
+### Axis D — Speculation-driven offloading
+Use a draft/speculative process to predict experts many tokens ahead.
+- **MoE-SpeQ** (2511.14102) — small on-device draft model predicts the *sequence*
+  of experts for future tokens; a runtime orchestrator prefetches them from host
+  memory to overlap I/O with compute. Introduces an "Amortization Roofline Model"
+  to tune the speculation window for throughput.
+- **SP-MoE** (2510.10302), **MoE-SpAc** (2603.09983) — speculative decoding +
+  prefetch co-design; speculation doubles as a memory-management signal.
+- **OD-MoE** reports up to **99.94%** expert-activation prediction with shadow
+  networks; single-layer lookahead alone gives ~84–91%.
+
+### Axis E — Mixed-precision / compression of experts
+Don't pay full precision for cold experts.
+- **HOBBIT** (2411.01433) — mixed-precision expert offloading: load *less
+  important* experts at lower precision (cheaper transfer + less VRAM), critical
+  experts at full precision; token-/layer-/sequence-level prefetch + caching.
+  Built on llama.cpp; significant speedups with negligible quality loss. **Most
+  directly relevant to our F32-cache problem.**
+- **PreMoe** (2505.17639) — probabilistic expert *pruning* + task-adaptive
+  retrieval to fit big MoEs in constrained memory.
+
+### Axis F — Cache-miss tolerance & batching
+- **BuddyMoE** (2511.10054) — on a cache miss, substitute a *redundant/similar*
+  expert already resident rather than stalling, exploiting expert redundancy.
+- **ExpertFlow** token reordering / expert buffering ("Towards MoE Deployment",
+  2303.06182) — reorder tokens so a batch activates fewer distinct experts.
+  (Most relevant once we have continuous batching for MoE, which we don't yet.)
+
+---
+
+## 4. Gap analysis
+
+| SOTA capability | SharpInference today | Gap |
+|---|---|---|
+| Per-expert offloading across all backends | Vulkan + CUDA-GDN: per-expert SLRU. CUDA non-GDN: whole-layer offload, all experts resident. | Non-GDN CUDA MoE (Mixtral/Qwen3-30B-A3B/Coder) can't stream experts → must fit a layer's full expert set in VRAM. |
+| Mixed-precision / quantized cache (HOBBIT) | Experts cached in native quant — **except Q5_K → F32** on Vulkan SLRU + non-GDN CUDA resident path | Q5_K (qwen35moe `ffn_down_exps`) costs 4 B/elem on Vulkan. No *down*-quantization of cold experts (true HOBBIT). |
+| Activation-aware caching (MoE-Infinity, HybriMoE) | SLRU (recency only) + diagnostic-only `ExpertAccessProfiler` | Profiler doesn't drive eviction/placement or warm-pin hot experts. SLRU ≠ frequency-aware. |
+| **Predictive prefetch** (Pre-gated, Cross-Layer Gate, ProMoE) | Reactive 1-token, **same-layer** re-enqueue (Vulkan) | No next-layer/next-token prediction. *Tracked: #50.* |
+| Cache-miss CPU compute (Fiddler) | CPU-fallback compute on Vulkan + (via `SHARPI_CPU_MOE`) GDN paths | *Per-dispatch* CPU/GPU decision tracked as #54. |
+| Speculative expert prefetch (MoE-SpeQ, SP-MoE) | speculative *decoding* (MTP) exists, not used for expert prediction | Not started; natural extension of #50 once draft accepts are available. |
+| Fast CPU expert GEMM (KTransformers AMX) | SIMD `MatVec` (GEMV) CPU fallback | No AMX / blocked sgemm; CPU treated as fallback, not a peer compute tier (related to #54). |
+| Cache-miss substitution (BuddyMoE) | block on CPU compute instead | Fine for single-user; no redundancy reuse. |
+| Model-aware cache sizing (Local Routing Consistency) | fixed slot capacity | No per-model locality measurement; could size cache ≈2× active experts, skip caching for low-locality models. |
+| L3 NVMe tier / io_uring | designed, stubbed | Not shipped; only matters for models > host RAM. |
+| Batched MoE + token reorder (ExpertFlow) | MoE batching disabled | Out of scope for single-user target. |
+
+---
+
+## 5. Recommendations (prioritized for our target)
+
+Ordered by leverage/effort for the single-user desktop, ~12 GB VRAM, GGUF case.
+Most of these reuse infrastructure we already have.
+
+### P0 — Bring the non-GDN CUDA hybrid path to per-expert SLRU parity
+Wire `CudaExpertSlotManager` + `MoEPrefetcher` + CPU-fallback into
+`CudaHybridForwardPass` (the fields are already there, just unassigned) so it
+streams experts per-token like the GDN path (`CudaHybridGdnForwardPass`) and the
+Vulkan path — instead of forcing every expert of a GPU-tier layer to be resident.
+Today a non-GDN MoE bigger than VRAM (Mixtral, Qwen3-30B-A3B, Qwen3-Coder-30B) can
+only offload at whole-layer granularity, wasting VRAM on cold experts in the
+GPU-tier layers. The GDN path is the reference implementation to mirror.
+*Risk:* medium — but the exact pattern already exists in-repo to copy.
+
+### P1 — Stop dequantizing Q5_K experts (Vulkan + non-GDN CUDA resident path)
+`CudaExpertSlotManager` already keeps Q5_K raw via `UploadRaw`; mirror that in the
+Vulkan `ExpertSlotManager` (`ExpertSlotManager.cs:156`) and the
+`CudaHybridForwardPass` resident upload (`:1428`). qwen35moe's `ffn_down_exps` is
+Q5_K, so this inflates the cached down-proj footprint ~5.8× on Vulkan today. Cheap, local,
+high-certainty.
+*Risk:* low — Q5_K dequant-in-matmul kernels already exist.
+
+### P1 — Predictive prefetch (Cross-Layer Gate / Pre-gated) — *issue #50*
+Already filed. Replace the same-layer 1-token re-enqueue with **next-layer
+prediction**: run the *next* MoE layer's router on the *current* hidden state (an
+`embDim×numExperts` GEMV — tiny) to prefetch a layer ahead. ~84–91% accuracy for
+single-layer lookahead is plenty to convert blocking misses into overlapped loads.
+Biggest latency win in the offloaded regime; purely a prefetch hint.
+
+### P1 — Feed the profiler into eviction & warm-pinning
+We built `ExpertAccessProfiler` but only print it. Use it two ways: **(1)** at
+load, warm the cache / pin the top-N experts per layer (KTransformers/MoE-Infinity
+"hot experts on GPU"); **(2)** bias SLRU so high-frequency experts resist eviction
+(frequency-aware, not pure recency).
+*Risk:* low.
+
+### P2 — Treat CPU as a compute peer (KTransformers-style) — *relates to #54*
+Add a blocked/multi-threaded expert GEMM (and explore AVX-512/AMX where available)
+so CPU-resident experts are computed cheaply rather than something we always try
+to avoid by moving to GPU. Pairs naturally with the CPU-fallback dispatch policy
+in #54 — it makes "compute on CPU" the *intended* steady state for cold experts,
+à la KTransformers, rather than a stall.
+*Risk:* medium (kernel work), high payoff for big-model offload.
+
+### P2 — Model-aware cache policy (Local Routing Consistency)
+Measure/lookup per-model routing locality and size the expert cache accordingly
+(≈2× active experts is a good default), and detect low-locality models (shared
+expert + dense-then-MoE) where caching helps little — pin shared experts, don't
+over-invest cache slots on routed ones.
+*Risk:* low.
+
+### P3 — Speculative expert prefetch (MoE-SpeQ) — *extends #50*
+We already have speculative decoding (MTP). Reuse its drafted tokens to prefetch
+the experts those tokens will need, several tokens ahead — a natural extension of
+the #50 predictor. Big win only in the heavily-offloaded regime; defer until the
+P0/P1 items land.
+
+### Out of scope for now
+L3/NVMe tier (only matters when model > host RAM), continuous-batching MoE +
+token reordering (single-user target), GDN GPU kernels (orthogonal to offloading).
+
+---
+
+## 6. Bottom line
+
+We're not missing the *concept* of expert offloading — the bones are good, the
+GDN CUDA path is genuinely SOTA-aligned (per-expert SLRU, CPU fallback, quantized
+cache, profiling), and predictive prefetch (#50) and Fiddler dispatch (#54) are
+already on the board. What's missing is *parity and follow-through*: the non-GDN
+CUDA path still does coarse whole-layer offload, Q5_K experts get needlessly
+expanded to F32 on two paths, and the profiler we built doesn't yet steer caching.
+Those are mostly wiring over infrastructure we already have, and they're what
+stands between "MoE that fits in VRAM" and "MoE that's bigger than VRAM but still
+fast."
+
+---
+
+## 7. References
+
+- KTransformers — CPU/GPU Hybrid Inference for MoE (SOSP'25): https://dl.acm.org/doi/10.1145/3731569.3764843
+- llama.cpp MoE offload guide (`-ot`/`--cpu-moe`): https://huggingface.co/blog/Doctor-Shotgun/llamacpp-moe-offload-guide
+- Not All Models Suit Expert Offloading (Local Routing Consistency): https://hf.co/papers/2505.16056
+- MoE-Infinity — Activation-Aware Expert Offloading: https://hf.co/papers/2401.14361
+- HybriMoE — Hybrid CPU-GPU Scheduling & Cache: https://hf.co/papers/2504.05897
+- Pre-gated MoE (ISCA'24): https://www.microsoft.com/en-us/research/wp-content/uploads/2024/05/isca24_pregated_moe_camera_ready.pdf
+- Cross-Layer Gate / Fate — Accurate Expert Predictions: https://hf.co/papers/2502.12224
+- ProMoE — Proactive Caching: https://hf.co/papers/2410.22134
+- AdapMoE — Sensitivity-based Gating/Management: https://hf.co/papers/2408.10284
+- fMoE — Fine-Grained Expert Offloading: https://hf.co/papers/2502.05370
+- ExpertFlow — Predictive Routing + Token Scheduling: https://hf.co/papers/2410.17954
+- HOBBIT — Mixed-Precision Expert Offloading: https://hf.co/papers/2411.01433
+- PreMoe — Expert Pruning & Retrieval: https://hf.co/papers/2505.17639
+- MoE-SpeQ — Speculative Quantized Decoding + Prefetch: https://arxiv.org/abs/2511.14102
+- SP-MoE — Speculative Decoding & Prefetching: https://arxiv.org/pdf/2510.10302
+- BuddyMoE — Expert Redundancy for Cache Misses: https://arxiv.org/html/2511.10054v1
+- Towards MoE Deployment (Expert Buffering / token reorder): https://hf.co/papers/2303.06182
diff --git a/scripts/bench-moe-rerun.ps1 b/scripts/bench-moe-rerun.ps1
new file mode 100644
index 0000000..db666b2
--- /dev/null
+++ b/scripts/bench-moe-rerun.ps1
@@ -0,0 +1,90 @@
+param(
+    [int]$NTokens = 80,  # forwarded to bench-textgen.ps1 as -NTokens
+    [int]$Repeats = 3,   # how many times each case is run
+    [string]$Prompt = "Write a Python function that sorts a list using the quicksort algorithm:",
+    [string]$OutJson = "tools\bench\moe-rerun.json"  # destination of the JSON written at the end of the script
+)
+# Runs every case below $Repeats times through .\scripts\bench-textgen.ps1 and collects the results in $all.
+# Same model paths as scripts/bench-all.ps1
+$moe = "models\Qwen3-Coder-30B-A3B-Instruct-Q4_K_M.gguf"
+# Each case: Tag names the group, Extra is passed as -ExtraArgs, Timeout is passed as -TimeoutSec.
+$cases = @(
+    @{ Tag = "moe-cpu";           Extra = @();                                   Timeout = 360 },
+    @{ Tag = "moe-cpu-tq";        Extra = @("--tq");                             Timeout = 360 },
+    @{ Tag = "moe-vulkan-hybrid"; Extra = @("-g","-1","--backend","vulkan");     Timeout = 600 },
+    @{ Tag = "moe-cuda-hybrid";   Extra = @("-g","-1","--backend","cuda");       Timeout = 600 }
+)
+# NOTE(review): the model and script paths are relative, so the script depends on the current directory — confirm the intended launch directory.
+$all = @()
+foreach ($case in $cases) {
+    for ($i = 1; $i -le $Repeats; $i++) {
+        $runTag = "$($case.Tag)-r$i"  # per-run tag: "<case tag>-r<repeat index>"
+        Write-Host ""
+        Write-Host "=== $runTag ($($i)/$Repeats) ===" -ForegroundColor Yellow
+        $r = .\scripts\bench-textgen.ps1 -Model $moe -Tag $runTag -NTokens $NTokens -Prompt $Prompt `
+                -TimeoutSec $case.Timeout -ExtraArgs $case.Extra
+        $r | Add-Member -NotePropertyName Group -NotePropertyValue $case.Tag -Force  # case tag, used later to group runs
+        $r | Add-Member -NotePropertyName Run   -NotePropertyValue $i        -Force  # 1-based repeat index
+        $all += $r
+        Write-Host ("  prefill={0} t/s  decode={1} t/s  wall={2}s" -f $r.PrefillTps, $r.DecodeTps, $r.WallSec)
+    }
+}
+
+# Aggregate per (group, metric): returns @{ Mean; Std } for a list of numbers, Std being the sample (n-1) standard deviation.
+function Stats($values) {
+    if ($values.Count -eq 0) { return @{ Mean = 0.0; Std = 0.0 } }  # no samples: report zeros
+    $mean = ($values | Measure-Object -Average).Average
+    if ($values.Count -lt 2) { return @{ Mean = [Math]::Round($mean,2); Std = 0.0 } }  # single sample: no spread to compute, Std reported as 0
+    $sumsq = 0.0
+    foreach ($v in $values) { $sumsq += ($v - $mean) * ($v - $mean) }  # sum of squared deviations from the mean
+    $std = [Math]::Sqrt($sumsq / ($values.Count - 1))  # divide by n-1, then square root
+    return @{ Mean = [Math]::Round($mean,2); Std = [Math]::Round($std,3) }  # Mean rounded to 2 decimals, Std to 3
+}
+
+$baselines = @{  # reference prefill/decode rates per case tag that the measured means are compared against
+    "moe-cpu"           = @{ Prefill = 14.9; Decode = 21.4 }
+    "moe-cpu-tq"        = @{ Prefill = 12.4; Decode = 21.4 }
+    "moe-vulkan-hybrid" = @{ Prefill = 1.0;  Decode = 5.5  }
+    "moe-cuda-hybrid"   = @{ Prefill = 16.1; Decode = 22.4 }
+}
+# One summary row per case: mean/stddev of its runs plus Z and percent delta against the baseline.
+$summary = @()
+foreach ($case in $cases) {
+    $rows = $all | Where-Object { $_.Group -eq $case.Tag }  # NOTE(review): timed-out runs are not filtered out here — confirm bench-textgen.ps1 reports usable rates for them
+    $pf = Stats ($rows | ForEach-Object { [double]$_.PrefillTps })
+    $dc = Stats ($rows | ForEach-Object { [double]$_.DecodeTps })
+    $bp = $baselines[$case.Tag].Prefill
+    $bd = $baselines[$case.Tag].Decode
+    # Z-scores: (mean - baseline) / std, with std the per-run stddev from Stats; reported as 0.0 when std is 0.  Negative => below baseline.
+    $zp = if ($pf.Std -gt 0) { [Math]::Round(($pf.Mean - $bp) / $pf.Std, 2) } else { 0.0 }
+    $zd = if ($dc.Std -gt 0) { [Math]::Round(($dc.Mean - $bd) / $dc.Std, 2) } else { 0.0 }
+    $summary += [PSCustomObject]@{
+        Tag           = $case.Tag
+        BasePf        = $bp
+        BaseDc        = $bd
+        PfMean        = $pf.Mean
+        PfStd         = $pf.Std
+        PfZ           = $zp
+        PfDeltaPct    = [Math]::Round(100.0 * ($pf.Mean - $bp) / $bp, 1)  # percent change of mean prefill vs baseline, 1 decimal
+        DcMean        = $dc.Mean
+        DcStd         = $dc.Std
+        DcZ           = $zd
+        DcDeltaPct    = [Math]::Round(100.0 * ($dc.Mean - $bd) / $bd, 1)  # percent change of mean decode vs baseline, 1 decimal
+    }
+}
+
+Write-Host ""
+Write-Host "=== Per-run rates ===" -ForegroundColor Cyan
+$all | Format-Table Group, Run, PrefillTps, DecodeTps, WallSec, TimedOut -AutoSize
+
+Write-Host ""
+Write-Host "=== Mean ± stddev vs baseline ===" -ForegroundColor Cyan
+$summary | Format-Table Tag, BasePf, PfMean, PfStd, PfZ, PfDeltaPct, BaseDc, DcMean, DcStd, DcZ, DcDeltaPct -AutoSize
+# Persist the raw runs and the summary as JSON. NOTE(review): Set-Content does not create missing parent directories — confirm the directory of $OutJson exists.
+$payload = [PSCustomObject]@{
+    Runs    = $all
+    Summary = $summary
+}
+$payload | ConvertTo-Json -Depth 5 | Set-Content $OutJson
+Write-Host ""
+Write-Host "Results written to $OutJson" -ForegroundColor DarkGray
diff --git a/src/SharpInference.Cli/RunCommand.cs b/src/SharpInference.Cli/RunCommand.cs
index 8b8fa6b..a055d26 100644
--- a/src/SharpInference.Cli/RunCommand.cs
+++ b/src/SharpInference.Cli/RunCommand.cs
@@ -148,6 +148,28 @@ public sealed class Settings : CommandSettings
         [Description("Maximum reasoning tokens before forcing </think>. 0 = unlimited (default). Not honored on the speculative-decode path.")]
         [DefaultValue(0)]
         public int MaxThinkingTokens { get; init; }
+
+        // ── MoE expert-cache tuning (offloaded MoE models) ──
+        // Good defaults are automatic: frequency-aware SLRU eviction, VRAM-sized cache,
+        // and next-layer predictive prefetch are all ON without any flag. These knobs only
+        // tune/disable that behaviour; Execute forwards each one to the env var named in its description.
+        [CommandOption("--no-moe-predict-prefetch")]
+        [Description("MoE: disable next-layer predictive expert prefetch (Vulkan; on by default). Env: SHARPI_MOE_PREDICT_PREFETCH=0.")]
+        [DefaultValue(false)]
+        public bool NoMoePredictPrefetch { get; init; }
+
+        [CommandOption("--moe-warmpin")]
+        [Description("MoE: also pin the top-N hottest experts per layer into the GPU cache after warmup (default 0 = off; frequency-aware eviction already retains hot experts). Env: SHARPI_MOE_WARMPIN.")]
+        public int? MoeWarmPin { get; init; }  // nullable: null = flag absent, so an explicit 0 is distinguishable and can force warm-pinning off
+
+        [CommandOption("--moe-warmpin-after")]
+        [Description("MoE: expert accesses to observe before warm-pinning selects the hot set (default 512). Only used with --moe-warmpin. Env: SHARPI_MOE_WARMPIN_AFTER.")]
+        [DefaultValue(0L)]  // 0 = "not passed": Execute only exports values > 0. NOTE(review): the "default 512" above is presumably the engine-side default — confirm
+        public long MoeWarmPinAfter { get; init; }
+
+        [CommandOption("--expert-stats")]
+        [Description("MoE: write GPU expert-cache (SLRU) hit-rate stats to this file on exit. Env: SHARPI_EXPERT_STATS.")]
+        public string? ExpertStatsPath { get; init; }  // null/empty = leave SHARPI_EXPERT_STATS as inherited from the environment
     }
 
     protected override int Execute(CommandContext context, Settings settings, CancellationToken cancellation)
@@ -155,6 +177,19 @@ protected override int Execute(CommandContext context, Settings settings, Cancel
         if (settings.MinBatchBlas > 0)
             SimdKernels.MinBatchForBlas = settings.MinBatchBlas;
 
+        // MoE expert-cache knobs are read from the environment inside the engine
+        // (WarmPinConfig / HybridForwardPass / slot-manager dispose). Surface them as
+        // CLI flags by setting the env var here — before any forward pass is built — so an
+        // explicit flag overrides and env-only use still works. Numbers use the invariant culture.
+        if (settings.MoeWarmPin is int warmPin)  // explicitly passed (incl. 0 to force off)
+            Environment.SetEnvironmentVariable("SHARPI_MOE_WARMPIN", warmPin.ToString(System.Globalization.CultureInfo.InvariantCulture));
+        if (settings.MoeWarmPinAfter > 0)
+            Environment.SetEnvironmentVariable("SHARPI_MOE_WARMPIN_AFTER", settings.MoeWarmPinAfter.ToString(System.Globalization.CultureInfo.InvariantCulture));
+        if (settings.NoMoePredictPrefetch)
+            Environment.SetEnvironmentVariable("SHARPI_MOE_PREDICT_PREFETCH", "0");
+        if (!string.IsNullOrEmpty(settings.ExpertStatsPath))
+            Environment.SetEnvironmentVariable("SHARPI_EXPERT_STATS", settings.ExpertStatsPath);
+
         var modelPath = settings.ModelPath;
         if (modelPath is null)
         {
@@ -275,8 +310,8 @@ protected override int Execute(CommandContext context, Settings settings, Cancel
                 case "":
                     // Auto: pick CUDA when available. CudaForwardPass handles full-offload
                     // (dense + MoE); CudaHybridForwardPass handles partial-offload (dense or
-                    // MoE with eager per-layer expert loading). TQ on CUDA requires
-                    // head_dim ∈ {128, 256}.
+                    // MoE; routed experts stream through the CudaExpertSlotManager SLRU).
+                    // TQ on CUDA requires head_dim ∈ {128, 256}.
                     bool tqHeadDimOk = hp.HeadDim is 128 or 256;
                     wantCuda = (!settings.TurboQuant || tqHeadDimOk)
                         && CudaBackend.IsAvailable();
diff --git a/src/SharpInference.Cuda/CudaBackend.cs b/src/SharpInference.Cuda/CudaBackend.cs
index b2187d8..b738e28 100644
--- a/src/SharpInference.Cuda/CudaBackend.cs
+++ b/src/SharpInference.Cuda/CudaBackend.cs
@@ -16,6 +16,16 @@ namespace SharpInference.Cuda;
 /// </summary>
 public sealed unsafe class CudaBackend : IComputeBackend, IImageOpsBackend, IDisposable
 {
+    /// <summary>
+    /// Round <paramref name="byteSize"/> up to the bucket size the buffer pool will
+    /// actually allocate (next power-of-two, min 64 bytes). Use this when sizing
+    /// budgets that share VRAM with pooled allocations — the pool's round-up can
+    /// inflate per-allocation footprint up to ~2× and a budget computed from raw
+    /// byte sizes will overshoot real capacity. Does not apply to uploads that bypass
+    /// the pool via <c>exact: true</c> on <see cref="Upload"/> / <see cref="UploadRaw"/>.
+    /// </summary>
+    public static nuint RoundUpAllocBytes(nuint byteSize) => GpuBufferPool.RoundUp(byteSize);
+
     private readonly nint _handle;
     private readonly SgemmPrecision _precision;
     private readonly int _smVersion;
diff --git a/src/SharpInference.Engine/CudaExpertSlotManager.cs b/src/SharpInference.Engine/CudaExpertSlotManager.cs
index 1719777..66e29a1 100644
--- a/src/SharpInference.Engine/CudaExpertSlotManager.cs
+++ b/src/SharpInference.Engine/CudaExpertSlotManager.cs
@@ -33,6 +33,12 @@ public sealed class CudaExpertSlotManager : IDisposable
     private readonly object _lock = new();
     private bool _disposed;
 
+    // Opt-in warm-pinning of hot experts (SHARPI_MOE_WARMPIN=N). Disabled by default.
+    private readonly int _warmPinPerLayer;
+    private readonly long _warmPinAfter;
+    private readonly int _pinBudget;
+    private bool _warmed;
+
     public ExpertAccessProfiler Profiler => _profiler;
 
     /// <param name="gpu">CUDA backend to allocate/free GPU tensors on.</param>
@@ -55,7 +61,13 @@ public CudaExpertSlotManager(CudaBackend gpu, GgufModel model, ModelHyperparams
         _hp = hp;
         _dtypes = dtypes;
         _profiler = new ExpertAccessProfiler(hp.NumLayers, hp.NumExperts);
-        _cache = new ExpertCache<ExpertCudaSlot>(slotCapacity, EvictSlot);
+        // Frequency-aware eviction: under MoE routing skew, the least-accessed
+        // probationary expert is a better victim than the strict LRU tail.
+        _cache = new ExpertCache<ExpertCudaSlot>(slotCapacity, EvictSlot,
+            frequencyOf: _profiler.GetAccessCount);
+        _warmPinPerLayer = WarmPinConfig.PerLayer; // MaybeWarmPin is a no-op when this is <= 0
+        _warmPinAfter = WarmPinConfig.AfterAccesses; // profiled lookups (hits + misses) required before the pin pass runs
+        _pinBudget = Math.Max(1, slotCapacity / 2); // at most half the cache, but never below 1 — NOTE(review): slotCapacity == 1 yields a budget covering the whole cache; confirm that is intended
     }
 
     /// <summary>
@@ -86,10 +98,41 @@ public ExpertCudaSlot GetOrLoad(int layer, int expertId)
             _profiler.RecordMiss(layer, expertId);
             slot = UploadExpert(layer, expertId);
             _cache.Put(layer, expertId, slot);
+            MaybeWarmPin();
             return slot;
         }
     }
 
+    /// <summary>
+    /// One-shot warm-pin pass; the caller must already hold <c>_lock</c>. No-op when
+    /// warm-pinning is disabled, has already run, or fewer than <c>_warmPinAfter</c>
+    /// lookups (hits + misses) have been profiled. Otherwise walks layers from most- to
+    /// least-accessed and pins each layer's top <c>SHARPI_MOE_WARMPIN</c> experts that
+    /// are resident right now, so a tight pin budget goes to the busiest layers first.
+    /// </summary>
+    private void MaybeWarmPin()
+    {
+        if (_warmPinPerLayer <= 0 || _warmed) return;
+        if (_profiler.TotalHits + _profiler.TotalMisses < _warmPinAfter) return;
+        _warmed = true; // latch before pinning: this pass never repeats
+        var hottestFirst = new int[_hp.NumLayers];
+        for (int i = 0; i < hottestFirst.Length; i++) hottestFirst[i] = i;
+        Array.Sort(hottestFirst, (x, y) => _profiler.GetLayerAccessCount(y).CompareTo(_profiler.GetLayerAccessCount(x)));
+        int pinCount = 0;
+        for (int i = 0; i < hottestFirst.Length && pinCount < _pinBudget; i++)
+        {
+            int layer = hottestFirst[i];
+            if (_profiler.GetLayerAccessCount(layer) == 0) break; // sorted descending: every later layer is cold too
+            foreach (int expert in _profiler.GetTopExperts(layer, _warmPinPerLayer))
+            {
+                if (pinCount >= _pinBudget) break;
+                if (!_cache.Contains(layer, expert)) continue;
+                _cache.Pin(layer, expert);
+                pinCount++;
+            }
+        }
+    }
+
     /// <summary>
     /// Pre-load the given expert into the cache if not already present.
     ///
diff --git a/src/SharpInference.Engine/CudaHybridForwardPass.cs b/src/SharpInference.Engine/CudaHybridForwardPass.cs
index f81a273..6bb4d2b 100644
--- a/src/SharpInference.Engine/CudaHybridForwardPass.cs
+++ b/src/SharpInference.Engine/CudaHybridForwardPass.cs
@@ -37,12 +37,7 @@ public sealed unsafe class CudaHybridForwardPass : IForwardPass
     private readonly Tensor[] _gpuAttnNorm, _gpuWq, _gpuWk, _gpuWv, _gpuWo;
     private readonly Tensor[] _gpuFfnNorm, _gpuWGate, _gpuWUp, _gpuWDown;
     private readonly Tensor[]? _gpuWGateInp, _gpuWGateShexp, _gpuWUpShexp, _gpuWDownShexp;
-    // Eager per-expert weights for CUDA GPU layers — every expert is VRAM-resident,
-    // so the per-token MoE FFN is a straight indexed lookup. Different from the Vulkan
-    // hybrid's lazy SLRU slot cache: simpler, but the model must fit in VRAM after
-    // accounting for KV cache + scratch. Sized [nGpuLayers][numExperts]; null when
-    // not MoE or there are no GPU layers.
-    private readonly Tensor[][]? _gpuWGateExps, _gpuWUpExps, _gpuWDownExps;
+    // Attention bias tensors for GPU layers (null when the model has no attention bias).
     private readonly Tensor[]? _gpuBq, _gpuBk, _gpuBv, _gpuBo;
     private readonly Tensor[]? _gpuQNorm, _gpuKNorm;
     private readonly Tensor[] _gpuKCache, _gpuVCache;
@@ -99,18 +94,14 @@ public sealed unsafe class CudaHybridForwardPass : IForwardPass
     private readonly float* _ropeSinTable;
     private readonly int _ropeHalfDim;
 
-    // ── Expert slot cache (for MoE GPU layers with lazy/evictable expert loading) ──
-    // These fields stay declared for symmetry with HybridForwardPass — CUDA hybrid
-    // currently refuses MoE+GPU at construction time, so the MoE GPU dispatch below
-    // is unreachable and these fields are always null. Pragma-suppressed so the
-    // unused-readonly check doesn't elevate to error under TreatWarningsAsErrors.
-#pragma warning disable CS0649
-    private readonly ExpertSlotManager? _expertSlotManager;
-    private readonly MoEPrefetcher? _prefetcher;
-    private readonly Tensor? _gpuFallbackContrib;
-    private readonly Tensor? _gpuPinnedNorm;
-#pragma warning restore CS0649
-    // CUDA hybrid has no CPU expert fallback (every expert is VRAM-resident).
+    // ── Expert slot cache (MoE GPU layers, lazy/evictable expert loading) ──
+    // Routed experts for GPU-tier MoE layers are streamed through this SLRU cache
+    // (mirror of CudaHybridGdnForwardPass), rather than every expert being uploaded
+    // resident. This lets non-GDN MoE models (Mixtral, Qwen3-30B-A3B, Qwen3-Coder)
+    // run with more layers on the GPU than the full expert footprint would allow.
+    // Null when not MoE or there are no GPU layers. Loads are synchronous on miss
+    // (no prefetcher): the GDN path established this is fast enough for k=8 decode.
+    private readonly CudaExpertSlotManager? _expertSlotManager;
 
     public int MaxSeqLen => _maxSeqLen;
     public LayerPlacement Placement => _placement;
@@ -239,9 +230,6 @@ void TraceVram(string label)
         _gpuFfnNorm = new Tensor[_nGpuLayers];
         _gpuWGate = new Tensor[_nGpuLayers]; _gpuWUp = new Tensor[_nGpuLayers]; _gpuWDown = new Tensor[_nGpuLayers];
         _gpuWGateInp = _isMoE ? new Tensor[_nGpuLayers] : null;
-        _gpuWGateExps = _isMoE ? new Tensor[_nGpuLayers][] : null;
-        _gpuWUpExps   = _isMoE ? new Tensor[_nGpuLayers][] : null;
-        _gpuWDownExps = _isMoE ? new Tensor[_nGpuLayers][] : null;
         _gpuWGateShexp = _isMoE && _hasSharedExpert ? new Tensor[_nGpuLayers] : null;
         _gpuWUpShexp = _isMoE && _hasSharedExpert ? new Tensor[_nGpuLayers] : null;
         _gpuWDownShexp = _isMoE && _hasSharedExpert ? new Tensor[_nGpuLayers] : null;
@@ -291,12 +279,9 @@ void TraceVram(string label)
             if (_isMoE)
             {
                 _gpuWGateInp![i]  = UploadWeight($"blk.{i}.ffn_gate_inp.weight");
-                // Eager per-expert upload — every expert is VRAM-resident. Required because
-                // CUDA hybrid does not yet ship the SLRU slot cache. The TierPlanner is
-                // expected to size nGpuLayers so the total expert footprint fits.
-                _gpuWGateExps![i] = UploadExpertWeights($"blk.{i}.ffn_gate_exps.weight", _expertDim, _embDim,    hp.NumExperts);
-                _gpuWUpExps![i]   = UploadExpertWeights($"blk.{i}.ffn_up_exps.weight",   _expertDim, _embDim,    hp.NumExperts);
-                _gpuWDownExps![i] = UploadExpertWeights($"blk.{i}.ffn_down_exps.weight", _embDim,    _expertDim, hp.NumExperts);
+                // Routed experts are NOT uploaded here — they stream through the
+                // CudaExpertSlotManager SLRU cache (created after this loop). The router
+                // and shared expert stay resident since they run on every token.
                 if (_hasSharedExpert)
                 {
                     _gpuWGateShexp![i] = UploadWeight($"blk.{i}.ffn_gate_shexp.weight");
@@ -342,12 +327,53 @@ void TraceVram(string label)
         Console.Error.WriteLine(" done.");
         TraceVram("after all weight uploads");
 
-        // MoE GPU layers use eager expert loading (all experts VRAM-resident).
-        // No slot manager / SLRU cache — keep it simple at the cost of slightly more
-        // VRAM per GPU layer. TierPlanner sizes nGpuLayers so the total expert footprint
-        // fits the remaining VRAM budget.
+        // MoE GPU layers stream routed experts through an SLRU cache. Size it from
+        // the *actual* free VRAM remaining now that attention weights, KV cache and
+        // scratch are uploaded (cudaMemGetInfo via FreeVramBytes), capped at the full
+        // GPU-layer expert count. Capping at totalGpuExperts means the cache can never
+        // hold more than the old eager path did — which TierPlanner already verified
+        // fits — so there is no new OOM risk; the budget term only *shrinks* capacity
+        // when VRAM is tight (e.g. the user forced extra GPU layers via -g), enabling
+        // streaming instead of an OOM.
         if (_isMoE && _nGpuLayers > 0)
-            Console.Error.WriteLine($"[CudaHybridForwardPass] MoE eager expert load: {hp.NumExperts} experts × {_nGpuLayers} GPU layers (gate+up+down).");
+        {
+            long perExpert = PerExpertBytes();
+            long reserve = 512L << 20; // 512 MiB headroom for transient per-GEMM scratch
+            long free = (long)gpu.FreeVramBytes;
+            var plan = MoeCacheSizing.Plan(_nGpuLayers, hp.NumExperts, hp.NumActiveExperts,
+                free, perExpert, reserve);
+            int totalGpuExperts = _nGpuLayers * hp.NumExperts;
+            Console.Error.WriteLine(
+                $"[CudaHybridForwardPass] SLRU expert cache: {plan.Slots} slots / {totalGpuExperts} total " +
+                $"({hp.NumExperts} experts × {_nGpuLayers} GPU layers, per-expert ≈ {perExpert / 1024} KiB, " +
+                $"free VRAM ≈ {free / (1024 * 1024)} MiB).");
+            switch (plan.Status)
+            {
+                case MoeCacheSizingStatus.BudgetExhausted:
+                    // Budget couldn't fit even one expert; capacity was clamped to 1.
+                    // Decode will thrash (~every routed expert misses); louder than the
+                    // BelowRecommended warning because the perf hit is catastrophic.
+                    Console.Error.WriteLine(
+                        "[CudaHybridForwardPass] WARNING: free VRAM cannot fit a single expert; " +
+                        "cache clamped to 1 slot. Every routed expert will miss and stream from CPU. " +
+                        "Reduce -g or use --backend vulkan.");
+                    break;
+                case MoeCacheSizingStatus.UnknownExpertSize:
+                    Console.Error.WriteLine(
+                        "[CudaHybridForwardPass] WARNING: could not measure per-expert size " +
+                        "(missing blk.0.ffn_*_exps tensor); cache fell back to total. " +
+                        "Will fail at runtime if total VRAM is exceeded.");
+                    break;
+                case MoeCacheSizingStatus.BelowRecommended:
+                    int pct = plan.RecommendedSlots > 0 ? plan.Slots * 100 / plan.RecommendedSlots : 0;
+                    Console.Error.WriteLine(
+                        $"[CudaHybridForwardPass] WARNING: cache ({plan.Slots}) is {pct}% of the " +
+                        $"routing-locality recommendation (~{plan.RecommendedSlots} = 2× active per layer); " +
+                        $"expert hit rate may suffer. Fewer GPU layers (-g) or more VRAM would help.");
+                    break;
+            }
+            _expertSlotManager = new CudaExpertSlotManager(gpu, model, hp, plan.Slots, _gpuWeightDTypes);
+        }
 
         // ── Resolve CPU weights (layers nGpuLayers..numLayers-1) ──
         _cpuHidden = Alloc(_embDim);
@@ -1311,10 +1337,9 @@ private void GpuDenseFfn(int layer)
 
     private void GpuMoeFfn(int layer)
     {
-        // Eager-loaded variant of GpuForwardPass.GpuMoeFfn / CudaForwardPass.MoeFfn:
-        // every expert is VRAM-resident, so the per-token MoE FFN is a straight
-        // indexed lookup over (_gpuWGateExps/_gpuWUpExps/_gpuWDownExps)[layer][expertIdx].
-        // No slot manager, no CPU fallback, no host-coherent pinned norm copy — CUDA's
+        // SLRU-streamed variant (mirror of CudaHybridGdnForwardPass.GpuMoeFfn): each
+        // selected routed expert is fetched via _expertSlotManager.GetOrLoad, which
+        // returns a cached slot or synchronously uploads-then-caches on miss. CUDA's
         // implicit stream ordering removes the explicit barrier vocabulary the Vulkan
         // version needs.
         int numActive = _hp.NumActiveExperts;
@@ -1344,9 +1369,10 @@ private void GpuMoeFfn(int layer)
         {
             int expertIdx = selectedExperts[i];
             float expertWeight = expertWeights[i];
+            var slot = _expertSlotManager!.GetOrLoad(layer, expertIdx);
 
-            GpuMatMul(_gpuFfnGate, _gpuWGateExps![layer][expertIdx], _gpuNormBuf);
-            GpuMatMul(_gpuFfnUp,   _gpuWUpExps![layer][expertIdx],   _gpuNormBuf);
+            GpuMatMul(_gpuFfnGate, slot.Gate, _gpuNormBuf);
+            GpuMatMul(_gpuFfnUp,   slot.Up,   _gpuNormBuf);
 
             if (_hp.UseSigmoidGating)
             {
@@ -1355,7 +1381,7 @@ private void GpuMoeFfn(int layer)
             }
 
             _gpu.SiLuMul(_gpuFfnGate, _gpuFfnUp);
-            GpuMatMul(_gpuMoeExpertOut!, _gpuWDownExps![layer][expertIdx], _gpuFfnGate);
+            GpuMatMul(_gpuMoeExpertOut!, slot.Down, _gpuFfnGate);
 
             if (_hp.UseSigmoidGating)
                 _gpu.AddInPlace(_gpuHidden, _gpuMoeExpertOut!);
@@ -1367,16 +1393,30 @@ private void GpuMoeFfn(int layer)
             _gpu.AddInPlace(_gpuHidden, _gpuMoeSharedOut!);
     }
 
-    // GpuMoeFfnCpuFallback removed for CUDA hybrid: every expert is eagerly resident
-    // on the GPU, so there's no slot-cache miss to spill to CPU. ExpertMatVec is still
-    // used by CpuMoeFfn for the all-CPU layer slice.
+    // Routed-expert weights are uploaded lazily by CudaExpertSlotManager on cache
+    // miss, not eagerly here. (The shared expert and router stay resident.)
 
-    private Tensor[] UploadExpertWeights(string name, int rows, int cols, int expertCount)
+    /// <summary>
+    /// Estimated VRAM footprint of a single routed expert (gate + up + down), measured
+    /// on layer 0's <c>ffn_*_exps</c> tensors and used to size the SLRU slot capacity.
+    /// Q4_K/Q5_K/Q6_K count at their quantized block size; every other dtype counts as
+    /// F32 (intended to match how CudaExpertSlotManager uploads). Each tensor is then
+    /// rounded up to the buffer pool's allocation bucket so pooled round-up is not
+    /// ignored by the planner. A tensor that is not found contributes 0 bytes.
+    /// </summary>
+    private long PerExpertBytes()
     {
-        var tensors = new Tensor[expertCount];
-        for (int expertIdx = 0; expertIdx < expertCount; expertIdx++)
-            tensors[expertIdx] = UploadExpertWeight(name, rows, cols, expertIdx);
-        return tensors;
+        long SizeOf(string tensorName, int rows, int cols)
+        {
+            if (_model.FindTensor(tensorName) is not { } info) return 0;
+            long raw = (long)rows * cols * sizeof(float); // F32 (native or dequantized)
+            if (info.DType is DType.Q4_K or DType.Q5_K or DType.Q6_K) // kept quantized on the GPU
+                raw = (long)rows * (cols / DTypeInfo.BlockSize(info.DType)) * DTypeInfo.BytesPerBlock(info.DType);
+            return (long)CudaBackend.RoundUpAllocBytes((nuint)raw);
+        }
+        long gateAndUp = SizeOf("blk.0.ffn_gate_exps.weight", _expertDim, _embDim) + SizeOf("blk.0.ffn_up_exps.weight", _expertDim, _embDim);
+        long down = SizeOf("blk.0.ffn_down_exps.weight", _embDim, _expertDim);
+        return gateAndUp + down;
     }
 
     private Tensor UploadTqSignPatterns(int layerIndex)
@@ -1391,48 +1431,6 @@ private Tensor UploadTqSignPatterns(int layerIndex)
         return _gpu.Upload(fullSigns, TensorShape.D1(fullSigns.Length));
     }
 
-    private Tensor UploadExpertWeight(string name, int rows, int cols, int expertIdx)
-    {
-        var info = _model.FindTensor(name)
-            ?? throw new InvalidOperationException($"Missing tensor: {name}");
-        var data = _model.GetTensorData(info);
-
-        // exact=true on every branch: expert weights are session-lifetime. Pool
-        // round-up across (NumExperts × NumLayers) FFN tensors is the dominant
-        // VRAM-waste source on MoE models — reclaiming it widens the SLRU slot count.
-        if (info.DType == DType.Float32)
-        {
-            int elemOffset = expertIdx * rows * cols;
-            var floats = MemoryMarshal.Cast<byte, float>(data).Slice(elemOffset, rows * cols);
-            var result = _gpu.Upload(floats, TensorShape.D1(floats.Length), exact: true);
-            _gpuWeightDTypes[result.Handle] = DType.Float32;
-            return result;
-        }
-
-        int bytesPerRow = (cols / DTypeInfo.BlockSize(info.DType))
-                        * DTypeInfo.BytesPerBlock(info.DType);
-        int expertBytes = rows * bytesPerRow;
-        int byteOffset = expertIdx * expertBytes;
-        var expertData = data.Slice(byteOffset, expertBytes);
-
-        if (info.DType == DType.Q4_K || info.DType == DType.Q6_K)
-        {
-            int floatCount = expertData.Length / 4;
-            var rawFloats = new float[floatCount];
-            expertData.CopyTo(MemoryMarshal.AsBytes(rawFloats.AsSpan()));
-            var result = _gpu.Upload(rawFloats, TensorShape.D1(floatCount), exact: true);
-            _gpuWeightDTypes[result.Handle] = info.DType;
-            return result;
-        }
-
-        int count = rows * cols;
-        var f32 = new float[count];
-        Dequantize.ToFloat32(expertData, f32, info.DType, count);
-        var tensor = _gpu.Upload(f32, TensorShape.D1(count), exact: true);
-        _gpuWeightDTypes[tensor.Handle] = DType.Float32;
-        return tensor;
-    }
-
     private static void SelectTopK(ReadOnlySpan<float> logits, int k,
         Span<int> indices, Span<float> weights, bool normalize)
     {
@@ -1535,13 +1533,7 @@ public void Dispose()
             if (_isMoE)
             {
                 _gpu.Free(_gpuWGateInp![i]);
-                // Eager per-expert tensors: free every slot since there's no slot manager here.
-                for (int e = 0; e < _hp.NumExperts; e++)
-                {
-                    _gpu.Free(_gpuWGateExps![i][e]);
-                    _gpu.Free(_gpuWUpExps![i][e]);
-                    _gpu.Free(_gpuWDownExps![i][e]);
-                }
+                // Routed-expert tensors are owned by _expertSlotManager (freed in its Dispose).
                 if (_hasSharedExpert)
                 {
                     _gpu.Free(_gpuWGateShexp![i]);
@@ -1595,9 +1587,26 @@ public void Dispose()
         if (_cpuDecompBuf != null) NativeMemory.Free(_cpuDecompBuf);
         _cpuKvCache.Dispose();
         _cpuTqKvCache?.Dispose();
-        _prefetcher?.Dispose();
-        _expertSlotManager?.Dispose();
-        if (_gpuFallbackContrib is not null) _gpu.Free(_gpuFallbackContrib);
-        if (_gpuPinnedNorm is not null) _gpu.Free(_gpuPinnedNorm);
+        if (_expertSlotManager is not null)
+        {
+            // SHARPI_EXPERT_STATS=<path>: dump SLRU hit rate + top experts per layer
+            // (parity with CudaHybridGdnForwardPass).
+            var statsPath = Environment.GetEnvironmentVariable("SHARPI_EXPERT_STATS");
+            if (!string.IsNullOrEmpty(statsPath))
+            {
+                // Diagnostic-only: a write failure must never skip the slot manager's
+                // Dispose below (which frees GPU tensors), so swallow + log.
+                try
+                {
+                    using var w = new StreamWriter(statsPath);
+                    _expertSlotManager.Profiler.PrintStats(w);
+                }
+                catch (Exception ex)
+                {
+                    Console.Error.WriteLine($"[CudaHybridForwardPass] Failed to write expert stats to {statsPath}: {ex.Message}");
+                }
+            }
+            _expertSlotManager.Dispose();
+        }
     }
 }
diff --git a/src/SharpInference.Engine/ExpertSlotManager.cs b/src/SharpInference.Engine/ExpertSlotManager.cs
index 05fe2db..038d601 100644
--- a/src/SharpInference.Engine/ExpertSlotManager.cs
+++ b/src/SharpInference.Engine/ExpertSlotManager.cs
@@ -24,6 +24,12 @@ public sealed class ExpertSlotManager : IDisposable
     private readonly object _lock = new();
     private bool _disposed;
 
+    // Opt-in warm-pinning of hot experts (SHARPI_MOE_WARMPIN=N). Disabled by default.
+    private readonly int _warmPinPerLayer;
+    private readonly long _warmPinAfter;
+    private readonly int _pinBudget;
+    private bool _warmed;
+
     public ExpertAccessProfiler Profiler => _profiler;
 
     /// <param name="gpu">Vulkan backend to allocate/free GPU tensors on.</param>
@@ -44,18 +50,32 @@ public ExpertSlotManager(VulkanBackend gpu, GgufModel model, ModelHyperparams hp
         _hp = hp;
         _dtypes = dtypes;
         _profiler = new ExpertAccessProfiler(hp.NumLayers, hp.NumExperts);
-        _cache = new ExpertCache<ExpertGpuSlot>(slotCapacity, EvictSlot);
+        // Frequency-aware eviction: under MoE routing skew, the least-accessed
+        // probationary expert is a better victim than the strict LRU tail.
+        _cache = new ExpertCache<ExpertGpuSlot>(slotCapacity, EvictSlot,
+            frequencyOf: _profiler.GetAccessCount);
+        _warmPinPerLayer = WarmPinConfig.PerLayer; // MaybeWarmPin is a no-op when this is <= 0
+        _warmPinAfter = WarmPinConfig.AfterAccesses; // profiled lookups (hits + misses) required before the pin pass runs
+        _pinBudget = Math.Max(1, slotCapacity / 2); // at most half the cache, but never below 1 — NOTE(review): slotCapacity == 1 yields a budget covering the whole cache; confirm that is intended
     }
 
     /// <summary>
     /// Return the GPU tensors for the given expert only if they are already cached.
     /// Does NOT load from disk on miss — use <see cref="GetOrLoad"/> for that.
+    /// Every lookup is also reported to the profiler as a hit or a miss, which feeds
+    /// frequency-aware eviction and warm-pinning on the Vulkan path (whose forward
+    /// pass never goes through <see cref="GetOrLoad"/>).
     /// Thread-safe.
     /// </summary>
     public bool TryGetCached(int layer, int expertId, out ExpertGpuSlot slot)
     {
         lock (_lock)
-            return _cache.TryGet(layer, expertId, out slot);
+        {
+            bool found = _cache.TryGet(layer, expertId, out slot);
+            if (!found) _profiler.RecordMiss(layer, expertId);
+            else _profiler.RecordHit(layer, expertId);
+            return found;
+        }
     }
 
     /// <summary>
@@ -75,14 +95,48 @@ public ExpertGpuSlot GetOrLoad(int layer, int expertId)
             _profiler.RecordMiss(layer, expertId);
             slot = UploadExpert(layer, expertId);
             _cache.Put(layer, expertId, slot);
+            MaybeWarmPin();
             return slot;
         }
     }
 
+    /// <summary>
+    /// One-shot warm-pin pass; the caller must already hold <c>_lock</c>. No-op when
+    /// warm-pinning is disabled, has already run, or fewer than <c>_warmPinAfter</c>
+    /// lookups (hits + misses) have been profiled. Otherwise walks layers from most- to
+    /// least-accessed and pins each layer's top <c>SHARPI_MOE_WARMPIN</c> experts that
+    /// are resident right now, so a tight pin budget goes to the busiest layers first.
+    /// </summary>
+    private void MaybeWarmPin()
+    {
+        if (_warmPinPerLayer <= 0 || _warmed) return;
+        if (_profiler.TotalHits + _profiler.TotalMisses < _warmPinAfter) return;
+        _warmed = true; // latch before pinning: this pass never repeats
+        var hottestFirst = new int[_hp.NumLayers];
+        for (int i = 0; i < hottestFirst.Length; i++) hottestFirst[i] = i;
+        Array.Sort(hottestFirst, (x, y) => _profiler.GetLayerAccessCount(y).CompareTo(_profiler.GetLayerAccessCount(x)));
+        int pinCount = 0;
+        for (int i = 0; i < hottestFirst.Length && pinCount < _pinBudget; i++)
+        {
+            int layer = hottestFirst[i];
+            // Sorted descending, so the first never-accessed layer means all remaining ones are cold too.
+            if (_profiler.GetLayerAccessCount(layer) == 0) break;
+            foreach (int expert in _profiler.GetTopExperts(layer, _warmPinPerLayer))
+            {
+                if (pinCount >= _pinBudget) break;
+                if (!_cache.Contains(layer, expert)) continue;
+                _cache.Pin(layer, expert);
+                pinCount++;
+            }
+        }
+    }
+
     /// <summary>
     /// Pre-load the given expert into the cache if not already present.
     /// Uses <see cref="VulkanBackend.UploadBackground"/> so it is safe to call
     /// from a background thread concurrently with the main recording session.
+    /// Also runs <see cref="MaybeWarmPin"/> so warm-pinning fires on the Vulkan
+    /// path (where <see cref="GetOrLoad"/> is never called by the forward pass).
     /// </summary>
     public void Preload(int layer, int expertId)
     {
@@ -93,6 +147,7 @@ public void Preload(int layer, int expertId)
                 var slot = UploadExpert(layer, expertId, background: true);
                 _cache.Put(layer, expertId, slot);
             }
+            MaybeWarmPin();
         }
     }
 
diff --git a/src/SharpInference.Engine/HybridForwardPass.cs b/src/SharpInference.Engine/HybridForwardPass.cs
index 66d4cd3..afd618c 100644
--- a/src/SharpInference.Engine/HybridForwardPass.cs
+++ b/src/SharpInference.Engine/HybridForwardPass.cs
@@ -97,6 +97,30 @@ public sealed unsafe class HybridForwardPass : IForwardPass
     // ── Expert slot cache (for MoE GPU layers with lazy/evictable expert loading) ──
     private ExpertSlotManager? _expertSlotManager;
     private MoEPrefetcher? _prefetcher;
+    // Next-layer predictive prefetch: records each layer's expert selection and prefetches
+    // the next GPU MoE layer's likely experts a layer ahead. ON by default (it only makes
+    // the already-on background prefetch smarter, and is a no-op when experts aren't being
+    // evicted); disable with SHARPI_MOE_PREDICT_PREFETCH=0 (or --no-moe-predict-prefetch).
+    private readonly ExpertRoutePredictor? _routePredictor;
+    private readonly bool _predictPrefetch = ParsePredictPrefetchFlag();
+
+    // Reads SHARPI_MOE_PREDICT_PREFETCH. Unset/empty and unrecognized values both mean ON
+    // (the latter after a stderr warning); only an explicit "off" token returns false.
+    private static bool ParsePredictPrefetchFlag()
+    {
+        var s = Environment.GetEnvironmentVariable("SHARPI_MOE_PREDICT_PREFETCH");
+        if (string.IsNullOrEmpty(s)) return true; // default on
+        switch (s.Trim().ToLowerInvariant())
+        {
+            case "0": case "false": case "off": case "no": case "disabled": return false;
+            case "1": case "true": case "on": case "yes": case "enabled": return true;
+            default:
+                Console.Error.WriteLine(
+                    $"[HybridForwardPass] SHARPI_MOE_PREDICT_PREFETCH='{s}' not recognized; defaulting to ON. " +
+                    "Accepted: 1/0, true/false, on/off, yes/no, enabled/disabled (case-insensitive).");
+                return true;
+        }
+    }
     // Pinned host-visible GPU tensor for uploading CPU fallback contributions to GPU hidden state.
     private Tensor? _gpuFallbackContrib;
     // Pinned host-visible GPU tensor for reading the norm buffer on CPU without a separate Download.
@@ -330,6 +354,8 @@ public HybridForwardPass(GgufModel model, VulkanBackend gpu, ModelHyperparams hp
                 : totalExperts;
             _expertSlotManager = new ExpertSlotManager(gpu, model, hp, capacity, _gpuWeightDTypes);
             _prefetcher = new MoEPrefetcher(_expertSlotManager);
+            if (_predictPrefetch)
+                _routePredictor = new ExpertRoutePredictor(_nGpuLayers, hp.NumActiveExperts);
             _gpuFallbackContrib = gpu.AllocatePinned(TensorShape.D1(_embDim));
             _gpuPinnedNorm      = gpu.AllocatePinned(TensorShape.D1(_embDim));
             Console.Error.WriteLine($"[HybridForwardPass] MoE expert slot cache: {capacity} slots ({hp.NumExperts} experts × {_nGpuLayers} layers), SLRU lazy-load.");
@@ -613,6 +639,7 @@ public void ResetCache()
             _cpuTqKvCache.Reset();
         else
             _cpuKvCache.Reset();
+        _routePredictor?.Reset();
     }
 
     /// <inheritdoc/>
@@ -1321,8 +1348,10 @@ private void GpuMoeFfn(int layer)
         // is idle (between EndRecordAndSubmit and the next BeginRecord).
         // Their weighted outputs are accumulated in _cpuFallbackBuf and
         // uploaded to the pre-allocated pinned tensor for GPU AddInPlace.
-        // Prefetch the same experts for the next token (1-token lookahead).
-        _prefetcher?.EnqueuePrefetch(layer, selectedExperts);
+
+        // Look up current-layer experts FIRST. TryGetCached promotes hits from
+        // probationary to protected, so the slots needed for this token are
+        // safe from the prefetcher's next Preload eviction below.
         Span<bool> isGpu = stackalloc bool[numActive];
         // ExpertGpuSlot contains Tensor (managed reference type fields) — heap-allocate.
         ExpertGpuSlot[] cachedSlots = new ExpertGpuSlot[numActive];
@@ -1334,6 +1363,23 @@ private void GpuMoeFfn(int layer)
             if (!isGpu[i]) hasCpuFallback = true;
         }
 
+        // Now enqueue prefetches. Order matters: the worker thread may race ahead
+        // and evict probationary slots — by promoting current-layer slots first
+        // (above), a next-layer prefetch can't evict what this token needs.
+        // Same-layer 1-token-ahead prefetch (cheap, always on).
+        _prefetcher?.EnqueuePrefetch(layer, selectedExperts);
+
+        // ── Next-layer predictive prefetch (on by default; opt-out) ──
+        // Record this layer's selection, then prefetch the *next* GPU MoE layer's likely
+        // experts (its previous-token selection) so they load while this layer finishes —
+        // a full layer of lead time. Best-effort: wrong guesses only waste a transfer.
+        if (_routePredictor is not null)
+        {
+            _routePredictor.Record(layer, selectedExperts);
+            if (layer + 1 < _nGpuLayers && _routePredictor.TryPredict(layer + 1, out var nextExperts))
+                _prefetcher?.EnqueuePrefetch(layer + 1, nextExperts);
+        }
+
         if (hasCpuFallback)
         {
             // _gpuPinnedNorm was populated by the GPU session above — map it directly,
@@ -1671,7 +1717,26 @@ public void Dispose()
         _cpuKvCache.Dispose();
         _cpuTqKvCache?.Dispose();
         _prefetcher?.Dispose();
-        _expertSlotManager?.Dispose();
+        if (_expertSlotManager is not null)
+        {
+            // SHARPI_EXPERT_STATS=<path>: parity with the CUDA hybrid forward passes
+            // (CudaHybridForwardPass/CudaHybridGdnForwardPass) so the CLI flag works
+            // on every MoE backend, not just CUDA.
+            var statsPath = Environment.GetEnvironmentVariable("SHARPI_EXPERT_STATS");
+            if (!string.IsNullOrEmpty(statsPath))
+            {
+                try
+                {
+                    using var w = new StreamWriter(statsPath);
+                    _expertSlotManager.Profiler.PrintStats(w);
+                }
+                catch (Exception ex)
+                {
+                    Console.Error.WriteLine($"[HybridForwardPass] Failed to write expert stats to {statsPath}: {ex.Message}");
+                }
+            }
+            _expertSlotManager.Dispose();
+        }
         if (_gpuFallbackContrib is not null) _gpu.Free(_gpuFallbackContrib);
         if (_gpuPinnedNorm is not null) _gpu.Free(_gpuPinnedNorm);
     }
diff --git a/src/SharpInference.Engine/MoEPrefetcher.cs b/src/SharpInference.Engine/MoEPrefetcher.cs
index 1eb9f44..30b52c7 100644
--- a/src/SharpInference.Engine/MoEPrefetcher.cs
+++ b/src/SharpInference.Engine/MoEPrefetcher.cs
@@ -47,7 +47,19 @@ private async Task RunAsync()
                 foreach (int expertId in batch.ExpertIds)
                 {
                     if (_cts.Token.IsCancellationRequested) return;
-                    _slotManager.Preload(batch.Layer, expertId);
+                    try
+                    {
+                        _slotManager.Preload(batch.Layer, expertId);
+                    }
+                    catch (Exception ex) when (ex is not OperationCanceledException)
+                    {
+                        // A single Preload failure (transient cudaMalloc / upload
+                        // fault / stale predicted expert) must not silently kill
+                        // the worker: prefetch is best-effort, but losing it mid-run
+                        // degrades throughput with no log to grep for.
+                        Console.Error.WriteLine(
+                            $"[MoEPrefetcher] Preload(layer={batch.Layer}, expert={expertId}) failed: {ex.Message}");
+                    }
                 }
             }
         }
diff --git a/src/SharpInference.Engine/WarmPinConfig.cs b/src/SharpInference.Engine/WarmPinConfig.cs
new file mode 100644
index 0000000..6d36df1
--- /dev/null
+++ b/src/SharpInference.Engine/WarmPinConfig.cs
@@ -0,0 +1,49 @@
+namespace SharpInference.Engine;
+
+/// <summary>
+/// Reads the opt-in warm-pinning configuration from the environment, once.
+/// <list type="bullet">
+///   <item><c>SHARPI_MOE_WARMPIN</c> — number of hottest experts to pin <i>per layer</i>
+///     into the GPU expert cache's protected segment. <c>0</c> (default) disables
+///     warm-pinning entirely, so behaviour is unchanged unless opted in.</item>
+///   <item><c>SHARPI_MOE_WARMPIN_AFTER</c> — number of expert accesses to observe
+///     before the warm set is chosen (default 512), so pinning reflects real routing
+///     rather than the first few cold tokens. Must be &gt; 0 (any positive value);
+///     0 / negative / malformed are rejected with a stderr message and the default
+///     applies.</item>
+/// </list>
+/// Shared by <see cref="ExpertSlotManager"/> (Vulkan) and <see cref="CudaExpertSlotManager"/>.
+/// Malformed values fall back to the default <i>and</i> log a warning to stderr so
+/// typos don't silently turn the feature off.
+/// </summary>
+internal static class WarmPinConfig
+{
+    /// <summary>Value of <c>SHARPI_MOE_WARMPIN</c>; defaults to 0 when unset or rejected.</summary>
+    public static readonly int PerLayer = ParseInt("SHARPI_MOE_WARMPIN", 0, allowZero: true);
+    /// <summary>Value of <c>SHARPI_MOE_WARMPIN_AFTER</c>; always &gt; 0, defaults to 512 when unset or rejected.</summary>
+    public static readonly long AfterAccesses = ParseLong("SHARPI_MOE_WARMPIN_AFTER", 512, allowZero: false);
+
+    /// <summary>Same contract as <see cref="ParseLong"/>, additionally rejecting values above <see cref="int.MaxValue"/>.</summary>
+    private static int ParseInt(string name, int fallback, bool allowZero) =>
+        (int)ParseLong(name, fallback, allowZero, int.MaxValue);
+
+    /// <summary>
+    /// Reads <paramref name="name"/> from the environment. Unset or empty returns <paramref name="fallback"/>
+    /// silently. A value that is malformed, negative, zero while <paramref name="allowZero"/> is false, or
+    /// above <paramref name="max"/> returns <paramref name="fallback"/> after a warning on stderr.
+    /// </summary>
+    private static long ParseLong(string name, long fallback, bool allowZero, long max = long.MaxValue)
+    {
+        var s = Environment.GetEnvironmentVariable(name);
+        if (string.IsNullOrEmpty(s)) return fallback;
+        // Invariant culture: the variable is machine-readable configuration, not localized input.
+        bool parsed = long.TryParse(s, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out long v);
+        if (!parsed || v < 0 || v > max || (!allowZero && v == 0))
+        {
+            Console.Error.WriteLine(
+                $"[WarmPinConfig] {name}='{s}' is not a {(allowZero ? "non-negative" : "positive")} integer; using default {fallback}.");
+            return fallback;
+        }
+        return v;
+    }
+}
diff --git a/src/SharpInference.Pipeline/ExpertAccessProfiler.cs b/src/SharpInference.Pipeline/ExpertAccessProfiler.cs
index 9652b6e..68a7507 100644
--- a/src/SharpInference.Pipeline/ExpertAccessProfiler.cs
+++ b/src/SharpInference.Pipeline/ExpertAccessProfiler.cs
@@ -63,6 +63,34 @@ public double GetLayerHitRate(int layer)
         return (hits + misses) == 0 ? 0.0 : (double)hits / (hits + misses);
     }
 
+    /// <summary>
+    /// Total access count (hits + misses) for one expert. Used as the popularity
+    /// signal for frequency-aware cache eviction.
+    /// </summary>
+    public long GetAccessCount(int layer, int expertId)
+    {
+        int slot = expertId + layer * _numExperts; // flat index: _numExperts consecutive counters per layer
+        return Interlocked.Read(ref _hits[slot]) + Interlocked.Read(ref _misses[slot]);
+    }
+
+    /// <summary>
+    /// Total access count for a whole layer (sum across experts). Used to rank
+    /// layers by hotness so warm-pinning budgets the hottest layers first instead
+    /// of iterating in layer-index order (which biases pins to low-index layers
+    /// for hybrid GDN+MoE models that cluster MoE FFN at high indices).
+    /// </summary>
+    public long GetLayerAccessCount(int layer)
+    {
+        int begin = layer * _numExperts; // the layer's counters occupy [begin, end) in both arrays
+        int end = begin + _numExperts;
+        long sum = 0;
+        for (int i = begin; i < end; i++)
+        {
+            sum += Interlocked.Read(ref _hits[i]) + Interlocked.Read(ref _misses[i]);
+        }
+        return sum;
+    }
+
     /// <summary>
     /// Returns the <paramref name="n"/> most-accessed expert IDs for <paramref name="layer"/>,
     /// sorted descending by total access count (hits + misses).
@@ -94,7 +122,22 @@ public void PrintStats(TextWriter output)
 
         for (int layer = 0; layer < _numLayers; layer++)
         {
-            double lr = GetLayerHitRate(layer);
+            long lh = 0, lm = 0;
+            int offset = layer * _numExperts;
+            for (int e = 0; e < _numExperts; e++)
+            {
+                lh += Interlocked.Read(ref _hits[offset + e]);
+                lm += Interlocked.Read(ref _misses[offset + e]);
+            }
+            // Layers that never report into the profiler (CPU-resident under hybrid
+            // offload, or non-MoE layers) have zero counts. Skip the bogus 0.0 % +
+            // arbitrary "top expert" output that would come from sorting equal zeros.
+            if (lh + lm == 0)
+            {
+                sb.AppendLine($"  layer {layer,3}: (no GPU SLRU accesses recorded)");
+                continue;
+            }
+            double lr = (double)lh / (lh + lm);
             var top = GetTopExperts(layer, 3);
             string topStr = string.Join(", ", top);
             sb.AppendLine($"  layer {layer,3}: hit {lr:P1}  top experts: [{topStr}]");
diff --git a/src/SharpInference.Pipeline/ExpertCache.cs b/src/SharpInference.Pipeline/ExpertCache.cs
index c76b998..f838a53 100644
--- a/src/SharpInference.Pipeline/ExpertCache.cs
+++ b/src/SharpInference.Pipeline/ExpertCache.cs
@@ -17,17 +17,26 @@ public sealed class ExpertCache<T> : IDisposable
     /// Optional callback invoked with the evicted value so the caller can release
     /// any associated GPU resources.
     /// </param>
-    public ExpertCache(int capacity, Action<T>? onEvict = null)
+    /// <param name="frequencyOf">
+    /// Optional per-(layer, expertId) access-count accessor. When supplied, eviction
+    /// becomes frequency-aware (least-accessed probationary expert is evicted first),
+    /// exploiting MoE routing skew. When null, eviction is plain LRU.
+    /// </param>
+    public ExpertCache(int capacity, Action<T>? onEvict = null,
+        Func<int, int, long>? frequencyOf = null)
     {
         if (capacity <= 0) throw new ArgumentOutOfRangeException(nameof(capacity));
         // Split: 25% probationary, 75% protected — biased toward retention since routing is skewed.
         int probCap = Math.Max(1, capacity / 4);
         int protCap = Math.Max(1, capacity - probCap);
-        _slru = new SlruCache<(int, int), T>(probCap, protCap);
+        Func<(int Layer, int ExpertId), long>? freq =
+            frequencyOf is null ? null : k => frequencyOf(k.Layer, k.ExpertId);
+        _slru = new SlruCache<(int, int), T>(probCap, protCap, freq);
         _onEvict = onEvict;
     }
 
     public int Count => _slru.Count;
+    public int PinnedCount => _slru.PinnedCount;
 
     /// <summary>Look up the cached value for (<paramref name="layer"/>, <paramref name="expertId"/>).</summary>
     public bool TryGet(int layer, int expertId, out T value) =>
@@ -47,6 +56,18 @@ public void Put(int layer, int expertId, T value)
     public bool Contains(int layer, int expertId) =>
         _slru.Contains((layer, expertId));
 
+    /// <summary>
+    /// Pin (<paramref name="layer"/>, <paramref name="expertId"/>) so it is never
+    /// evicted while resident. The expert must already be cached (call <see cref="Put"/>
+    /// first). Use for warm-pinning the hottest experts identified by profiling.
+    /// </summary>
+    public void Pin(int layer, int expertId) => _slru.Pin((layer, expertId));
+
+    /// <summary>Remove the pin on (<paramref name="layer"/>, <paramref name="expertId"/>).</summary>
+    public void Unpin(int layer, int expertId) => _slru.Unpin((layer, expertId));
+
+    public bool IsPinned(int layer, int expertId) => _slru.IsPinned((layer, expertId));
+
     /// <summary>
     /// Invoke <paramref name="action"/> for every currently-cached value, then clear the cache.
     /// Use this in dispose paths to release GPU resources without skipping the evict callback.
diff --git a/src/SharpInference.Pipeline/ExpertRoutePredictor.cs b/src/SharpInference.Pipeline/ExpertRoutePredictor.cs
new file mode 100644
index 0000000..ce1b45a
--- /dev/null
+++ b/src/SharpInference.Pipeline/ExpertRoutePredictor.cs
@@ -0,0 +1,63 @@
+namespace SharpInference.Pipeline;
+
+/// <summary>
+/// Predicts which experts an MoE layer will activate for the next token, using the
+/// previous token's selection at the same layer. MoE routing has strong cross-token
+/// temporal locality at a fixed layer (the same premise that makes the SLRU expert
+/// cache effective), so last-token selections are a cheap, training-free predictor
+/// — the PreScope / MoE-Infinity style of activation-aware prefetching.
+///
+/// <para>
+/// The win over reacting to the current layer's own router is <i>lead time</i>: while
+/// layer L is still computing, the predictor lets the prefetcher start loading layer
+/// L+1's likely experts, hiding the PCIe transfer behind compute. Predictions are only
+/// ever prefetch hints — a wrong guess wastes a transfer but never affects output, so
+/// no accuracy bound is required.
+/// </para>
+///
+/// Pure CPU state; not thread-safe (call from the single decode thread).
+/// </summary>
+public sealed class ExpertRoutePredictor
+{
+    private readonly int _numLayers;
+    private readonly int _maxActive;
+    private readonly int[] _experts;  // flattened rows, indexed layer * _maxActive + k
+    private readonly int[] _counts;   // entries recorded per layer; 0 means nothing recorded
+
+    public ExpertRoutePredictor(int numLayers, int maxActiveExperts)
+    {
+        if (numLayers <= 0) throw new ArgumentOutOfRangeException(nameof(numLayers));
+        if (maxActiveExperts <= 0) throw new ArgumentOutOfRangeException(nameof(maxActiveExperts));
+        (_numLayers, _maxActive) = (numLayers, maxActiveExperts);
+        // One fixed-width row of maxActiveExperts slots per layer, plus a fill count per row.
+        _experts = new int[numLayers * maxActiveExperts];
+        _counts = new int[numLayers];
+    }
+
+    /// <summary>Store the experts <paramref name="layer"/> selected for the current token; out-of-range layers are ignored.</summary>
+    public void Record(int layer, ReadOnlySpan<int> selected)
+    {
+        if (layer < 0 || layer >= _numLayers) return;
+        int kept = selected.Length < _maxActive ? selected.Length : _maxActive; // entries beyond _maxActive are dropped
+        selected.Slice(0, kept).CopyTo(new Span<int>(_experts, layer * _maxActive, kept));
+        _counts[layer] = kept;
+    }
+
+    /// <summary>
+    /// Return the selection most recently recorded for <paramref name="layer"/> as the guess for
+    /// its next token. False (with an empty span) when the layer is out of range or has no entries.
+    /// </summary>
+    public bool TryPredict(int layer, out ReadOnlySpan<int> experts)
+    {
+        bool known = layer >= 0 && layer < _numLayers && _counts[layer] != 0;
+        // The span is a view over the internal buffer (no copy): a later Record for the
+        // same layer overwrites what it shows.
+        experts = known
+            ? new ReadOnlySpan<int>(_experts, layer * _maxActive, _counts[layer])
+            : default;
+        return known;
+    }
+
+    /// <summary>Drop all recorded selections (call when starting a new sequence / on cache reset).</summary>
+    public void Reset() => Array.Clear(_counts, 0, _counts.Length);
+}
diff --git a/src/SharpInference.Pipeline/MoeCacheSizing.cs b/src/SharpInference.Pipeline/MoeCacheSizing.cs
new file mode 100644
index 0000000..966dad7
--- /dev/null
+++ b/src/SharpInference.Pipeline/MoeCacheSizing.cs
@@ -0,0 +1,84 @@
+namespace SharpInference.Pipeline;
+
+/// <summary>
+/// Sizes the GPU expert SLRU cache for a MoE model. Pure, deterministic, and unit-tested
+/// so the policy is verifiable without a GPU.
+///
+/// <para>
+/// Capacity is bounded below by 1 (the cache must function even if the budget cannot fit
+/// a single expert — callers detect this case via <see cref="MoeCachePlan.Status"/>) and
+/// above by the total GPU-layer expert count; otherwise it is the most the VRAM budget
+/// allows. Separately we compute a <see cref="MoeCachePlan.RecommendedSlots"/> from the
+/// routing-locality finding of "Not All Models Suit Expert Offloading" (arXiv:2505.16056):
+/// a cache of roughly <c>2 × active-experts</c> per layer covers a token segment well.
+/// When the budget forces capacity below that, callers should warn that hit rate may
+/// suffer (fewer GPU layers or more VRAM would help) rather than silently underperform.
+/// </para>
+/// </summary>
+public static class MoeCacheSizing
+{
+    public static MoeCachePlan Plan(
+        int gpuLayers, int numExperts, int numActiveExperts,
+        long freeVramBytes, long perExpertBytes, long reserveBytes)
+    {
+        long total = (long)gpuLayers * numExperts;
+        if (total <= 0) return new MoeCachePlan(0, 0, 0, MoeCacheSizingStatus.Empty);
+
+        // Locality target: ~2× active experts per GPU layer, never more than a layer's full set.
+        long localityTarget = (long)gpuLayers * Math.Min(numExperts, 2 * numActiveExperts);
+
+        long byBudget;
+        MoeCacheSizingStatus status;
+        if (perExpertBytes > 0)
+        {
+            // Whole experts that fit in the VRAM left after the reserve; a negative quotient counts as zero.
+            byBudget = Math.Max(0, (freeVramBytes - reserveBytes) / perExpertBytes);
+            status = byBudget switch
+            {
+                0 => MoeCacheSizingStatus.BudgetExhausted,
+                _ when byBudget < localityTarget => MoeCacheSizingStatus.BelowRecommended,
+                _ => MoeCacheSizingStatus.Ok,
+            };
+        }
+        else
+        {
+            // The expert size is unknown, so the budget cannot be evaluated: admit every
+            // expert and report it through the status rather than doing so silently.
+            byBudget = total;
+            status = MoeCacheSizingStatus.UnknownExpertSize;
+        }
+
+        // Clamp into [1, total]: an exhausted budget still yields one slot, and Status is
+        // the caller's only signal that this happened.
+        int capacity = (int)Math.Clamp(byBudget, 1, total);
+        long recommended = Math.Min(total, localityTarget);
+
+        return new MoeCachePlan(capacity, (int)recommended, (int)Math.Min(byBudget, int.MaxValue), status);
+    }
+}
+
+/// <summary>
+/// Outcome categories for <see cref="MoeCacheSizing.Plan"/>.
+/// </summary>
+public enum MoeCacheSizingStatus
+{
+    /// <summary>No GPU layers or no experts — nothing to size.</summary>
+    Empty,
+    /// <summary>Budget meets or exceeds the locality recommendation; cache fits the working set.</summary>
+    Ok,
+    /// <summary>Budget fits at least one expert but falls below the routing-locality recommendation.</summary>
+    BelowRecommended,
+    /// <summary>VRAM budget cannot fit even one expert; capacity was clamped to 1.</summary>
+    BudgetExhausted,
+    /// <summary>Per-expert size was not positive (unknown); capacity fell back to the total expert count.</summary>
+    UnknownExpertSize,
+}
+
+/// <summary>
+/// Result of <see cref="MoeCacheSizing.Plan"/>. <paramref name="Slots"/> is the capacity to
+/// use; <paramref name="RecommendedSlots"/> is the locality-based target (warn if Slots is
+/// materially below it); <paramref name="BudgetSlots"/> is how many slots the VRAM budget
+/// alone would allow, before the clamp to at least 1 (so 0 when not even one expert fits);
+/// <paramref name="Status"/> tells a normal result from the clamped-to-1 and unknown-size cases.
+/// </summary>
+public readonly record struct MoeCachePlan(int Slots, int RecommendedSlots, int BudgetSlots, MoeCacheSizingStatus Status);
diff --git a/src/SharpInference.Pipeline/SlruCache.cs b/src/SharpInference.Pipeline/SlruCache.cs
index 472efb8..6392df1 100644
--- a/src/SharpInference.Pipeline/SlruCache.cs
+++ b/src/SharpInference.Pipeline/SlruCache.cs
@@ -4,7 +4,21 @@ namespace SharpInference.Pipeline;
 /// Segmented LRU (SLRU) cache with probationary and protected segments.
 /// New items enter the probationary segment. Items accessed in probationary
 /// are promoted to the protected segment, exploiting temporal locality.
-/// Eviction always targets the tail of the probationary segment.
+/// Eviction targets the probationary segment.
+///
+/// Two optional refinements (both default-off, so behaviour is plain SLRU
+/// unless configured):
+/// <list type="bullet">
+///   <item><b>Frequency-aware eviction</b> — when a <c>frequencyOf</c> accessor is
+///     supplied, the probationary victim is the <i>least-frequently-accessed</i>
+///     entry (recency breaks ties), rather than the strict LRU tail. This biases
+///     the cache toward keeping hot experts resident under MoE routing skew. The
+///     most-recently-inserted entry is never chosen, avoiding the LFU cold-start
+///     trap of evicting the item we just loaded.</item>
+///   <item><b>Pinning</b> — pinned keys live in the protected segment and are
+///     never evicted or demoted while resident, so a warm set of hot experts can
+///     be guaranteed in fast memory.</item>
+/// </list>
 /// </summary>
 public sealed class SlruCache<TKey, TValue> where TKey : notnull
 {
@@ -12,24 +26,33 @@ public sealed class SlruCache<TKey, TValue> where TKey : notnull
 
     private readonly int _probCapacity;
     private readonly int _protCapacity;
+    private readonly Func<TKey, long>? _frequencyOf;
 
     private readonly LinkedList<Entry> _prob = new();
     private readonly LinkedList<Entry> _prot = new();
     private readonly Dictionary<TKey, LinkedListNode<Entry>> _probIndex = new();
     private readonly Dictionary<TKey, LinkedListNode<Entry>> _protIndex = new();
+    private readonly HashSet<TKey> _pinned = new();
 
     public int Count => _prob.Count + _prot.Count;
     public int ProbationaryCount => _prob.Count;
     public int ProtectedCount => _prot.Count;
+    public int PinnedCount => _pinned.Count;
 
     /// <param name="probationaryCapacity">Slots reserved for newly-inserted (cold) items.</param>
     /// <param name="protectedCapacity">Slots reserved for promoted (hot) items.</param>
-    public SlruCache(int probationaryCapacity, int protectedCapacity)
+    /// <param name="frequencyOf">
+    /// Optional access-count accessor enabling frequency-aware eviction. When null,
+    /// eviction is plain LRU (probationary tail).
+    /// </param>
+    public SlruCache(int probationaryCapacity, int protectedCapacity,
+        Func<TKey, long>? frequencyOf = null)
     {
         if (probationaryCapacity <= 0) throw new ArgumentOutOfRangeException(nameof(probationaryCapacity));
         if (protectedCapacity <= 0) throw new ArgumentOutOfRangeException(nameof(protectedCapacity));
         _probCapacity = probationaryCapacity;
         _protCapacity = protectedCapacity;
+        _frequencyOf = frequencyOf;
     }
 
     /// <summary>Look up <paramref name="key"/>. Promotes probationary hits to protected.</summary>
@@ -46,24 +69,44 @@ public bool TryGet(TKey key, out TValue value)
 
         if (_probIndex.TryGetValue(key, out var probNode))
         {
-            _prob.Remove(probNode);
-            _probIndex.Remove(key);
+            value = probNode.Value.Value;
 
-            // If protected is full, demote its tail to the head of probationary.
-            // The probationary count went down by one (we removed probNode), so
-            // adding the demoted entry keeps probationary ≤ _probCapacity.
-            if (_prot.Count >= _protCapacity)
+            if (_prot.Count < _protCapacity)
+            {
+                // Room in protected: promote directly.
+                _prob.Remove(probNode);
+                _probIndex.Remove(key);
+                _prot.AddFirst(probNode.Value);
+                _protIndex[key] = _prot.First!;
+                return true;
+            }
+
+            // Protected is full: demote the tail-most UNPINNED protected entry to
+            // make room. The probationary count goes -1 (remove probNode) +1 (demoted
+            // in) = unchanged, so it stays ≤ _probCapacity.
+            var demoteNode = LastUnpinnedProtected();
+            if (demoteNode is not null)
             {
-                var demoted = _prot.Last!.Value;
-                _prot.RemoveLast();
+                _prob.Remove(probNode);
+                _probIndex.Remove(key);
+
+                var demoted = demoteNode.Value;
+                _prot.Remove(demoteNode);
                 _protIndex.Remove(demoted.Key);
                 _prob.AddFirst(demoted);
                 _probIndex[demoted.Key] = _prob.First!;
-            }
 
-            _prot.AddFirst(probNode.Value);
-            _protIndex[key] = _prot.First!;
-            value = probNode.Value.Value;
+                _prot.AddFirst(probNode.Value);
+                _protIndex[key] = _prot.First!;
+            }
+            else
+            {
+                // Every protected slot is pinned — cannot promote. Refresh recency
+                // within probationary instead.
+                _prob.Remove(probNode);
+                _prob.AddFirst(probNode.Value);
+                _probIndex[key] = _prob.First!;
+            }
             return true;
         }
 
@@ -71,6 +114,14 @@ public bool TryGet(TKey key, out TValue value)
         return false;
     }
 
+    /// <summary>Tail-most protected node that is not pinned, or null if all are pinned.</summary>
+    private LinkedListNode<Entry>? LastUnpinnedProtected()
+    {
+        var cursor = _prot.Last;
+        while (cursor is not null && _pinned.Contains(cursor.Value.Key)) cursor = cursor.Previous;
+        return cursor; // null once the walk passes the head, i.e. every protected entry is pinned
+    }
+
     /// <summary>
     /// Insert <paramref name="key"/> → <paramref name="value"/> into the probationary segment.
     /// If insertion causes the probationary segment to exceed capacity, the LRU tail is evicted
@@ -84,11 +135,12 @@ public bool Put(TKey key, TValue value, out TKey evictedKey, out TValue evictedV
 
         if (_prob.Count > _probCapacity)
         {
-            var victim = _prob.Last!.Value;
-            _probIndex.Remove(victim.Key);
-            _prob.RemoveLast();
-            evictedKey = victim.Key;
-            evictedValue = victim.Value;
+            var victim = SelectProbationaryVictim();
+            _probIndex.Remove(victim.Value.Key);
+            _prob.Remove(victim);
+            _pinned.Remove(victim.Value.Key); // defensive; pinned entries live in protected
+            evictedKey = victim.Value.Key;
+            evictedValue = victim.Value.Value;
             return true;
         }
 
@@ -97,6 +149,80 @@ public bool Put(TKey key, TValue value, out TKey evictedKey, out TValue evictedV
         return false;
     }
 
+    /// <summary>
+    /// Choose which probationary entry to evict. Never the most-recently-inserted
+    /// entry (<see cref="LinkedList{T}.First"/>) and never a pinned entry. With a
+    /// frequency accessor, picks the least-frequently-accessed candidate (older
+    /// entry breaks ties); otherwise the LRU tail.
+    /// </summary>
+    private LinkedListNode<Entry> SelectProbationaryVictim()
+    {
+        // Candidates are scanned oldest-first; the just-inserted head is never a candidate.
+        var newest = _prob.First;
+        LinkedListNode<Entry>? coldest = null;
+        long coldestFreq = long.MaxValue;
+        var cursor = _prob.Last;
+        while (cursor is not null && cursor != newest)
+        {
+            if (!_pinned.Contains(cursor.Value.Key))
+            {
+                // Without a frequency accessor the oldest unpinned candidate is the victim.
+                if (_frequencyOf is null) return cursor;
+                long seen = _frequencyOf(cursor.Value.Key);
+                // Strict '<' keeps the older candidate when two counts are equal.
+                if (seen < coldestFreq) { coldestFreq = seen; coldest = cursor; }
+            }
+            cursor = cursor.Previous;
+        }
+        // No candidate was selected (e.g. every other entry is pinned): fall back to the tail.
+        return coldest ?? _prob.Last!;
+    }
+
+    /// <summary>
+    /// Pin <paramref name="key"/> so it is never evicted or demoted while resident.
+    /// Pinned entries are moved into the protected segment. No-op if the key is not
+    /// currently resident (load it via <see cref="Put"/> first) or already pinned.
+    /// </summary>
+    public void Pin(TKey key)
+    {
+        if (_pinned.Contains(key)) return;
+
+        if (!_protIndex.ContainsKey(key))
+        {
+            // Not in protected: it has to be in probationary, otherwise the key is not
+            // resident and there is nothing to pin.
+            if (!_probIndex.TryGetValue(key, out var resident)) return;
+
+            _prob.Remove(resident);
+            _probIndex.Remove(key);
+
+            // A full protected segment hands its tail-most unpinned entry back to the head
+            // of probationary. When every protected entry is pinned nothing is displaced and
+            // protected grows past its nominal capacity; keeping the number of pins within
+            // that capacity is left to the caller.
+            var displaced = _prot.Count >= _protCapacity ? LastUnpinnedProtected() : null;
+            if (displaced is not null)
+            {
+                var entry = displaced.Value;
+                _prot.Remove(displaced);
+                _protIndex.Remove(entry.Key);
+                _prob.AddFirst(entry);
+                _probIndex[entry.Key] = _prob.First!;
+            }
+
+            _prot.AddFirst(resident.Value);
+            _protIndex[key] = _prot.First!;
+        }
+
+        // Reached both for keys already in protected and for keys just moved there.
+        _pinned.Add(key);
+    }
+
+    /// <summary>Remove the pin on <paramref name="key"/> (entry stays resident in protected).</summary>
+    public void Unpin(TKey key) => _pinned.Remove(key);
+
+    public bool IsPinned(TKey key) => _pinned.Contains(key);
+
     public bool Contains(TKey key) =>
         _protIndex.ContainsKey(key) || _probIndex.ContainsKey(key);
 
@@ -110,5 +236,6 @@ public void Clear()
         _probIndex.Clear();
         _prot.Clear();
         _protIndex.Clear();
+        _pinned.Clear();
     }
 }
diff --git a/tests/SharpInference.Tests.Pipeline/PipelineTests.cs b/tests/SharpInference.Tests.Pipeline/PipelineTests.cs
index 98469b8..bf04ef7 100644
--- a/tests/SharpInference.Tests.Pipeline/PipelineTests.cs
+++ b/tests/SharpInference.Tests.Pipeline/PipelineTests.cs
@@ -101,5 +101,299 @@ public void ExpertAccessProfiler_TopExperts_OrderedByAccess()
         Assert.Equal(3, top[0]); // most accesses (3)
         Assert.Equal(2, top[1]); // second most accesses (2)
     }
+
+    [Fact]
+    public void ExpertAccessProfiler_GetAccessCount_SumsHitsAndMisses()
+    {
+        const int layer = 1, expert = 2;
+        var sut = new SharpInference.Pipeline.ExpertAccessProfiler(numLayers: 2, numExperts: 4);
+        for (int hit = 0; hit < 2; hit++) sut.RecordHit(layer, expert);
+        sut.RecordMiss(layer, expert);
+        Assert.Equal(3, sut.GetAccessCount(layer, expert)); // two hits plus one miss
+        Assert.Equal(0, sut.GetAccessCount(0, expert));     // same expert index on layer 0 was never recorded
+    }
+
+    // ── Frequency-aware eviction ───────────────────────────────────────────
+
+    [Fact]
+    public void SlruCache_FrequencyAware_EvictsLeastAccessed_NotLruTail()
+    {
+        // Access counts handed to the cache: key 1 is hot (100), keys 2 and 3 are cold (1 each);
+        // any other key reads as 0 through GetValueOrDefault.
+        var accessCounts = new System.Collections.Generic.Dictionary<int, long> { [1] = 100, [2] = 1, [3] = 1 };
+        var slru = new SharpInference.Pipeline.SlruCache<int, string>(
+            probationaryCapacity: 3, protectedCapacity: 1, frequencyOf: key => accessCounts.GetValueOrDefault(key));
+        slru.Put(1, "hot", out _, out _);    // oldest insert, highest access count
+        slru.Put(2, "cold-a", out _, out _);
+        slru.Put(3, "cold-b", out _, out _);
+        // The 4th insert overflows probationary. Pure LRU would drop key 1 (the oldest); the
+        // frequency-aware policy is expected to drop key 2 instead and keep the hot key.
+        Assert.True(slru.Put(4, "new", out var victimKey, out _));
+        Assert.Equal(2, victimKey);
+        Assert.True(slru.Contains(1));       // the hot key outlived the overflow
+    }
+
+    [Fact]
+    public void SlruCache_FrequencyAware_NeverEvictsJustInsertedEntry()
+    {
+        // Key 99 is absent from the table, so its frequency reads as 0 — lower than every
+        // resident key — yet the entry that was just inserted must not become the victim.
+        var accessCounts = new System.Collections.Generic.Dictionary<int, long> { [1] = 5, [2] = 5 };
+        var slru = new SharpInference.Pipeline.SlruCache<int, string>(
+            probationaryCapacity: 2, protectedCapacity: 1, frequencyOf: key => accessCounts.GetValueOrDefault(key));
+        slru.Put(1, "a", out _, out _);
+        slru.Put(2, "b", out _, out _);
+        Assert.True(slru.Put(99, "fresh", out var victimKey, out _));
+        Assert.NotEqual(99, victimKey);  // some older entry was evicted instead
+        Assert.True(slru.Contains(99));
+    }
+
+    // ── Pinning ────────────────────────────────────────────────────────────
+
+    [Fact]
+    public void SlruCache_Pin_MovesToProtectedAndSurvivesEviction()
+    {
+        const int pinnedKey = 1;
+        var slru = new SharpInference.Pipeline.SlruCache<int, string>(probationaryCapacity: 2, protectedCapacity: 2);
+        slru.Put(pinnedKey, "pinme", out _, out _);
+        slru.Pin(pinnedKey);
+        Assert.True(slru.IsPinned(pinnedKey));
+        Assert.Equal(1, slru.ProtectedCount); // the pin relocated the entry into protected
+        Assert.Equal(1, slru.PinnedCount);
+        // Twenty further inserts (keys 10..29) churn the cache; the pinned key has to stay resident.
+        for (int churnKey = 10; churnKey < 30; churnKey++)
+            slru.Put(churnKey, $"v{churnKey}", out _, out _);
+        Assert.True(slru.Contains(pinnedKey));
+    }
+
+    [Fact]
+    public void SlruCache_Pin_NotEvictedAndNotChosenAsVictim()
+    {
+        var slru = new SharpInference.Pipeline.SlruCache<int, string>(probationaryCapacity: 2, protectedCapacity: 1);
+        slru.Put(1, "a", out _, out _);
+        slru.Pin(1); // key 1 is pinned from here on
+        slru.Put(2, "b", out _, out _);
+        slru.Put(3, "c", out _, out _);
+        // Keys 2, 3 and 4 exceed the probationary capacity of 2, so this insert must evict something...
+        Assert.True(slru.Put(4, "d", out var victimKey, out _));
+        Assert.NotEqual(1, victimKey); // ...but never the pinned key
+        Assert.True(slru.Contains(1));
+    }
+
+    [Fact]
+    public void SlruCache_Unpin_AllowsEvictionAgain()
+    {
+        var slru = new SharpInference.Pipeline.SlruCache<int, string>(probationaryCapacity: 1, protectedCapacity: 1);
+        slru.Put(1, "a", out _, out _);
+        slru.Pin(1);
+        Assert.True(slru.IsPinned(1));     // the pin is recorded
+        slru.Unpin(1);
+        Assert.False(slru.IsPinned(1));    // the pin is gone again
+        Assert.Equal(0, slru.PinnedCount); // NOTE(review): only pin bookkeeping is asserted; an actual eviction of key 1 after Unpin is not exercised
+    }
+
+    [Fact]
+    public void SlruCache_Pin_NonResidentKey_IsNoOp()
+    {
+        var emptyCache = new SharpInference.Pipeline.SlruCache<int, string>(probationaryCapacity: 2, protectedCapacity: 2);
+        emptyCache.Pin(42); // nothing was ever Put, so key 42 is not resident
+        Assert.False(emptyCache.IsPinned(42));
+        Assert.Equal(0, emptyCache.PinnedCount);
+    }
+
+    [Fact]
+    public void ExpertCache_Pin_KeepsHotExpertResidentUnderChurn()
+    {
+        var evicted = new System.Collections.Generic.List<string>();
+        // `using` disposes the cache at scope exit even when an assertion below throws
+        // (a trailing Dispose() call would be skipped on a failing assert).
+        using var cache = new SharpInference.Pipeline.ExpertCache<string>(capacity: 4, onEvict: evicted.Add);
+        cache.Put(0, 7, "hot");
+        cache.Pin(0, 7);
+        Assert.True(cache.IsPinned(0, 7));
+        for (int e = 100; e < 130; e++)   // 30 unpinned inserts against a capacity of 4
+            cache.Put(0, e, $"e{e}");
+        Assert.True(cache.TryGet(0, 7, out var v));
+        Assert.Equal("hot", v);
+        Assert.DoesNotContain("hot", evicted); // checked before disposal, as in the original ordering
+    }
+
+    [Fact]
+    public void ExpertCache_FrequencyAware_EvictsLeastAccessedExpert()
+    {
+        var accessCounts = new System.Collections.Generic.Dictionary<(int, int), long>
+        {
+            { (0, 1), 50 }, { (0, 2), 1 }, { (0, 3), 1 },
+        };
+        string? lastEvicted = null;
+        var experts = new SharpInference.Pipeline.ExpertCache<string>(
+            capacity: 8, onEvict: value => lastEvicted = value,
+            frequencyOf: (layer, expert) => accessCounts.GetValueOrDefault((layer, expert)));
+        // A capacity of 8 is split into probCap=2 / protCap=6, so the first two inserts both
+        // sit in probationary and the third overflows it. The victim must be the least-accessed
+        // non-head entry, i.e. "cold-a" (1 access) rather than "hot" (50). With probCap=1 the
+        // head-exclusion alone would force the eviction and the frequency path would go untested.
+        experts.Put(0, 1, "hot");
+        experts.Put(0, 2, "cold-a");
+        experts.Put(0, 3, "cold-b");
+        Assert.Equal("cold-a", lastEvicted);  // the cold expert went, not the hot one
+        Assert.True(experts.Contains(0, 1));  // the hot expert is still cached
+    }
+
+    // ── Predictive prefetch: ExpertRoutePredictor ───────────────────────────
+
+    [Fact]
+    public void ExpertRoutePredictor_UnseenLayer_PredictsNothing()
+    {
+        var predictor = new SharpInference.Pipeline.ExpertRoutePredictor(numLayers: 4, maxActiveExperts: 8);
+        Assert.False(predictor.TryPredict(0, out _)); // nothing has been recorded for layer 0
+    }
+
+    [Fact]
+    public void ExpertRoutePredictor_RecallsLastSelection()
+    {
+        var predictor = new SharpInference.Pipeline.ExpertRoutePredictor(numLayers: 4, maxActiveExperts: 8);
+        predictor.Record(2, stackalloc[] { 5, 9, 13 });
+        Assert.True(predictor.TryPredict(2, out var recalled)); // layer 2 has a recorded selection
+        Assert.Equal(new[] { 5, 9, 13 }, recalled.ToArray());
+        Assert.False(predictor.TryPredict(3, out _));           // layer 3 was never recorded
+    }
+
+    [Fact]
+    public void ExpertRoutePredictor_LatestRecordWins()
+    {
+        var predictor = new SharpInference.Pipeline.ExpertRoutePredictor(numLayers: 2, maxActiveExperts: 4);
+        predictor.Record(0, stackalloc[] { 1, 2 });
+        predictor.Record(0, stackalloc[] { 7, 8, 9 }); // a later selection for the same layer supersedes the earlier one
+        Assert.True(predictor.TryPredict(0, out var recalled));
+        Assert.Equal(new[] { 7, 8, 9 }, recalled.ToArray());
+    }
+
+    [Fact]
+    public void ExpertRoutePredictor_Reset_ClearsHistory()
+    {
+        var predictor = new SharpInference.Pipeline.ExpertRoutePredictor(numLayers: 2, maxActiveExperts: 4);
+        predictor.Record(1, stackalloc[] { 3 });
+        predictor.Reset();
+        Assert.False(predictor.TryPredict(1, out _)); // the layer-1 record made before Reset is gone
+    }
+
+    [Fact]
+    public void ExpertRoutePredictor_ClampsToMaxActive()
+    {
+        var predictor = new SharpInference.Pipeline.ExpertRoutePredictor(numLayers: 1, maxActiveExperts: 2);
+        predictor.Record(0, stackalloc[] { 4, 5, 6, 7 }); // four ids recorded, but maxActiveExperts is 2
+        Assert.True(predictor.TryPredict(0, out var recalled));
+        Assert.Equal(2, recalled.Length);
+        Assert.Equal(new[] { 4, 5 }, recalled.ToArray()); // only the first two ids are kept
+    }
+
+    // ── Model-aware cache sizing: MoeCacheSizing ────────────────────────────
+
+    [Fact]
+    public void MoeCacheSizing_AmpleVram_CapsAtTotalExperts()
+    {
+        const int totalExperts = 8 * 64; // 8 GPU layers × 64 experts each = 512
+        var sizing = SharpInference.Pipeline.MoeCacheSizing.Plan(
+            gpuLayers: 8, numExperts: 64, numActiveExperts: 8,
+            freeVramBytes: 100L << 30, perExpertBytes: 2L << 20, reserveBytes: 512L << 20);
+        Assert.Equal(totalExperts, sizing.Slots);        // ample free VRAM → capacity equals the total
+        Assert.True(sizing.BudgetSlots >= totalExperts);
+    }
+
+    [Fact]
+    public void MoeCacheSizing_TightVram_NeverExceedsBudget()
+    {
+        // Free VRAM = reserve + room for exactly 50 experts, so the budget (not the expert total) is the limit.
+        const long reserve = 512L << 20, expertBytes = 2L << 20;
+        long freeBytes = reserve + 50 * expertBytes;
+        var sizing = SharpInference.Pipeline.MoeCacheSizing.Plan(
+            gpuLayers: 8, numExperts: 64, numActiveExperts: 8,
+            freeVramBytes: freeBytes, perExpertBytes: expertBytes, reserveBytes: reserve);
+        Assert.Equal(50, sizing.Slots); // precisely the number of experts that fit
+        Assert.True(sizing.Slots <= sizing.BudgetSlots);
+    }
+
+    [Fact]
+    public void MoeCacheSizing_RecommendedIsTwiceActivePerLayer()
+    {
+        const int layers = 8, active = 8;
+        var sizing = SharpInference.Pipeline.MoeCacheSizing.Plan(gpuLayers: layers, numExperts: 64, numActiveExperts: active,
+            freeVramBytes: 100L << 30, perExpertBytes: 2L << 20, reserveBytes: 512L << 20);
+        Assert.Equal(layers * (2 * active), sizing.RecommendedSlots); // 8 layers × (2 × 8 active) = 128
+    }
+
+    [Fact]
+    public void MoeCacheSizing_TightVram_FlagsBelowRecommended()
+    {
+        const long reserve = 512L << 20, expertBytes = 2L << 20;
+        long freeBytes = reserve + 50 * expertBytes; // the reserve plus room for 50 experts
+        var sizing = SharpInference.Pipeline.MoeCacheSizing.Plan(
+            gpuLayers: 8, numExperts: 64, numActiveExperts: 8,
+            freeVramBytes: freeBytes, perExpertBytes: expertBytes, reserveBytes: reserve);
+        Assert.True(sizing.Slots < sizing.RecommendedSlots); // 50 < 128, the condition a caller would warn on
+    }
+
+    [Fact]
+    public void MoeCacheSizing_RecommendedCappedAtTotal_WhenFewExperts()
+    {
+        const int layers = 4, experts = 8; // 2× the 8 active experts (16) exceeds the 8 experts that exist
+        var sizing = SharpInference.Pipeline.MoeCacheSizing.Plan(
+            gpuLayers: layers, numExperts: experts, numActiveExperts: 8,
+            freeVramBytes: 100L << 30, perExpertBytes: 1L << 20, reserveBytes: 0);
+        Assert.Equal(layers * experts, sizing.RecommendedSlots); // per-layer value capped at numExperts: 4 × 8, not 4 × 16
+    }
+
+    [Fact]
+    public void MoeCacheSizing_ZeroLayers_ReturnsZero()
+    {
+        var sizing = SharpInference.Pipeline.MoeCacheSizing.Plan(gpuLayers: 0, numExperts: 64, numActiveExperts: 8, freeVramBytes: 1L << 30, perExpertBytes: 1L << 20, reserveBytes: 0);
+        Assert.Equal(0, sizing.Slots); // no GPU layers → no slots
+        Assert.Equal(SharpInference.Pipeline.MoeCacheSizingStatus.Empty, sizing.Status);
+    }
+
+    [Fact]
+    public void MoeCacheSizing_AmpleVram_StatusIsOk()
+    {
+        // 100 GiB free against a 512 MiB reserve and 2 MiB per expert: nothing is constrained.
+        var sizing = SharpInference.Pipeline.MoeCacheSizing.Plan(gpuLayers: 8, numExperts: 64, numActiveExperts: 8,
+            freeVramBytes: 100L << 30, perExpertBytes: 2L << 20, reserveBytes: 512L << 20);
+        Assert.Equal(SharpInference.Pipeline.MoeCacheSizingStatus.Ok, sizing.Status);
+    }
+
+    [Fact]
+    public void MoeCacheSizing_TightVram_StatusIsBelowRecommended()
+    {
+        const long reserve = 512L << 20, expertBytes = 2L << 20;
+        long freeBytes = reserve + 50 * expertBytes; // the reserve plus room for 50 experts
+        var sizing = SharpInference.Pipeline.MoeCacheSizing.Plan(
+            gpuLayers: 8, numExperts: 64, numActiveExperts: 8,
+            freeVramBytes: freeBytes, perExpertBytes: expertBytes, reserveBytes: reserve);
+        Assert.Equal(SharpInference.Pipeline.MoeCacheSizingStatus.BelowRecommended, sizing.Status);
+    }
+
+    [Fact]
+    public void MoeCacheSizing_BudgetExhausted_ClampsToOneAndFlagsStatus()
+    {
+        // The reserve equals the free VRAM, leaving a budget of 0 slots; Slots is still reported
+        // as 1, so the BudgetExhausted status is what tells the caller about the shortfall.
+        const long reserve = 512L << 20;
+        var sizing = SharpInference.Pipeline.MoeCacheSizing.Plan(gpuLayers: 8, numExperts: 64, numActiveExperts: 8,
+            freeVramBytes: reserve, perExpertBytes: 2L << 20, reserveBytes: reserve);
+        Assert.Equal(1, sizing.Slots);
+        Assert.Equal(0, sizing.BudgetSlots);
+        Assert.Equal(SharpInference.Pipeline.MoeCacheSizingStatus.BudgetExhausted, sizing.Status);
+    }
+
+    [Fact]
+    public void MoeCacheSizing_UnknownExpertSize_FlagsStatus()
+    {
+        // perExpertBytes of 0 stands for "size not measured": Slots falls back to the full expert
+        // count, and only the status lets the caller tell this case apart from Ok.
+        const int totalExperts = 8 * 64;
+        var sizing = SharpInference.Pipeline.MoeCacheSizing.Plan(gpuLayers: 8, numExperts: 64, numActiveExperts: 8,
+            freeVramBytes: 100L << 30, perExpertBytes: 0, reserveBytes: 0);
+        Assert.Equal(totalExperts, sizing.Slots);
+        Assert.Equal(SharpInference.Pipeline.MoeCacheSizingStatus.UnknownExpertSize, sizing.Status);
+    }
 }