diff --git a/README.md b/README.md index d8cca13..c72eb94 100644 --- a/README.md +++ b/README.md @@ -37,14 +37,14 @@ matching chat template). | OLMoE 1B-7B Instruct (MoE) | [allenai](https://huggingface.co/allenai/OLMoE-1B-7B-0924-Instruct-GGUF) | 4 GB | CPU | 21.6 | 55.7 | 64 experts / 8 active; per-channel QK-norm; `norm_topk_prob=false` | | OLMoE 1B-7B Instruct (MoE) | (same) | 4 GB | Vulkan `-g -1` | 18.9 | **121.2** | 16 layers all on VRAM; greedy on this prompt is unstable across backends — use `--temp 0.6 --top-p 0.95` for usable output | | OLMoE 1B-7B Instruct (MoE) | (same) | 4 GB | **CUDA** `-g -1` | **117.4** | **111.7** | Same; greedy varies, sampling coherent | -| Qwen3-Coder 30B-A3B (MoE) | [Qwen](https://huggingface.co/Qwen/Qwen3-Coder-30B-A3B-Instruct-GGUF) | 17 GB | CPU | 15.1 | 21.2 | 128 experts / 8 active | -| Qwen3-Coder 30B-A3B (MoE) | (same) | 17 GB | CPU `--tq` | 12.0 | 21.1 | 3-bit KV. FastScan K + V kernels (issue #34) keep attention cost bounded as context grows: **15.5 t/s decode @ 3.2K ctx** (27 % slowdown for ~27× context growth); without FastScan the per-block K+V path would drop this to ~13 t/s | -| Qwen3-Coder 30B-A3B (MoE) | (same) | 17 GB | Vulkan `-g -1` (hybrid) | 1.0 | 5.8 | 29 GPU + 19 CPU layers, SLRU expert slot cache | -| Qwen3-Coder 30B-A3B (MoE) | (same) | 17 GB | **CUDA** `-g -1` (hybrid) | **13.9** | **22.7** | 29 GPU + 19 CPU layers (auto), ~2.2× Vulkan decode | -| Llama-4 Scout 17B-16E (MoE) | [meta-llama](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct) | 61 GB | CPU | 1.9 | 3.9 | 48 layers, 17B active params; split GGUF (Q4_K_M) | -| Llama-4 Scout 17B-16E (MoE) | (same) | 61 GB | CUDA `-g -1` (hybrid) | 0.9 | 2.1 | 7 GPU + 41 CPU layers — model dwarfs the 12 GB card, PCIe cost > GPU speedup so CPU-only wins here | -| Qwen3.6-35B-A3B (GDN+MoE) | [unsloth](https://huggingface.co/unsloth/Qwen3.6-35B-A3B-GGUF) | 22 GB | CPU | 4.3 | 7.8 | hybrid GDN/attn, 256 experts / 8 active | -| Qwen3.6-35B-A3B 
(GDN+MoE) | (same) | 22 GB | **CUDA** `-g -1` (hybrid) | **11.2** | **23.8** | 10 attn + 30 GDN on GPU; MoE auto-routed to CPU, batched-expert dispatch (8 experts × 3 ops into 2 Parallel.For sweeps), shared expert kept on GPU and overlapped with the CPU routed loop | +| Qwen3-Coder 30B-A3B (MoE) | [Qwen](https://huggingface.co/Qwen/Qwen3-Coder-30B-A3B-Instruct-GGUF) | 17 GB | CPU | 13.3 | 21.1 | 128 experts / 8 active | +| Qwen3-Coder 30B-A3B (MoE) | (same) | 17 GB | CPU `--tq` | 13.7 | 21.0 | 3-bit KV. FastScan K + V kernels (issue #34) keep attention cost bounded as context grows: **15.5 t/s decode @ 3.2K ctx** (27 % slowdown for ~27× context growth); without FastScan the per-block K+V path would drop this to ~13 t/s | +| Qwen3-Coder 30B-A3B (MoE) | (same) | 17 GB | Vulkan `-g -1` (hybrid) | 1.1 | 5.3 | 29 GPU + 19 CPU layers, SLRU expert slot cache. Next-layer predictive prefetch (PR #77 / issue #50) on by default; no-op until the cache is under pressure — disable with `--no-moe-predict-prefetch` | +| Qwen3-Coder 30B-A3B (MoE) | (same) | 17 GB | **CUDA** `-g -1` (hybrid) | 10.6 | 22.2 | 29 GPU + 19 CPU layers; routed experts stream through `CudaExpertSlotManager` SLRU (2220 / 3712 slots) instead of the prior eager whole-layer upload (PR #77 / issue #72). Decode is at parity with the eager baseline (22.7 → 22.2 within noise); prefill drops 13.9 → 10.6 because the first-pass loads experts on demand. Cache budget is set to actually-fit-VRAM (counts the buffer pool's power-of-two round-up — earlier 3043-slot planning would have hit `cudaMalloc` failure once a working set filled past ~2200 unique experts). 
Set `SHARPI_EXPERT_STATS=path` to inspect per-layer hit rates | +| Llama-4 Scout 17B-16E (MoE) | [meta-llama](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct) | 61 GB | CPU | 2.1 | 4.3 | 48 layers, 17B active params; split GGUF (Q4_K_M) | +| Llama-4 Scout 17B-16E (MoE) | (same) | 61 GB | CUDA `-g -1` (hybrid) | 1.2 | 2.6 | 7 GPU + 41 CPU layers — model still dwarfs the 12 GB card so CPU-only wins, but per-expert SLRU streaming (PR #77 / issue #72) lifts decode 2.1 → 2.6 (+24 %) and prefill 0.9 → 1.2 (+33 %) over the prior eager whole-layer upload | +| Qwen3.6-35B-A3B (GDN+MoE) | [unsloth](https://huggingface.co/unsloth/Qwen3.6-35B-A3B-GGUF) | 22 GB | CPU | 6.7 | 8.5 | hybrid GDN/attn, 256 experts / 8 active | +| Qwen3.6-35B-A3B (GDN+MoE) | (same) | 22 GB | **CUDA** `-g -1` (hybrid) | **14.7** | **23.2** | 10 attn + 30 GDN on GPU; MoE auto-routed to CPU, batched-expert dispatch (8 experts × 3 ops into 2 Parallel.For sweeps), shared expert kept on GPU and overlapped with the CPU routed loop | | Qwen3.6-27B-MTP (GDN) | [unsloth](https://huggingface.co/unsloth/Qwen3.6-27B-MTP-GGUF) | 16 GB | CPU `--no-thinking` | 2.8 | **3.8** | dense 27B, hybrid GDN/attn, native MTP head; auto-engages MTP self-spec (issue #25) at greedy + `--no-thinking`. 95% draft acceptance (38/40); batched N=2 verify (#30) + fused Q6_K·Q8_K 2-input dot (#42) lift decode from 2.7 (sequential N=1) to 3.8 — 1.4× over MTP-off baseline | | Qwen3.6-27B-MTP (GDN) | (same) | 16 GB | **CUDA** `-g -1 --no-thinking` (hybrid) | **5.7** | **10.7** | 20/64 dense FFN layers on GPU (3.3 GB) + GDN + attn KV resident; 44/64 FFN layers on CPU mmap. 95% draft acceptance; batched verify lifts decode from 6.2 to 10.7 (1.73× over MTP-off baseline). The CPU FFN majority batches via `CpuDenseFfn2` and the on-GPU FFN layers now batch via `MatMulN2` (issue #43 — one weight read per row, two outputs). 
Direct-pinned `Download/UploadInto` (#48) and async `_lastHidden` overlap (#49) shave per-layer host stall on the MoE-MTP/dense-FFN-MTP hot path | | Qwen3.6-27B-MTP (GDN) | (same) | 19 GB | CPU `--no-thinking` `Q5_K_M` | 2.5 | **3.5** | Q5_K_M variant, ~10% slower than Q4_K_M as expected from weight bandwidth. 100% draft acceptance (40/40) on this prompt; batched verify lifts decode from 2.4 to 3.5 (1.46×) | @@ -55,6 +55,13 @@ matching chat template). `--backend auto` (default) picks CUDA when available, sizing the GPU/CPU split from VRAM via TierPlanner; falls through to Vulkan only when CUDA isn't present. +MoE expert-cache knobs (`--moe-warmpin`, `--moe-warmpin-after`, +`--no-moe-predict-prefetch`, `--expert-stats`) are CLI-only on the `run` +command; under `SharpInference.Server` the same behaviour is reachable +via the env vars `SHARPI_MOE_WARMPIN`, `SHARPI_MOE_WARMPIN_AFTER`, +`SHARPI_MOE_PREDICT_PREFETCH=0`, `SHARPI_EXPERT_STATS=` set in the +process environment before the server starts. + SnapKV prefill-time KV eviction (issue #51) ships on every backend: CPU `ForwardPass` (#57), CUDA hybrid GDN `CudaHybridGdnForwardPass` (#58), dense CUDA `CudaForwardPass` (#63), and Vulkan `GpuForwardPass` (#64). diff --git a/docs/moe-expert-offloading-research.md b/docs/moe-expert-offloading-research.md new file mode 100644 index 0000000..928fdd5 --- /dev/null +++ b/docs/moe-expert-offloading-research.md @@ -0,0 +1,340 @@ +# MoE & Expert Offloading: State of the Art vs. SharpInference + +> Research note, 2026-05. Surveys the current literature on Mixture-of-Experts +> (MoE) inference and expert offloading, audits what SharpInference does today +> (with file references), and identifies concrete gaps and opportunities tailored +> to our target deployment (single-user desktop, ~12 GB VRAM e.g. RTX 4070 Ti, +> NativeAOT, GGUF). + +> **Status update (PR #77, 2026-05-29).** The sections below were written before +> the implementation work in this PR landed. 
Reflecting what now ships, several +> "open" items in §1 (TL;DR), §2.2 (CUDA non-GDN hybrid), §4 (gap analysis), and +> §5 (recommendations) are now closed: +> +> - **§5 P0 — non-GDN CUDA per-expert SLRU.** `CudaHybridForwardPass` now streams +> routed experts through `CudaExpertSlotManager.GetOrLoad`. The eager +> `_gpuWGateExps`/`_gpuWUpExps`/`_gpuWDownExps` arrays referenced in §2.2 no +> longer exist (commit `2e48a29`). +> - **§5 P1 (eviction half) — activation-aware caching.** `SlruCache` now picks +> victims by access frequency (`SlruCache.SelectProbationaryVictim`) and the +> slot managers support opt-in warm-pinning via `SHARPI_MOE_WARMPIN` +> (commit `b6b2763`). +> - **§4 gap on predictive prefetch (Vulkan).** `ExpertRoutePredictor` records +> each layer's selection and prefetches the next GPU MoE layer's likely experts +> a layer ahead (commit `757976d`). +> - **Cache sizing.** `MoeCacheSizing.Plan` is the routing-locality-aware sizer +> recommended in §5 P2 (model-aware cache policy; commit `7b70c76`). +> +> Still open: the §5 P1 Q5_K-→-F32 dequant on the Vulkan `ExpertSlotManager` +> upload path; the §5 P2 CPU-peer / KTransformers-AMX direction; the predictive +> prefetch on the CUDA hybrid path (Vulkan-only today). + +--- + +## 1. TL;DR + +SharpInference already has a **solid offloading skeleton** that maps onto several +SOTA ideas — an SLRU expert cache, an async prefetcher, CPU-fallback compute on +cache miss (the core "Fiddler" trick), and per-expert access profiling. That +puts us ahead of naive "swap on demand" systems. + +The implementation is uneven across paths. The audit found: + +1. **Per-expert SLRU offloading exists on two of three hybrid paths, but not the + third.** The **GDN CUDA path** (`CudaHybridGdnForwardPass`, used for + qwen35moe) and the **Vulkan path** (`HybridForwardPass`) both stream experts + through the SLRU cache (`CudaExpertSlotManager.GetOrLoad` / + `ExpertSlotManager.TryGetCached`) with CPU-fallback compute on miss. 
But the + **non-GDN CUDA hybrid path** (`CudaHybridForwardPass`, used for Mixtral / + Qwen3-30B-A3B / Qwen3-Coder when they don't fit VRAM) does **whole-layer** + offload only: every expert of a GPU-tier layer is uploaded resident, and its + `_expertSlotManager`/`_prefetcher` fields are declared but never assigned (dead + code). So for the big non-GDN MoEs, a "GPU layer" must hold its *entire* expert + set in VRAM — there is no per-expert streaming, only the coarse CPU-layer / + GPU-layer split that `TierPlanner` decides. +2. **Cached experts are quantized — except Q5_K in two spots.** Good news first: + experts are cached in native quant (Q4_K/Q6_K everywhere; Q5_K too on + `CudaExpertSlotManager`), so we are *not* generally paying an F32 premium. But + the **Vulkan `ExpertSlotManager`** and the non-GDN CUDA resident path + **dequantize Q5_K to F32** (`ExpertSlotManager.cs:156`, + `CudaHybridForwardPass.cs:1428`). qwen35moe stores `ffn_down_exps` as Q5_K, so + on Vulkan every cached down-projection expert is 4 B/element — ~5.8× its 5.5-bit source. + `CudaExpertSlotManager` already keeps Q5_K raw (`UploadRaw`); the other two + paths just need to mirror it. +3. **Prefetching is reactive, not predictive.** The Vulkan path re-enqueues the + experts the router *just* selected, betting the next token reuses them at the + same layer (1-token, same-layer temporal locality). Every SOTA system instead + predicts the *next layer's* or *next token's* experts ahead of time. *(Already + tracked as issue #50 — pre-gated / PreScope-style predictive prefetch.)* +4. **Caching is recency-only; the profiler is diagnostic-only.** The SLRU evicts + by recency. `ExpertAccessProfiler` tracks per-expert hit/miss and prints stats + (`CudaHybridGdnForwardPass` dump), but nothing feeds hotness back into eviction + priority or warm-pins hot experts at load. `TierPlanner` places layers by + footprint, not access frequency. 
+ +The highest-leverage, lowest-risk wins for our use case are: **(a)** bring the +non-GDN CUDA hybrid path to per-expert SLRU parity with the GDN path (so big +non-GDN MoEs fit in less VRAM), **(b)** stop dequantizing Q5_K experts on the +Vulkan/resident paths, **(c)** add next-layer expert *prediction* to drive the +prefetcher (#50), **(d)** make eviction/placement activation-aware using the +profiler we already built, and **(e)** a fast CPU expert GEMM (KTransformers-style, +related to #54). Details and priorities in §5. + +--- + +## 2. What SharpInference does today + +Verified against the source on this branch. + +### 2.1 MoE model support +- Architecture/hparam detection in `src/SharpInference.Core/ModelGraph.cs` and + `ModelHyperparams.cs` (`IsMoE`, `NumExperts`, `NumActiveExperts`, + `ExpertIntermediateDim`, `HasSharedExpert`, `NormalizeMoeTopKWeights`, + `UseSigmoidGating`). +- Covers Mixtral (top-2), Qwen3-MoE / qwen35moe (256 experts, top-8, shared + expert, GDN-hybrid), OLMoE (64 experts, top-8), DeepSeek-V2 family, Llama4-style MoE. +- GGUF stores experts as packed per-layer tensors + (`blk.{L}.ffn_{gate,up,down}_exps.weight`) plus optional shared-expert and + router (`ffn_gate_inp`) tensors; loaded zero-copy via mmap. + +### 2.2 Forward-pass MoE +- **CPU** (`ForwardPass.cs`): router GEMV → softmax/sigmoid → `SelectTopK` → + optional shared expert → sparse routed experts via pointer-sliced mmap weights, + SIMD `MatVec`. Solid, correct, the reference path. +- **Vulkan hybrid** (`HybridForwardPass.GpuMoeFfn`, ~line 1293): router on GPU, + top-k on CPU, then **per-selected-expert SLRU cache lookup** + (`_expertSlotManager.TryGetCached`). **Cache miss → compute that expert on the + CPU while the GPU is idle** (`GpuMoeFfnCpuFallback`), accumulate, upload, GPU + `AddInPlace`. This is the Fiddler idea (compute on CPU rather than block on a + transfer) and is genuinely good. +- **CUDA GDN hybrid** (`CudaHybridGdnForwardPass.cs`, for qwen35moe): the most + developed path. 
Experts served by `CudaExpertSlotManager` SLRU + (`GetOrLoad`, line ~2257), keeps Q4_K/Q5_K/Q6_K quantized, has a CPU-MoE mode + (`SHARPI_CPU_MOE=1`), and dumps `ExpertAccessProfiler` stats on dispose. +- **CUDA non-GDN hybrid** (`CudaHybridForwardPass.cs`, for Mixtral / Qwen3-MoE / + Qwen3-Coder too big for VRAM): GPU-tier layers upload **all** experts to VRAM as + `Tensor[][] _gpuWGateExps/...` (line ~297) and index them directly (line ~1348); + CPU-tier layers compute on CPU (`CpuMoeFfn`). Offload granularity is the whole + layer (`TierPlanner` split) — there is **no per-expert SLRU streaming** here. The + `_expertSlotManager`/`_prefetcher` fields (lines 108–109) are declared and + disposed but **never assigned** → the dynamic cache path is dead on this path. + Experts are kept in native quant for Q4_K/Q6_K (line ~1418) but Q5_K is + dequantized to F32 (line ~1428). + +### 2.3 Offloading infrastructure (`SharpInference.Pipeline` + Engine) +- `SlruCache` — segmented LRU, 25% probationary / 75% protected, evicts + probationary tail. `ExpertCache` wraps it keyed by `(layer, expertId)`. +- `ExpertSlotManager` / `CudaExpertSlotManager` — VRAM expert slot cache; + `TryGetCached`/`GetOrLoad`, `Preload`, eviction callback frees GPU tensors. + Keeps experts in native quant — **except** the Vulkan `ExpertSlotManager` + dequantizes Q5_K (and exotic dtypes) to F32 (`ExpertSlotManager.cs:156`), while + `CudaExpertSlotManager` keeps Q4_K/Q5_K/Q6_K all raw (`UploadRaw`, line ~162). +- `MoEPrefetcher` — bounded channel + background worker calling + `slotManager.Preload`. Drops oldest when full. Wired **only** in the Vulkan + path, and only with `EnqueuePrefetch(layer, selectedExperts)` — i.e. the + experts already selected for the current layer/token. +- `ExpertAccessProfiler` — per-`(layer,expert)` hit/miss counters, `OverallHitRate`, + `GetTopExperts`. Diagnostic only; not consumed by placement or eviction. 
+- `TierPlanner` — greedy layer placement by **footprint** + KV budget. Not + access-aware. +- `MemoryHierarchy` — 3-tier (VRAM → pinned RAM → NVMe) design; L3/NVMe + + io_uring is a stub (`NotImplementedYet`). + +--- + +## 3. State of the art (2024–2026) + +Expert offloading exists because MoE activates only k-of-N experts per token, so +most expert weights can live in slow memory (host RAM / SSD / CPU) and only the +active few need to be in fast memory (VRAM). The whole game is **hiding the cost +of getting the right experts into fast memory in time**, or avoiding the move +entirely. The literature attacks this along six axes. + +### Axis A — Static placement / partitioning +Decide once, offline, what lives where. +- **KTransformers** (SOSP'25) — partition by *arithmetic intensity*: attention + and frequently-used experts on GPU, the rest computed on CPU with + highly-optimized kernels (AMX / AVX-512, llamafile-style sgemm). Reports + 1.25–1.93× over llama.cpp, much more on quantized models; runs DeepSeek-R1/V3 + (671B) on a single 24 GB GPU + big DRAM. Key lesson: **CPU expert compute is a + first-class path, not just a fallback** — with good kernels you don't move the + weights at all. +- **llama.cpp** `--cpu-moe` / `--n-cpu-moe` / `-ot "exps=CPU"` — the practical + baseline: keep attention + shared experts (always active) on GPU, routed + experts on CPU. This is essentially what our CUDA path does statically. +- **Local Routing Consistency** ("Not All Models Suit Expert Offloading", 2505.16056) + — *which* models even benefit from caching. Metrics SRP and SCH over 20 MoE + LLMs: models that put MoE on **every** layer and use **no shared expert** have + the highest locality (best cache hit rates); shared experts and dense-then-MoE + layouts hurt locality. Most models do well with a cache ≈ **2× the active + expert count**. Directly relevant to choosing cache sizes per model. + +### Axis B — Caching policy +What to keep resident and what to evict. 
+- LRU/LFU/SLRU (what we have) vs. **activation-aware** caches. +- **MoE-Infinity** (2401.14361) — sequence-level activation *tracing* to capture + temporal locality, then prioritize caching experts by predicted activation + ratio; 4–20× latency reduction vs. baselines. +- **HybriMoE** (2504.05897) — *score-based* caching + dynamic intra-layer + CPU/GPU scheduling, built on KTransformers; handles expert-activation + instability across tokens. + +### Axis C — Predictive prefetching (the big one) +Move experts *before* you need them, overlapping I/O with compute. +- **Pre-gated MoE** (ISCA'24) — add a "pre-gate" so layer L computes layer L+1's + expert selection, giving a full layer of prefetch lead time. Algorithm+system + co-design. +- **Cross-Layer Gate / "Fate"** (2502.12224) — predict future-layer experts from + *current* layer's gate inputs; offloading system with prefetch + caching + + quantization, tuned for edge/memory budgets. +- **ProMoE** (2410.22134) — proactive caching that predicts and preloads expert + usage to cut cache misses, separating prefill/decode behavior. +- **AdapMoE** (2408.10284), **fMoE** (2502.05370), **ExpertFlow** (2410.17954) — + sensitivity-based gating, fine-grained prefetch+cache, and predictive + routing-path offload with token reordering (up to 93% VRAM savings, 2–10×). + +### Axis D — Speculation-driven offloading +Use a draft/speculative process to predict experts many tokens ahead. +- **MoE-SpeQ** (2511.14102) — small on-device draft model predicts the *sequence* + of experts for future tokens; a runtime orchestrator prefetches them from host + memory to overlap I/O with compute. Introduces an "Amortization Roofline Model" + to tune the speculation window for throughput. +- **SP-MoE** (2510.10302), **MoE-SpAc** (2603.09983) — speculative decoding + + prefetch co-design; speculation doubles as a memory-management signal. 
+- **OD-MoE** reports up to **99.94%** expert-activation prediction with shadow + networks; single-layer lookahead alone gives ~84–91%. + +### Axis E — Mixed-precision / compression of experts +Don't pay full precision for cold experts. +- **HOBBIT** (2411.01433) — mixed-precision expert offloading: load *less + important* experts at lower precision (cheaper transfer + less VRAM), critical + experts at full precision; token-/layer-/sequence-level prefetch + caching. + Built on llama.cpp; significant speedups with negligible quality loss. **Most + directly relevant to our F32-cache problem.** +- **PreMoe** (2505.17639) — probabilistic expert *pruning* + task-adaptive + retrieval to fit big MoEs in constrained memory. + +### Axis F — Cache-miss tolerance & batching +- **BuddyMoE** (2511.10054) — on a cache miss, substitute a *redundant/similar* + expert already resident rather than stalling, exploiting expert redundancy. +- **ExpertFlow** token reordering / expert buffering ("Towards MoE Deployment", + 2303.06182) — reorder tokens so a batch activates fewer distinct experts. + (Most relevant once we have continuous batching for MoE, which we don't yet.) + +--- + +## 4. Gap analysis + +| SOTA capability | SharpInference today | Gap | +|---|---|---| +| Per-expert offloading across all backends | Vulkan + CUDA-GDN: per-expert SLRU. CUDA non-GDN: whole-layer offload, all experts resident. | Non-GDN CUDA MoE (Mixtral/Qwen3-30B-A3B/Coder) can't stream experts → must fit a layer's full expert set in VRAM. | +| Mixed-precision / quantized cache (HOBBIT) | Experts cached in native quant — **except Q5_K → F32** on Vulkan SLRU + non-GDN CUDA resident path | Q5_K (qwen35moe `ffn_down_exps`) costs 4 B/elem on Vulkan. No *down*-quantization of cold experts (true HOBBIT). | +| Activation-aware caching (MoE-Infinity, HybriMoE) | SLRU (recency only) + diagnostic-only `ExpertAccessProfiler` | Profiler doesn't drive eviction/placement or warm-pin hot experts. SLRU ≠ frequency-aware. 
| +| **Predictive prefetch** (Pre-gated, Cross-Layer Gate, ProMoE) | Reactive 1-token, **same-layer** re-enqueue (Vulkan) | No next-layer/next-token prediction. *Tracked: #50.* | +| Cache-miss CPU compute (Fiddler) | CPU-fallback compute on Vulkan + (via `SHARPI_CPU_MOE`) GDN paths | *Per-dispatch* CPU/GPU decision tracked as #54. | +| Speculative expert prefetch (MoE-SpeQ, SP-MoE) | speculative *decoding* (MTP) exists, not used for expert prediction | Not started; natural extension of #50 once draft accepts are available. | +| Fast CPU expert GEMM (KTransformers AMX) | SIMD `MatVec` (GEMV) CPU fallback | No AMX / blocked sgemm; CPU treated as fallback, not a peer compute tier (related to #54). | +| Cache-miss substitution (BuddyMoE) | block on CPU compute instead | Fine for single-user; no redundancy reuse. | +| Model-aware cache sizing (Local Routing Consistency) | fixed slot capacity | No per-model locality measurement; could size cache ≈2× active experts, skip caching for low-locality models. | +| L3 NVMe tier / io_uring | designed, stubbed | Not shipped; only matters for models > host RAM. | +| Batched MoE + token reorder (ExpertFlow) | MoE batching disabled | Out of scope for single-user target. | + +--- + +## 5. Recommendations (prioritized for our target) + +Ordered by leverage/effort for the single-user desktop, ~12 GB VRAM, GGUF case. +Most of these reuse infrastructure we already have. + +### P0 — Bring the non-GDN CUDA hybrid path to per-expert SLRU parity +Wire `CudaExpertSlotManager` + `MoEPrefetcher` + CPU-fallback into +`CudaHybridForwardPass` (the fields are already there, just unassigned) so it +streams experts per-token like the GDN path (`CudaHybridGdnForwardPass`) and the +Vulkan path — instead of forcing every expert of a GPU-tier layer to be resident. +Today a non-GDN MoE bigger than VRAM (Mixtral, Qwen3-30B-A3B, Qwen3-Coder-30B) can +only offload at whole-layer granularity, wasting VRAM on cold experts in the +GPU-tier layers. 
The GDN path is the reference implementation to mirror. +*Risk:* medium — but the exact pattern already exists in-repo to copy. + +### P1 — Stop dequantizing Q5_K experts (Vulkan + non-GDN CUDA resident path) +`CudaExpertSlotManager` already keeps Q5_K raw via `UploadRaw`; mirror that in the +Vulkan `ExpertSlotManager` (`ExpertSlotManager.cs:156`) and the +`CudaHybridForwardPass` resident upload (`:1428`). qwen35moe's `ffn_down_exps` is +Q5_K, so this inflates the cached down-proj footprint ~5.8× on Vulkan today. Cheap, local, +high-certainty. +*Risk:* low — Q5_K dequant-in-matmul kernels already exist. + +### P1 — Predictive prefetch (Cross-Layer Gate / Pre-gated) — *issue #50* +Already filed. Replace the same-layer 1-token re-enqueue with **next-layer +prediction**: run the *next* MoE layer's router on the *current* hidden state (an +`embDim×numExperts` GEMV — tiny) to prefetch a layer ahead. ~84–91% accuracy for +single-layer lookahead is plenty to convert blocking misses into overlapped loads. +Biggest latency win in the offloaded regime; purely a prefetch hint. + +### P1 — Feed the profiler into eviction & warm-pinning +We built `ExpertAccessProfiler` but only print it. Use it two ways: **(1)** at +load, warm the cache / pin the top-N experts per layer (KTransformers/MoE-Infinity +"hot experts on GPU"); **(2)** bias SLRU so high-frequency experts resist eviction +(frequency-aware, not pure recency). +*Risk:* low. + +### P2 — Treat CPU as a compute peer (KTransformers-style) — *relates to #54* +Add a blocked/multi-threaded expert GEMM (and explore AVX-512/AMX where available) +so CPU-resident experts are computed cheaply rather than something we always try +to avoid by moving to GPU. Pairs naturally with the CPU-fallback dispatch policy +in #54 — it makes "compute on CPU" the *intended* steady state for cold experts, +à la KTransformers, rather than a stall. +*Risk:* medium (kernel work), high payoff for big-model offload. 
+ +### P2 — Model-aware cache policy (Local Routing Consistency) +Measure/lookup per-model routing locality and size the expert cache accordingly +(≈2× active experts is a good default), and detect low-locality models (shared +expert + dense-then-MoE) where caching helps little — pin shared experts, don't +over-invest cache slots on routed ones. +*Risk:* low. + +### P3 — Speculative expert prefetch (MoE-SpeQ) — *extends #50* +We already have speculative decoding (MTP). Reuse its drafted tokens to prefetch +the experts those tokens will need, several tokens ahead — a natural extension of +the #50 predictor. Big win only in the heavily-offloaded regime; defer until the +P0/P1 items land. + +### Out of scope for now +L3/NVMe tier (only matters when model > host RAM), continuous-batching MoE + +token reordering (single-user target), GDN GPU kernels (orthogonal to offloading). + +--- + +## 6. Bottom line + +We're not missing the *concept* of expert offloading — the bones are good, the +GDN CUDA path is genuinely SOTA-aligned (per-expert SLRU, CPU fallback, quantized +cache, profiling), and predictive prefetch (#50) and Fiddler dispatch (#54) are +already on the board. What's missing is *parity and follow-through*: the non-GDN +CUDA path still does coarse whole-layer offload, Q5_K experts get needlessly +expanded to F32 on two paths, and the profiler we built doesn't yet steer caching. +Those are mostly wiring over infrastructure we already have, and they're what +stands between "MoE that fits in VRAM" and "MoE that's bigger than VRAM but still +fast." + +--- + +## 7. 
References + +- KTransformers — CPU/GPU Hybrid Inference for MoE (SOSP'25): https://dl.acm.org/doi/10.1145/3731569.3764843 +- llama.cpp MoE offload guide (`-ot`/`--cpu-moe`): https://huggingface.co/blog/Doctor-Shotgun/llamacpp-moe-offload-guide +- Not All Models Suit Expert Offloading (Local Routing Consistency): https://hf.co/papers/2505.16056 +- MoE-Infinity — Activation-Aware Expert Offloading: https://hf.co/papers/2401.14361 +- HybriMoE — Hybrid CPU-GPU Scheduling & Cache: https://hf.co/papers/2504.05897 +- Pre-gated MoE (ISCA'24): https://www.microsoft.com/en-us/research/wp-content/uploads/2024/05/isca24_pregated_moe_camera_ready.pdf +- Cross-Layer Gate / Fate — Accurate Expert Predictions: https://hf.co/papers/2502.12224 +- ProMoE — Proactive Caching: https://hf.co/papers/2410.22134 +- AdapMoE — Sensitivity-based Gating/Management: https://hf.co/papers/2408.10284 +- fMoE — Fine-Grained Expert Offloading: https://hf.co/papers/2502.05370 +- ExpertFlow — Predictive Routing + Token Scheduling: https://hf.co/papers/2410.17954 +- HOBBIT — Mixed-Precision Expert Offloading: https://hf.co/papers/2411.01433 +- PreMoe — Expert Pruning & Retrieval: https://hf.co/papers/2505.17639 +- MoE-SpeQ — Speculative Quantized Decoding + Prefetch: https://arxiv.org/abs/2511.14102 +- SP-MoE — Speculative Decoding & Prefetching: https://arxiv.org/pdf/2510.10302 +- BuddyMoE — Expert Redundancy for Cache Misses: https://arxiv.org/html/2511.10054v1 +- Towards MoE Deployment (Expert Buffering / token reorder): https://hf.co/papers/2303.06182 diff --git a/scripts/bench-moe-rerun.ps1 b/scripts/bench-moe-rerun.ps1 new file mode 100644 index 0000000..db666b2 --- /dev/null +++ b/scripts/bench-moe-rerun.ps1 @@ -0,0 +1,90 @@ +param( + [int]$NTokens = 80, + [int]$Repeats = 3, + [string]$Prompt = "Write a Python function that sorts a list using the quicksort algorithm:", + [string]$OutJson = "tools\bench\moe-rerun.json" +) + +# Same model paths as scripts/bench-all.ps1 +$moe = 
"models\Qwen3-Coder-30B-A3B-Instruct-Q4_K_M.gguf" + +$cases = @( + @{ Tag = "moe-cpu"; Extra = @(); Timeout = 360 }, + @{ Tag = "moe-cpu-tq"; Extra = @("--tq"); Timeout = 360 }, + @{ Tag = "moe-vulkan-hybrid"; Extra = @("-g","-1","--backend","vulkan"); Timeout = 600 }, + @{ Tag = "moe-cuda-hybrid"; Extra = @("-g","-1","--backend","cuda"); Timeout = 600 } +) + +$all = @() +foreach ($case in $cases) { + for ($i = 1; $i -le $Repeats; $i++) { + $runTag = "$($case.Tag)-r$i" + Write-Host "" + Write-Host "=== $runTag ($($i)/$Repeats) ===" -ForegroundColor Yellow + $r = .\scripts\bench-textgen.ps1 -Model $moe -Tag $runTag -NTokens $NTokens -Prompt $Prompt ` + -TimeoutSec $case.Timeout -ExtraArgs $case.Extra + $r | Add-Member -NotePropertyName Group -NotePropertyValue $case.Tag -Force + $r | Add-Member -NotePropertyName Run -NotePropertyValue $i -Force + $all += $r + Write-Host (" prefill={0} t/s decode={1} t/s wall={2}s" -f $r.PrefillTps, $r.DecodeTps, $r.WallSec) + } +} + +# Aggregate per (group, metric) +function Stats($values) { + if ($values.Count -eq 0) { return @{ Mean = 0.0; Std = 0.0 } } + $mean = ($values | Measure-Object -Average).Average + if ($values.Count -lt 2) { return @{ Mean = [Math]::Round($mean,2); Std = 0.0 } } + $sumsq = 0.0 + foreach ($v in $values) { $sumsq += ($v - $mean) * ($v - $mean) } + $std = [Math]::Sqrt($sumsq / ($values.Count - 1)) + return @{ Mean = [Math]::Round($mean,2); Std = [Math]::Round($std,3) } +} + +$baselines = @{ + "moe-cpu" = @{ Prefill = 14.9; Decode = 21.4 } + "moe-cpu-tq" = @{ Prefill = 12.4; Decode = 21.4 } + "moe-vulkan-hybrid" = @{ Prefill = 1.0; Decode = 5.5 } + "moe-cuda-hybrid" = @{ Prefill = 16.1; Decode = 22.4 } +} + +$summary = @() +foreach ($case in $cases) { + $rows = $all | Where-Object { $_.Group -eq $case.Tag } + $pf = Stats ($rows | ForEach-Object { [double]$_.PrefillTps }) + $dc = Stats ($rows | ForEach-Object { [double]$_.DecodeTps }) + $bp = $baselines[$case.Tag].Prefill + $bd = $baselines[$case.Tag].Decode 
+ # Z-scores: (mean - baseline) / std. Negative => below baseline. + $zp = if ($pf.Std -gt 0) { [Math]::Round(($pf.Mean - $bp) / $pf.Std, 2) } else { 0.0 } + $zd = if ($dc.Std -gt 0) { [Math]::Round(($dc.Mean - $bd) / $dc.Std, 2) } else { 0.0 } + $summary += [PSCustomObject]@{ + Tag = $case.Tag + BasePf = $bp + BaseDc = $bd + PfMean = $pf.Mean + PfStd = $pf.Std + PfZ = $zp + PfDeltaPct = [Math]::Round(100.0 * ($pf.Mean - $bp) / $bp, 1) + DcMean = $dc.Mean + DcStd = $dc.Std + DcZ = $zd + DcDeltaPct = [Math]::Round(100.0 * ($dc.Mean - $bd) / $bd, 1) + } +} + +Write-Host "" +Write-Host "=== Per-run rates ===" -ForegroundColor Cyan +$all | Format-Table Group, Run, PrefillTps, DecodeTps, WallSec, TimedOut -AutoSize + +Write-Host "" +Write-Host "=== Mean ± stddev vs baseline ===" -ForegroundColor Cyan +$summary | Format-Table Tag, BasePf, PfMean, PfStd, PfZ, PfDeltaPct, BaseDc, DcMean, DcStd, DcZ, DcDeltaPct -AutoSize + +$payload = [PSCustomObject]@{ + Runs = $all + Summary = $summary +} +$payload | ConvertTo-Json -Depth 5 | Set-Content $OutJson +Write-Host "" +Write-Host "Results written to $OutJson" -ForegroundColor DarkGray diff --git a/src/SharpInference.Cli/RunCommand.cs b/src/SharpInference.Cli/RunCommand.cs index 8b8fa6b..a055d26 100644 --- a/src/SharpInference.Cli/RunCommand.cs +++ b/src/SharpInference.Cli/RunCommand.cs @@ -148,6 +148,28 @@ public sealed class Settings : CommandSettings [Description("Maximum reasoning tokens before forcing . 0 = unlimited (default). Not honored on the speculative-decode path.")] [DefaultValue(0)] public int MaxThinkingTokens { get; init; } + + // ── MoE expert-cache tuning (offloaded MoE models) ── + // Good defaults are automatic: frequency-aware SLRU eviction, VRAM-sized cache, + // and next-layer predictive prefetch are all ON without any flag. These knobs only + // tune/disable that behaviour. Each is also settable via the named env var. 
+ [CommandOption("--no-moe-predict-prefetch")] + [Description("MoE: disable next-layer predictive expert prefetch (Vulkan; on by default). Env: SHARPI_MOE_PREDICT_PREFETCH=0.")] + [DefaultValue(false)] + public bool NoMoePredictPrefetch { get; init; } + + [CommandOption("--moe-warmpin")] + [Description("MoE: also pin the top-N hottest experts per layer into the GPU cache after warmup (default 0 = off; frequency-aware eviction already retains hot experts). Env: SHARPI_MOE_WARMPIN.")] + public int? MoeWarmPin { get; init; } + + [CommandOption("--moe-warmpin-after")] + [Description("MoE: expert accesses to observe before warm-pinning selects the hot set (0 = not set; the engine default of 512 applies). Only used with --moe-warmpin. Env: SHARPI_MOE_WARMPIN_AFTER.")] + [DefaultValue(0L)] + public long MoeWarmPinAfter { get; init; } + + [CommandOption("--expert-stats")] + [Description("MoE: write GPU expert-cache (SLRU) hit-rate stats to this file on exit. Env: SHARPI_EXPERT_STATS.")] + public string? ExpertStatsPath { get; init; } } protected override int Execute(CommandContext context, Settings settings, CancellationToken cancellation) @@ -155,6 +177,19 @@ protected override int Execute(CommandContext context, Settings settings, Cancel if (settings.MinBatchBlas > 0) SimdKernels.MinBatchForBlas = settings.MinBatchBlas; + // MoE expert-cache knobs are read from the environment inside the engine + // (WarmPinConfig / HybridForwardPass / slot-manager dispose). Surface them as + // CLI flags by setting the env var here — before any forward pass is built — + // so an explicit flag overrides, and env-only use still works. + if (settings.MoeWarmPin is int warmPin) // explicitly passed (incl. 
0 to force off); formatted culture-invariantly because the engine parses it as machine data + Environment.SetEnvironmentVariable("SHARPI_MOE_WARMPIN", warmPin.ToString(System.Globalization.CultureInfo.InvariantCulture)); + if (settings.MoeWarmPinAfter != 0) // 0 = flag not passed; negatives are forwarded so WarmPinConfig rejects them with a warning instead of being dropped silently + Environment.SetEnvironmentVariable("SHARPI_MOE_WARMPIN_AFTER", settings.MoeWarmPinAfter.ToString(System.Globalization.CultureInfo.InvariantCulture)); + if (settings.NoMoePredictPrefetch) + Environment.SetEnvironmentVariable("SHARPI_MOE_PREDICT_PREFETCH", "0"); + if (!string.IsNullOrEmpty(settings.ExpertStatsPath)) + Environment.SetEnvironmentVariable("SHARPI_EXPERT_STATS", settings.ExpertStatsPath); + var modelPath = settings.ModelPath; if (modelPath is null) { @@ -275,8 +310,8 @@ protected override int Execute(CommandContext context, Settings settings, Cancel case "": // Auto: pick CUDA when available. CudaForwardPass handles full-offload // (dense + MoE); CudaHybridForwardPass handles partial-offload (dense or - // MoE with eager per-layer expert loading). TQ on CUDA requires - // head_dim ∈ {128, 256}. + // MoE; routed experts stream through the CudaExpertSlotManager SLRU). + // TQ on CUDA requires head_dim ∈ {128, 256}. bool tqHeadDimOk = hp.HeadDim is 128 or 256; wantCuda = (!settings.TurboQuant || tqHeadDimOk) && CudaBackend.IsAvailable(); diff --git a/src/SharpInference.Cuda/CudaBackend.cs b/src/SharpInference.Cuda/CudaBackend.cs index b2187d8..b738e28 100644 --- a/src/SharpInference.Cuda/CudaBackend.cs +++ b/src/SharpInference.Cuda/CudaBackend.cs @@ -16,6 +16,16 @@ namespace SharpInference.Cuda; /// public sealed unsafe class CudaBackend : IComputeBackend, IImageOpsBackend, IDisposable { + /// + /// Round up to the bucket size the buffer pool will + /// actually allocate (next power-of-two, min 64 bytes). Use this when sizing + /// budgets that share VRAM with pooled allocations — the pool's round-up can + /// inflate per-allocation footprint up to ~2× and a budget computed from raw + /// byte sizes will overshoot real capacity. Bypasses the pool entirely when + /// callers pass exact: true to / . 
+ /// + public static nuint RoundUpAllocBytes(nuint byteSize) => GpuBufferPool.RoundUp(byteSize); + private readonly nint _handle; private readonly SgemmPrecision _precision; private readonly int _smVersion; diff --git a/src/SharpInference.Engine/CudaExpertSlotManager.cs b/src/SharpInference.Engine/CudaExpertSlotManager.cs index 1719777..66e29a1 100644 --- a/src/SharpInference.Engine/CudaExpertSlotManager.cs +++ b/src/SharpInference.Engine/CudaExpertSlotManager.cs @@ -33,6 +33,12 @@ public sealed class CudaExpertSlotManager : IDisposable private readonly object _lock = new(); private bool _disposed; + // Opt-in warm-pinning of hot experts (SHARPI_MOE_WARMPIN=N). Disabled by default. + private readonly int _warmPinPerLayer; + private readonly long _warmPinAfter; + private readonly int _pinBudget; + private bool _warmed; + public ExpertAccessProfiler Profiler => _profiler; /// CUDA backend to allocate/free GPU tensors on. @@ -55,7 +61,13 @@ public CudaExpertSlotManager(CudaBackend gpu, GgufModel model, ModelHyperparams _hp = hp; _dtypes = dtypes; _profiler = new ExpertAccessProfiler(hp.NumLayers, hp.NumExperts); - _cache = new ExpertCache(slotCapacity, EvictSlot); + // Frequency-aware eviction: under MoE routing skew, the least-accessed + // probationary expert is a better victim than the strict LRU tail. 
+ _cache = new ExpertCache(slotCapacity, EvictSlot, + frequencyOf: _profiler.GetAccessCount); + _warmPinPerLayer = WarmPinConfig.PerLayer; + _warmPinAfter = WarmPinConfig.AfterAccesses; + _pinBudget = Math.Max(1, slotCapacity / 2); // never pin more than half the cache } /// @@ -86,10 +98,41 @@ public ExpertCudaSlot GetOrLoad(int layer, int expertId) _profiler.RecordMiss(layer, expertId); slot = UploadExpert(layer, expertId); _cache.Put(layer, expertId, slot); + MaybeWarmPin(); return slot; } } + /// + /// Once enough routing history has accumulated, pin the hottest currently-resident + /// experts (top SHARPI_MOE_WARMPIN per layer) into the protected segment so + /// they are never evicted. No-op unless warm-pinning is enabled. Runs once, under + /// the caller's lock. Layers are visited in descending hotness so a tight pin + /// budget protects the layers that route most often, not whatever happens to sit + /// at low indices (matters for hybrid GDN+MoE models where MoE FFN sits at high + /// layer indices). + /// + private void MaybeWarmPin() + { + if (_warmed || _warmPinPerLayer <= 0) return; + if (_profiler.TotalHits + _profiler.TotalMisses < _warmPinAfter) return; + _warmed = true; + var layerOrder = new int[_hp.NumLayers]; + for (int l = 0; l < _hp.NumLayers; l++) layerOrder[l] = l; + Array.Sort(layerOrder, (a, b) => _profiler.GetLayerAccessCount(b).CompareTo(_profiler.GetLayerAccessCount(a))); + int pinned = 0; + foreach (int layer in layerOrder) + { + if (pinned >= _pinBudget) break; + if (_profiler.GetLayerAccessCount(layer) == 0) break; + foreach (int e in _profiler.GetTopExperts(layer, _warmPinPerLayer)) + { + if (pinned >= _pinBudget) break; + if (_cache.Contains(layer, e)) { _cache.Pin(layer, e); pinned++; } + } + } + } + /// /// Pre-load the given expert into the cache if not already present. 
/// diff --git a/src/SharpInference.Engine/CudaHybridForwardPass.cs b/src/SharpInference.Engine/CudaHybridForwardPass.cs index f81a273..6bb4d2b 100644 --- a/src/SharpInference.Engine/CudaHybridForwardPass.cs +++ b/src/SharpInference.Engine/CudaHybridForwardPass.cs @@ -37,12 +37,7 @@ public sealed unsafe class CudaHybridForwardPass : IForwardPass private readonly Tensor[] _gpuAttnNorm, _gpuWq, _gpuWk, _gpuWv, _gpuWo; private readonly Tensor[] _gpuFfnNorm, _gpuWGate, _gpuWUp, _gpuWDown; private readonly Tensor[]? _gpuWGateInp, _gpuWGateShexp, _gpuWUpShexp, _gpuWDownShexp; - // Eager per-expert weights for CUDA GPU layers — every expert is VRAM-resident, - // so the per-token MoE FFN is a straight indexed lookup. Different from the Vulkan - // hybrid's lazy SLRU slot cache: simpler, but the model must fit in VRAM after - // accounting for KV cache + scratch. Sized [nGpuLayers][numExperts]; null when - // not MoE or there are no GPU layers. - private readonly Tensor[][]? _gpuWGateExps, _gpuWUpExps, _gpuWDownExps; + // Attention bias tensors for GPU layers (null when the model has no attention bias). private readonly Tensor[]? _gpuBq, _gpuBk, _gpuBv, _gpuBo; private readonly Tensor[]? _gpuQNorm, _gpuKNorm; private readonly Tensor[] _gpuKCache, _gpuVCache; @@ -99,18 +94,14 @@ public sealed unsafe class CudaHybridForwardPass : IForwardPass private readonly float* _ropeSinTable; private readonly int _ropeHalfDim; - // ── Expert slot cache (for MoE GPU layers with lazy/evictable expert loading) ── - // These fields stay declared for symmetry with HybridForwardPass — CUDA hybrid - // currently refuses MoE+GPU at construction time, so the MoE GPU dispatch below - // is unreachable and these fields are always null. Pragma-suppressed so the - // unused-readonly check doesn't elevate to error under TreatWarningsAsErrors. -#pragma warning disable CS0649 - private readonly ExpertSlotManager? _expertSlotManager; - private readonly MoEPrefetcher? 
_prefetcher; - private readonly Tensor? _gpuFallbackContrib; - private readonly Tensor? _gpuPinnedNorm; -#pragma warning restore CS0649 - // CUDA hybrid has no CPU expert fallback (every expert is VRAM-resident). + // ── Expert slot cache (MoE GPU layers, lazy/evictable expert loading) ── + // Routed experts for GPU-tier MoE layers are streamed through this SLRU cache + // (mirror of CudaHybridGdnForwardPass), rather than every expert being uploaded + // resident. This lets non-GDN MoE models (Mixtral, Qwen3-30B-A3B, Qwen3-Coder) + // run with more layers on the GPU than the full expert footprint would allow. + // Null when not MoE or there are no GPU layers. Loads are synchronous on miss + // (no prefetcher): the GDN path established this is fast enough for k=8 decode. + private readonly CudaExpertSlotManager? _expertSlotManager; public int MaxSeqLen => _maxSeqLen; public LayerPlacement Placement => _placement; @@ -239,9 +230,6 @@ void TraceVram(string label) _gpuFfnNorm = new Tensor[_nGpuLayers]; _gpuWGate = new Tensor[_nGpuLayers]; _gpuWUp = new Tensor[_nGpuLayers]; _gpuWDown = new Tensor[_nGpuLayers]; _gpuWGateInp = _isMoE ? new Tensor[_nGpuLayers] : null; - _gpuWGateExps = _isMoE ? new Tensor[_nGpuLayers][] : null; - _gpuWUpExps = _isMoE ? new Tensor[_nGpuLayers][] : null; - _gpuWDownExps = _isMoE ? new Tensor[_nGpuLayers][] : null; _gpuWGateShexp = _isMoE && _hasSharedExpert ? new Tensor[_nGpuLayers] : null; _gpuWUpShexp = _isMoE && _hasSharedExpert ? new Tensor[_nGpuLayers] : null; _gpuWDownShexp = _isMoE && _hasSharedExpert ? new Tensor[_nGpuLayers] : null; @@ -291,12 +279,9 @@ void TraceVram(string label) if (_isMoE) { _gpuWGateInp![i] = UploadWeight($"blk.{i}.ffn_gate_inp.weight"); - // Eager per-expert upload — every expert is VRAM-resident. Required because - // CUDA hybrid does not yet ship the SLRU slot cache. The TierPlanner is - // expected to size nGpuLayers so the total expert footprint fits. 
- _gpuWGateExps![i] = UploadExpertWeights($"blk.{i}.ffn_gate_exps.weight", _expertDim, _embDim, hp.NumExperts); - _gpuWUpExps![i] = UploadExpertWeights($"blk.{i}.ffn_up_exps.weight", _expertDim, _embDim, hp.NumExperts); - _gpuWDownExps![i] = UploadExpertWeights($"blk.{i}.ffn_down_exps.weight", _embDim, _expertDim, hp.NumExperts); + // Routed experts are NOT uploaded here — they stream through the + // CudaExpertSlotManager SLRU cache (created after this loop). The router + // and shared expert stay resident since they run on every token. if (_hasSharedExpert) { _gpuWGateShexp![i] = UploadWeight($"blk.{i}.ffn_gate_shexp.weight"); @@ -342,12 +327,53 @@ void TraceVram(string label) Console.Error.WriteLine(" done."); TraceVram("after all weight uploads"); - // MoE GPU layers use eager expert loading (all experts VRAM-resident). - // No slot manager / SLRU cache — keep it simple at the cost of slightly more - // VRAM per GPU layer. TierPlanner sizes nGpuLayers so the total expert footprint - // fits the remaining VRAM budget. + // MoE GPU layers stream routed experts through an SLRU cache. Size it from + // the *actual* free VRAM remaining now that attention weights, KV cache and + // scratch are uploaded (cudaMemGetInfo via FreeVramBytes), capped at the full + // GPU-layer expert count. Capping at totalGpuExperts means the cache can never + // hold more than the old eager path did — which TierPlanner already verified + // fits — so there is no new OOM risk; the budget term only *shrinks* capacity + // when VRAM is tight (e.g. the user forced extra GPU layers via -g), enabling + // streaming instead of an OOM. 
if (_isMoE && _nGpuLayers > 0) - Console.Error.WriteLine($"[CudaHybridForwardPass] MoE eager expert load: {hp.NumExperts} experts × {_nGpuLayers} GPU layers (gate+up+down)."); + { + long perExpert = PerExpertBytes(); + long reserve = 512L << 20; // 512 MiB headroom for transient per-GEMM scratch + long free = (long)gpu.FreeVramBytes; + var plan = MoeCacheSizing.Plan(_nGpuLayers, hp.NumExperts, hp.NumActiveExperts, + free, perExpert, reserve); + int totalGpuExperts = _nGpuLayers * hp.NumExperts; + Console.Error.WriteLine( + $"[CudaHybridForwardPass] SLRU expert cache: {plan.Slots} slots / {totalGpuExperts} total " + + $"({hp.NumExperts} experts × {_nGpuLayers} GPU layers, per-expert ≈ {perExpert / 1024} KiB, " + + $"free VRAM ≈ {free / (1024 * 1024)} MiB)."); + switch (plan.Status) + { + case MoeCacheSizingStatus.BudgetExhausted: + // Budget couldn't fit even one expert; capacity was clamped to 1. + // Decode will thrash (~every routed expert misses); louder than the + // BelowRecommended warning because the perf hit is catastrophic. + Console.Error.WriteLine( + "[CudaHybridForwardPass] WARNING: free VRAM cannot fit a single expert; " + + "cache clamped to 1 slot. Every routed expert will miss and stream from CPU. " + + "Reduce -g or use --backend vulkan."); + break; + case MoeCacheSizingStatus.UnknownExpertSize: + Console.Error.WriteLine( + "[CudaHybridForwardPass] WARNING: could not measure per-expert size " + + "(missing blk.0.ffn_*_exps tensor); cache fell back to total. " + + "Will fail at runtime if total VRAM is exceeded."); + break; + case MoeCacheSizingStatus.BelowRecommended: + int pct = plan.RecommendedSlots > 0 ? plan.Slots * 100 / plan.RecommendedSlots : 0; + Console.Error.WriteLine( + $"[CudaHybridForwardPass] WARNING: cache ({plan.Slots}) is {pct}% of the " + + $"routing-locality recommendation (~{plan.RecommendedSlots} = 2× active per layer); " + + $"expert hit rate may suffer. 
Fewer GPU layers (-g) or more VRAM would help."); + break; + } + _expertSlotManager = new CudaExpertSlotManager(gpu, model, hp, plan.Slots, _gpuWeightDTypes); + } // ── Resolve CPU weights (layers nGpuLayers..numLayers-1) ── _cpuHidden = Alloc(_embDim); @@ -1311,10 +1337,9 @@ private void GpuDenseFfn(int layer) private void GpuMoeFfn(int layer) { - // Eager-loaded variant of GpuForwardPass.GpuMoeFfn / CudaForwardPass.MoeFfn: - // every expert is VRAM-resident, so the per-token MoE FFN is a straight - // indexed lookup over (_gpuWGateExps/_gpuWUpExps/_gpuWDownExps)[layer][expertIdx]. - // No slot manager, no CPU fallback, no host-coherent pinned norm copy — CUDA's + // SLRU-streamed variant (mirror of CudaHybridGdnForwardPass.GpuMoeFfn): each + // selected routed expert is fetched via _expertSlotManager.GetOrLoad, which + // returns a cached slot or synchronously uploads-then-caches on miss. CUDA's // implicit stream ordering removes the explicit barrier vocabulary the Vulkan // version needs. 
int numActive = _hp.NumActiveExperts; @@ -1344,9 +1369,10 @@ private void GpuMoeFfn(int layer) { int expertIdx = selectedExperts[i]; float expertWeight = expertWeights[i]; + var slot = _expertSlotManager!.GetOrLoad(layer, expertIdx); - GpuMatMul(_gpuFfnGate, _gpuWGateExps![layer][expertIdx], _gpuNormBuf); - GpuMatMul(_gpuFfnUp, _gpuWUpExps![layer][expertIdx], _gpuNormBuf); + GpuMatMul(_gpuFfnGate, slot.Gate, _gpuNormBuf); + GpuMatMul(_gpuFfnUp, slot.Up, _gpuNormBuf); if (_hp.UseSigmoidGating) { @@ -1355,7 +1381,7 @@ private void GpuMoeFfn(int layer) } _gpu.SiLuMul(_gpuFfnGate, _gpuFfnUp); - GpuMatMul(_gpuMoeExpertOut!, _gpuWDownExps![layer][expertIdx], _gpuFfnGate); + GpuMatMul(_gpuMoeExpertOut!, slot.Down, _gpuFfnGate); if (_hp.UseSigmoidGating) _gpu.AddInPlace(_gpuHidden, _gpuMoeExpertOut!); @@ -1367,16 +1393,30 @@ private void GpuMoeFfn(int layer) _gpu.AddInPlace(_gpuHidden, _gpuMoeSharedOut!); } - // GpuMoeFfnCpuFallback removed for CUDA hybrid: every expert is eagerly resident - // on the GPU, so there's no slot-cache miss to spill to CPU. ExpertMatVec is still - // used by CpuMoeFfn for the all-CPU layer slice. + // Routed-expert weights are uploaded lazily by CudaExpertSlotManager on cache + // miss, not eagerly here. (The shared expert and router stay resident.) - private Tensor[] UploadExpertWeights(string name, int rows, int cols, int expertCount) + /// + /// On-VRAM bytes for one expert's three weight tensors (gate + up + down), used to + /// size the SLRU slot capacity. Mirrors CudaExpertSlotManager's upload accounting: + /// Q4_K/Q5_K/Q6_K are stored raw; other dtypes expand to F32. Each tensor's raw + /// byte size is rounded up to the buffer pool's allocation bucket (power-of-two, + /// min 64 B) — otherwise the planner over-estimates capacity by ~2× since pooled + /// allocations inflate sub-bucket sizes (e.g. 1.05 MiB Q5_K tensor → 2 MiB). 
+ /// + private long PerExpertBytes() { - var tensors = new Tensor[expertCount]; - for (int expertIdx = 0; expertIdx < expertCount; expertIdx++) - tensors[expertIdx] = UploadExpertWeight(name, rows, cols, expertIdx); - return tensors; + long Bytes(string name, int rows, int cols) + { + if (_model.FindTensor(name) is not { } info) return 0; + long raw = info.DType is DType.Q4_K or DType.Q5_K or DType.Q6_K + ? (long)rows * (cols / DTypeInfo.BlockSize(info.DType)) * DTypeInfo.BytesPerBlock(info.DType) + : (long)rows * cols * sizeof(float); // F32 (native or dequantized) + return (long)CudaBackend.RoundUpAllocBytes((nuint)raw); + } + return Bytes("blk.0.ffn_gate_exps.weight", _expertDim, _embDim) + + Bytes("blk.0.ffn_up_exps.weight", _expertDim, _embDim) + + Bytes("blk.0.ffn_down_exps.weight", _embDim, _expertDim); } private Tensor UploadTqSignPatterns(int layerIndex) @@ -1391,48 +1431,6 @@ private Tensor UploadTqSignPatterns(int layerIndex) return _gpu.Upload(fullSigns, TensorShape.D1(fullSigns.Length)); } - private Tensor UploadExpertWeight(string name, int rows, int cols, int expertIdx) - { - var info = _model.FindTensor(name) - ?? throw new InvalidOperationException($"Missing tensor: {name}"); - var data = _model.GetTensorData(info); - - // exact=true on every branch: expert weights are session-lifetime. Pool - // round-up across (NumExperts × NumLayers) FFN tensors is the dominant - // VRAM-waste source on MoE models — reclaiming it widens the SLRU slot count. 
- if (info.DType == DType.Float32) - { - int elemOffset = expertIdx * rows * cols; - var floats = MemoryMarshal.Cast(data).Slice(elemOffset, rows * cols); - var result = _gpu.Upload(floats, TensorShape.D1(floats.Length), exact: true); - _gpuWeightDTypes[result.Handle] = DType.Float32; - return result; - } - - int bytesPerRow = (cols / DTypeInfo.BlockSize(info.DType)) - * DTypeInfo.BytesPerBlock(info.DType); - int expertBytes = rows * bytesPerRow; - int byteOffset = expertIdx * expertBytes; - var expertData = data.Slice(byteOffset, expertBytes); - - if (info.DType == DType.Q4_K || info.DType == DType.Q6_K) - { - int floatCount = expertData.Length / 4; - var rawFloats = new float[floatCount]; - expertData.CopyTo(MemoryMarshal.AsBytes(rawFloats.AsSpan())); - var result = _gpu.Upload(rawFloats, TensorShape.D1(floatCount), exact: true); - _gpuWeightDTypes[result.Handle] = info.DType; - return result; - } - - int count = rows * cols; - var f32 = new float[count]; - Dequantize.ToFloat32(expertData, f32, info.DType, count); - var tensor = _gpu.Upload(f32, TensorShape.D1(count), exact: true); - _gpuWeightDTypes[tensor.Handle] = DType.Float32; - return tensor; - } - private static void SelectTopK(ReadOnlySpan logits, int k, Span indices, Span weights, bool normalize) { @@ -1535,13 +1533,7 @@ public void Dispose() if (_isMoE) { _gpu.Free(_gpuWGateInp![i]); - // Eager per-expert tensors: free every slot since there's no slot manager here. - for (int e = 0; e < _hp.NumExperts; e++) - { - _gpu.Free(_gpuWGateExps![i][e]); - _gpu.Free(_gpuWUpExps![i][e]); - _gpu.Free(_gpuWDownExps![i][e]); - } + // Routed-expert tensors are owned by _expertSlotManager (freed in its Dispose). 
if (_hasSharedExpert) { _gpu.Free(_gpuWGateShexp![i]); @@ -1595,9 +1587,26 @@ public void Dispose() if (_cpuDecompBuf != null) NativeMemory.Free(_cpuDecompBuf); _cpuKvCache.Dispose(); _cpuTqKvCache?.Dispose(); - _prefetcher?.Dispose(); - _expertSlotManager?.Dispose(); - if (_gpuFallbackContrib is not null) _gpu.Free(_gpuFallbackContrib); - if (_gpuPinnedNorm is not null) _gpu.Free(_gpuPinnedNorm); + if (_expertSlotManager is not null) + { + // SHARPI_EXPERT_STATS=FILE: dump SLRU hit rate + top experts per layer to FILE + // (parity with CudaHybridGdnForwardPass). + var statsPath = Environment.GetEnvironmentVariable("SHARPI_EXPERT_STATS"); + if (!string.IsNullOrEmpty(statsPath)) + { + // Diagnostic-only: a write failure must never skip the slot manager's + // Dispose below (which frees GPU tensors), so swallow + log. + try + { + using var w = new StreamWriter(statsPath); + _expertSlotManager.Profiler.PrintStats(w); + } + catch (Exception ex) + { + Console.Error.WriteLine($"[CudaHybridForwardPass] Failed to write expert stats to {statsPath}: {ex.Message}"); + } + } + _expertSlotManager.Dispose(); + } } } diff --git a/src/SharpInference.Engine/ExpertSlotManager.cs b/src/SharpInference.Engine/ExpertSlotManager.cs index 05fe2db..038d601 100644 --- a/src/SharpInference.Engine/ExpertSlotManager.cs +++ b/src/SharpInference.Engine/ExpertSlotManager.cs @@ -24,6 +24,12 @@ public sealed class ExpertSlotManager : IDisposable private readonly object _lock = new(); private bool _disposed; + // Opt-in warm-pinning of hot experts (SHARPI_MOE_WARMPIN=N). Disabled by default. + private readonly int _warmPinPerLayer; + private readonly long _warmPinAfter; + private readonly int _pinBudget; + private bool _warmed; + public ExpertAccessProfiler Profiler => _profiler; /// Vulkan backend to allocate/free GPU tensors on. 
@@ -44,18 +50,32 @@ public ExpertSlotManager(VulkanBackend gpu, GgufModel model, ModelHyperparams hp _hp = hp; _dtypes = dtypes; _profiler = new ExpertAccessProfiler(hp.NumLayers, hp.NumExperts); - _cache = new ExpertCache(slotCapacity, EvictSlot); + // Frequency-aware eviction: under MoE routing skew, the least-accessed + // probationary expert is a better victim than the strict LRU tail. + _cache = new ExpertCache(slotCapacity, EvictSlot, + frequencyOf: _profiler.GetAccessCount); + _warmPinPerLayer = WarmPinConfig.PerLayer; + _warmPinAfter = WarmPinConfig.AfterAccesses; + _pinBudget = Math.Max(1, slotCapacity / 2); // never pin more than half the cache } /// /// Return the GPU tensors for the given expert only if they are already cached. /// Does NOT load from disk on miss — use for that. + /// Records the lookup outcome on the profiler so frequency-aware eviction + + /// warm-pinning have real data on the Vulkan path (where the forward pass + /// never calls ). /// Thread-safe. /// public bool TryGetCached(int layer, int expertId, out ExpertGpuSlot slot) { lock (_lock) - return _cache.TryGet(layer, expertId, out slot); + { + bool hit = _cache.TryGet(layer, expertId, out slot); + if (hit) _profiler.RecordHit(layer, expertId); + else _profiler.RecordMiss(layer, expertId); + return hit; + } } /// @@ -75,14 +95,48 @@ public ExpertGpuSlot GetOrLoad(int layer, int expertId) _profiler.RecordMiss(layer, expertId); slot = UploadExpert(layer, expertId); _cache.Put(layer, expertId, slot); + MaybeWarmPin(); return slot; } } + /// + /// Once enough routing history has accumulated, pin the hottest currently-resident + /// experts (top SHARPI_MOE_WARMPIN per layer) into the protected segment so + /// they are never evicted. No-op unless warm-pinning is enabled. Runs once, under + /// the caller's lock. 
Layers are visited in descending hotness so a tight pin + /// budget protects the layers that route most often, not whatever happens to sit + /// at low indices (matters for hybrid GDN+MoE models where MoE FFN sits at high + /// layer indices). + /// + private void MaybeWarmPin() + { + if (_warmed || _warmPinPerLayer <= 0) return; + if (_profiler.TotalHits + _profiler.TotalMisses < _warmPinAfter) return; + _warmed = true; + var layerOrder = new int[_hp.NumLayers]; + for (int l = 0; l < _hp.NumLayers; l++) layerOrder[l] = l; + Array.Sort(layerOrder, (a, b) => _profiler.GetLayerAccessCount(b).CompareTo(_profiler.GetLayerAccessCount(a))); + int pinned = 0; + foreach (int layer in layerOrder) + { + if (pinned >= _pinBudget) break; + // Cold layers contribute nothing; once the sort hits zero we can stop. + if (_profiler.GetLayerAccessCount(layer) == 0) break; + foreach (int e in _profiler.GetTopExperts(layer, _warmPinPerLayer)) + { + if (pinned >= _pinBudget) break; + if (_cache.Contains(layer, e)) { _cache.Pin(layer, e); pinned++; } + } + } + } + /// /// Pre-load the given expert into the cache if not already present. /// Uses so it is safe to call /// from a background thread concurrently with the main recording session. + /// Also runs so warm-pinning fires on the Vulkan + /// path (where is never called by the forward pass). 
/// public void Preload(int layer, int expertId) { @@ -93,6 +147,7 @@ public void Preload(int layer, int expertId) var slot = UploadExpert(layer, expertId, background: true); _cache.Put(layer, expertId, slot); } + MaybeWarmPin(); } } diff --git a/src/SharpInference.Engine/HybridForwardPass.cs b/src/SharpInference.Engine/HybridForwardPass.cs index 66d4cd3..afd618c 100644 --- a/src/SharpInference.Engine/HybridForwardPass.cs +++ b/src/SharpInference.Engine/HybridForwardPass.cs @@ -97,6 +97,30 @@ public sealed unsafe class HybridForwardPass : IForwardPass // ── Expert slot cache (for MoE GPU layers with lazy/evictable expert loading) ── private ExpertSlotManager? _expertSlotManager; private MoEPrefetcher? _prefetcher; + // Next-layer predictive prefetch: records each layer's expert selection and prefetches + // the next GPU MoE layer's likely experts a layer ahead. ON by default (it only makes + // the already-on background prefetch smarter, and is a no-op when experts aren't being + // evicted); disable with SHARPI_MOE_PREDICT_PREFETCH=0 (or --no-moe-predict-prefetch). + private readonly ExpertRoutePredictor? _routePredictor; + private readonly bool _predictPrefetch = ParsePredictPrefetchFlag(); + + private static bool ParsePredictPrefetchFlag() + { + var s = Environment.GetEnvironmentVariable("SHARPI_MOE_PREDICT_PREFETCH"); + if (string.IsNullOrEmpty(s)) return true; // default on + switch (s.Trim().ToLowerInvariant()) + { + case "0": case "false": case "off": case "no": case "disabled": + return false; + case "1": case "true": case "on": case "yes": case "enabled": + return true; + default: // unrecognized value: warn (listing every accepted spelling) and keep the default + Console.Error.WriteLine( + $"[HybridForwardPass] SHARPI_MOE_PREDICT_PREFETCH='{s}' not recognized; defaulting to ON. " + + "Accepted: 1/0, true/false, on/off, yes/no, enabled/disabled (case-insensitive)."); + return true; + } + } // Pinned host-visible GPU tensor for uploading CPU fallback contributions to GPU hidden state. private Tensor? 
_gpuFallbackContrib; // Pinned host-visible GPU tensor for reading the norm buffer on CPU without a separate Download. @@ -330,6 +354,8 @@ public HybridForwardPass(GgufModel model, VulkanBackend gpu, ModelHyperparams hp : totalExperts; _expertSlotManager = new ExpertSlotManager(gpu, model, hp, capacity, _gpuWeightDTypes); _prefetcher = new MoEPrefetcher(_expertSlotManager); + if (_predictPrefetch) + _routePredictor = new ExpertRoutePredictor(_nGpuLayers, hp.NumActiveExperts); _gpuFallbackContrib = gpu.AllocatePinned(TensorShape.D1(_embDim)); _gpuPinnedNorm = gpu.AllocatePinned(TensorShape.D1(_embDim)); Console.Error.WriteLine($"[HybridForwardPass] MoE expert slot cache: {capacity} slots ({hp.NumExperts} experts × {_nGpuLayers} layers), SLRU lazy-load."); @@ -613,6 +639,7 @@ public void ResetCache() _cpuTqKvCache.Reset(); else _cpuKvCache.Reset(); + _routePredictor?.Reset(); } /// @@ -1321,8 +1348,10 @@ private void GpuMoeFfn(int layer) // is idle (between EndRecordAndSubmit and the next BeginRecord). // Their weighted outputs are accumulated in _cpuFallbackBuf and // uploaded to the pre-allocated pinned tensor for GPU AddInPlace. - // Prefetch the same experts for the next token (1-token lookahead). - _prefetcher?.EnqueuePrefetch(layer, selectedExperts); + + // Look up current-layer experts FIRST. TryGetCached promotes hits from + // probationary to protected, so the slots needed for this token are + // safe from the prefetcher's next Preload eviction below. Span isGpu = stackalloc bool[numActive]; // ExpertGpuSlot contains Tensor (managed reference type fields) — heap-allocate. ExpertGpuSlot[] cachedSlots = new ExpertGpuSlot[numActive]; @@ -1334,6 +1363,23 @@ private void GpuMoeFfn(int layer) if (!isGpu[i]) hasCpuFallback = true; } + // Now enqueue prefetches. Order matters: the worker thread may race ahead + // and evict probationary slots — by promoting current-layer slots first + // (above), a next-layer prefetch can't evict what this token needs. 
+ // Same-layer 1-token-ahead prefetch (cheap, always on). + _prefetcher?.EnqueuePrefetch(layer, selectedExperts); + + // ── Next-layer predictive prefetch (on by default; SHARPI_MOE_PREDICT_PREFETCH=0 disables) ── + // Record this layer's selection, then prefetch the *next* GPU MoE layer's likely + // experts (its previous-token selection) so they load while this layer finishes — + // a full layer of lead time. Best-effort: wrong guesses only waste a transfer. + if (_routePredictor is not null) + { + _routePredictor.Record(layer, selectedExperts); + if (layer + 1 < _nGpuLayers && _routePredictor.TryPredict(layer + 1, out var nextExperts)) + _prefetcher?.EnqueuePrefetch(layer + 1, nextExperts); + } + if (hasCpuFallback) { // _gpuPinnedNorm was populated by the GPU session above — map it directly, @@ -1671,7 +1717,26 @@ public void Dispose() _cpuKvCache.Dispose(); _cpuTqKvCache?.Dispose(); _prefetcher?.Dispose(); - _expertSlotManager?.Dispose(); + if (_expertSlotManager is not null) + { + // SHARPI_EXPERT_STATS=FILE: parity with the CUDA hybrid forward passes + // (CudaHybridForwardPass/CudaHybridGdnForwardPass) so the CLI flag works + // on every MoE backend, not just CUDA. 
+ var statsPath = Environment.GetEnvironmentVariable("SHARPI_EXPERT_STATS"); + if (!string.IsNullOrEmpty(statsPath)) + { + try + { + using var w = new StreamWriter(statsPath); + _expertSlotManager.Profiler.PrintStats(w); + } + catch (Exception ex) + { + Console.Error.WriteLine($"[HybridForwardPass] Failed to write expert stats to {statsPath}: {ex.Message}"); + } + } + _expertSlotManager.Dispose(); + } if (_gpuFallbackContrib is not null) _gpu.Free(_gpuFallbackContrib); if (_gpuPinnedNorm is not null) _gpu.Free(_gpuPinnedNorm); } diff --git a/src/SharpInference.Engine/MoEPrefetcher.cs b/src/SharpInference.Engine/MoEPrefetcher.cs index 1eb9f44..30b52c7 100644 --- a/src/SharpInference.Engine/MoEPrefetcher.cs +++ b/src/SharpInference.Engine/MoEPrefetcher.cs @@ -47,7 +47,19 @@ private async Task RunAsync() foreach (int expertId in batch.ExpertIds) { if (_cts.Token.IsCancellationRequested) return; - _slotManager.Preload(batch.Layer, expertId); + try + { + _slotManager.Preload(batch.Layer, expertId); + } + catch (Exception ex) when (ex is not OperationCanceledException) + { + // A single Preload failure (transient cudaMalloc / upload + // fault / stale predicted expert) must not silently kill + // the worker: prefetch is best-effort, but losing it mid-run + // degrades throughput with no log to grep for. + Console.Error.WriteLine( + $"[MoEPrefetcher] Preload(layer={batch.Layer}, expert={expertId}) failed: {ex.Message}"); + } } } } diff --git a/src/SharpInference.Engine/WarmPinConfig.cs b/src/SharpInference.Engine/WarmPinConfig.cs new file mode 100644 index 0000000..6d36df1 --- /dev/null +++ b/src/SharpInference.Engine/WarmPinConfig.cs @@ -0,0 +1,49 @@ +namespace SharpInference.Engine; + +/// +/// Reads the opt-in warm-pinning configuration from the environment, once. +/// +/// SHARPI_MOE_WARMPIN — number of hottest experts to pin per layer +/// into the GPU expert cache's protected segment. 
0 (default) disables +/// warm-pinning entirely, so behaviour is unchanged unless opted in. +/// SHARPI_MOE_WARMPIN_AFTER — number of expert accesses to observe +/// before the warm set is chosen (default 512), so pinning reflects real routing +/// rather than the first few cold tokens. Must be > 0 (any positive value); +/// 0 / negative / malformed are rejected with a stderr message and the default +/// applies. +/// +/// Shared by (Vulkan) and . +/// Malformed values fall back to the default and log a warning to stderr so +/// typos don't silently turn the feature off. +/// +internal static class WarmPinConfig +{ + public static readonly int PerLayer = ParseInt("SHARPI_MOE_WARMPIN", 0, allowZero: true); + public static readonly long AfterAccesses = ParseLong("SHARPI_MOE_WARMPIN_AFTER", 512, allowZero: false); + + private static int ParseInt(string name, int fallback, bool allowZero) + { + var s = Environment.GetEnvironmentVariable(name); + if (string.IsNullOrEmpty(s)) return fallback; + if (!int.TryParse(s, out int v) || v < 0 || (!allowZero && v == 0)) + { + Console.Error.WriteLine( + $"[WarmPinConfig] {name}='{s}' is not a {(allowZero ? "non-negative" : "positive")} integer; using default {fallback}."); + return fallback; + } + return v; + } + + private static long ParseLong(string name, long fallback, bool allowZero) + { + var s = Environment.GetEnvironmentVariable(name); + if (string.IsNullOrEmpty(s)) return fallback; + if (!long.TryParse(s, out long v) || v < 0 || (!allowZero && v == 0)) + { + Console.Error.WriteLine( + $"[WarmPinConfig] {name}='{s}' is not a {(allowZero ? 
"non-negative" : "positive")} integer; using default {fallback}."); + return fallback; + } + return v; + } +} diff --git a/src/SharpInference.Pipeline/ExpertAccessProfiler.cs b/src/SharpInference.Pipeline/ExpertAccessProfiler.cs index 9652b6e..68a7507 100644 --- a/src/SharpInference.Pipeline/ExpertAccessProfiler.cs +++ b/src/SharpInference.Pipeline/ExpertAccessProfiler.cs @@ -63,6 +63,34 @@ public double GetLayerHitRate(int layer) return (hits + misses) == 0 ? 0.0 : (double)hits / (hits + misses); } + /// + /// Total access count (hits + misses) for one expert. Used as the popularity + /// signal for frequency-aware cache eviction. + /// + public long GetAccessCount(int layer, int expertId) + { + int i = layer * _numExperts + expertId; + return Interlocked.Read(ref _hits[i]) + Interlocked.Read(ref _misses[i]); + } + + /// + /// Total access count for a whole layer (sum across experts). Used to rank + /// layers by hotness so warm-pinning budgets the hottest layers first instead + /// of iterating in layer-index order (which biases pins to low-index layers + /// for hybrid GDN+MoE models that cluster MoE FFN at high indices). + /// + public long GetLayerAccessCount(int layer) + { + long total = 0; + int offset = layer * _numExperts; + for (int e = 0; e < _numExperts; e++) + { + total += Interlocked.Read(ref _hits[offset + e]); + total += Interlocked.Read(ref _misses[offset + e]); + } + return total; + } + /// /// Returns the most-accessed expert IDs for , /// sorted descending by total access count (hits + misses). 
@@ -94,7 +122,22 @@ public void PrintStats(TextWriter output) for (int layer = 0; layer < _numLayers; layer++) { - double lr = GetLayerHitRate(layer); + long lh = 0, lm = 0; + int offset = layer * _numExperts; + for (int e = 0; e < _numExperts; e++) + { + lh += Interlocked.Read(ref _hits[offset + e]); + lm += Interlocked.Read(ref _misses[offset + e]); + } + // Layers that never report into the profiler (CPU-resident under hybrid + // offload, or non-MoE layers) have zero counts. Skip the bogus 0.0 % + + // arbitrary "top expert" output that would come from sorting equal zeros. + if (lh + lm == 0) + { + sb.AppendLine($" layer {layer,3}: (no GPU SLRU accesses recorded)"); + continue; + } + double lr = (double)lh / (lh + lm); var top = GetTopExperts(layer, 3); string topStr = string.Join(", ", top); sb.AppendLine($" layer {layer,3}: hit {lr:P1} top experts: [{topStr}]"); diff --git a/src/SharpInference.Pipeline/ExpertCache.cs b/src/SharpInference.Pipeline/ExpertCache.cs index c76b998..f838a53 100644 --- a/src/SharpInference.Pipeline/ExpertCache.cs +++ b/src/SharpInference.Pipeline/ExpertCache.cs @@ -17,17 +17,26 @@ public sealed class ExpertCache : IDisposable /// Optional callback invoked with the evicted value so the caller can release /// any associated GPU resources. /// - public ExpertCache(int capacity, Action? onEvict = null) + /// + /// Optional per-(layer, expertId) access-count accessor. When supplied, eviction + /// becomes frequency-aware (least-accessed probationary expert is evicted first), + /// exploiting MoE routing skew. When null, eviction is plain LRU. + /// + public ExpertCache(int capacity, Action? onEvict = null, + Func? frequencyOf = null) { if (capacity <= 0) throw new ArgumentOutOfRangeException(nameof(capacity)); // Split: 25% probationary, 75% protected — biased toward retention since routing is skewed. 
int probCap = Math.Max(1, capacity / 4); int protCap = Math.Max(1, capacity - probCap); - _slru = new SlruCache<(int, int), T>(probCap, protCap); + Func<(int Layer, int ExpertId), long>? freq = + frequencyOf is null ? null : k => frequencyOf(k.Layer, k.ExpertId); + _slru = new SlruCache<(int, int), T>(probCap, protCap, freq); _onEvict = onEvict; } public int Count => _slru.Count; + public int PinnedCount => _slru.PinnedCount; /// Look up the cached value for (, ). public bool TryGet(int layer, int expertId, out T value) => @@ -47,6 +56,18 @@ public void Put(int layer, int expertId, T value) public bool Contains(int layer, int expertId) => _slru.Contains((layer, expertId)); + /// + /// Pin (, ) so it is never + /// evicted while resident. The expert must already be cached (call + /// first). Use for warm-pinning the hottest experts identified by profiling. + /// + public void Pin(int layer, int expertId) => _slru.Pin((layer, expertId)); + + /// Remove the pin on (, ). + public void Unpin(int layer, int expertId) => _slru.Unpin((layer, expertId)); + + public bool IsPinned(int layer, int expertId) => _slru.IsPinned((layer, expertId)); + /// /// Invoke for every currently-cached value, then clear the cache. /// Use this in dispose paths to release GPU resources without skipping the evict callback. diff --git a/src/SharpInference.Pipeline/ExpertRoutePredictor.cs b/src/SharpInference.Pipeline/ExpertRoutePredictor.cs new file mode 100644 index 0000000..ce1b45a --- /dev/null +++ b/src/SharpInference.Pipeline/ExpertRoutePredictor.cs @@ -0,0 +1,63 @@ +namespace SharpInference.Pipeline; + +/// +/// Predicts which experts an MoE layer will activate for the next token, using the +/// previous token's selection at the same layer. 
MoE routing has strong cross-token +/// temporal locality at a fixed layer (the same premise that makes the SLRU expert +/// cache effective), so last-token selections are a cheap, training-free predictor +/// — the PreScope / MoE-Infinity style of activation-aware prefetching. +/// +/// +/// The win over reacting to the current layer's own router is lead time: while +/// layer L is still computing, the predictor lets the prefetcher start loading layer +/// L+1's likely experts, hiding the PCIe transfer behind compute. Predictions are only +/// ever prefetch hints — a wrong guess wastes a transfer but never affects output, so +/// no accuracy bound is required. +/// +/// +/// Pure CPU state; not thread-safe (call from the single decode thread). +/// +public sealed class ExpertRoutePredictor +{ + private readonly int _numLayers; + private readonly int _maxActive; + private readonly int[] _experts; // [layer * maxActive + k] + private readonly int[] _counts; // valid entries per layer (0 until first seen) + + public ExpertRoutePredictor(int numLayers, int maxActiveExperts) + { + if (numLayers <= 0) throw new ArgumentOutOfRangeException(nameof(numLayers)); + if (maxActiveExperts <= 0) throw new ArgumentOutOfRangeException(nameof(maxActiveExperts)); + _numLayers = numLayers; + _maxActive = maxActiveExperts; + _experts = new int[numLayers * maxActiveExperts]; + _counts = new int[numLayers]; + } + + /// Record the experts a layer actually selected for the current token. + public void Record(int layer, ReadOnlySpan selected) + { + if ((uint)layer >= (uint)_numLayers) return; + int n = Math.Min(selected.Length, _maxActive); + selected[..n].CopyTo(_experts.AsSpan(layer * _maxActive, n)); + _counts[layer] = n; + } + + /// + /// Predict 's experts for the next token (its previous-token + /// selection). Returns false until the layer has been observed at least once. 
+ /// + public bool TryPredict(int layer, out ReadOnlySpan experts) + { + if ((uint)layer >= (uint)_numLayers || _counts[layer] == 0) + { + experts = default; + return false; + } + experts = _experts.AsSpan(layer * _maxActive, _counts[layer]); + return true; + } + + /// Forget all history (call when starting a new sequence / on cache reset). + public void Reset() => Array.Clear(_counts); +} diff --git a/src/SharpInference.Pipeline/MoeCacheSizing.cs b/src/SharpInference.Pipeline/MoeCacheSizing.cs new file mode 100644 index 0000000..966dad7 --- /dev/null +++ b/src/SharpInference.Pipeline/MoeCacheSizing.cs @@ -0,0 +1,84 @@ +namespace SharpInference.Pipeline; + +/// +/// Sizes the GPU expert SLRU cache for a MoE model. Pure, deterministic, and unit-tested +/// so the policy is verifiable without a GPU. +/// +/// +/// Capacity is bounded below by 1 (the cache must function even if the budget cannot fit +/// a single expert — callers detect this case via ) and +/// above by the total GPU-layer expert count; otherwise it is the most the VRAM budget +/// allows. Separately we compute a from the +/// routing-locality finding of "Not All Models Suit Expert Offloading" (arXiv:2505.16056): +/// a cache of roughly 2 × active-experts per layer covers a token segment well. +/// When the budget forces capacity below that, callers should warn that hit rate may +/// suffer (fewer GPU layers or more VRAM would help) rather than silently underperform. +/// +/// +public static class MoeCacheSizing +{ + public static MoeCachePlan Plan( + int gpuLayers, int numExperts, int numActiveExperts, + long freeVramBytes, long perExpertBytes, long reserveBytes) + { + long total = (long)gpuLayers * numExperts; + if (total <= 0) return new MoeCachePlan(0, 0, 0, MoeCacheSizingStatus.Empty); + + MoeCacheSizingStatus status; + long byBudget; + if (perExpertBytes <= 0) + { + // Caller couldn't size an expert (missing tensor, dtype unknown). 
Falling + // back to `total` silently would defeat the planner's purpose; surface the + // condition so the caller can decide whether to abort or proceed unbounded. + byBudget = total; + status = MoeCacheSizingStatus.UnknownExpertSize; + } + else + { + byBudget = Math.Max(0, (freeVramBytes - reserveBytes) / perExpertBytes); + if (byBudget == 0) + status = MoeCacheSizingStatus.BudgetExhausted; + else if (byBudget < (long)gpuLayers * Math.Min(numExperts, 2 * numActiveExperts)) + status = MoeCacheSizingStatus.BelowRecommended; + else + status = MoeCacheSizingStatus.Ok; + } + + // Never exceed the total; keep at least one slot so the cache works. Note: + // when byBudget==0 (BudgetExhausted) the clamp raises capacity to 1; the + // Status enum is the caller's only signal that this happened. + int capacity = (int)Math.Clamp(byBudget, 1, total); + + // Locality sweet spot: ~2× active experts per GPU layer (capped at the full set). + long recommended = Math.Min(total, (long)gpuLayers * Math.Min(numExperts, 2 * numActiveExperts)); + + return new MoeCachePlan(capacity, (int)recommended, (int)Math.Min(byBudget, int.MaxValue), status); + } +} + +/// +/// Outcome categories for . +/// +public enum MoeCacheSizingStatus +{ + /// No GPU layers or no experts — nothing to size. + Empty, + /// Budget exceeds the locality recommendation; cache fits the working set. + Ok, + /// Budget fits the cache but below the routing-locality recommendation. + BelowRecommended, + /// VRAM budget cannot fit even one expert; capacity was clamped to 1. + BudgetExhausted, + /// Per-expert size unknown (missing tensor); capacity fell back to total. + UnknownExpertSize, +} + +/// +/// Result of . is the capacity to +/// use; is the locality-based target (warn if Slots is +/// materially below it); is how many slots the VRAM budget +/// alone would allow (for diagnostics); distinguishes +/// "budget fits cache" from the clamped-to-1 and unknown-expert-size edge cases. 
+/// +public readonly record struct MoeCachePlan(int Slots, int RecommendedSlots, int BudgetSlots, MoeCacheSizingStatus Status); diff --git a/src/SharpInference.Pipeline/SlruCache.cs b/src/SharpInference.Pipeline/SlruCache.cs index 472efb8..6392df1 100644 --- a/src/SharpInference.Pipeline/SlruCache.cs +++ b/src/SharpInference.Pipeline/SlruCache.cs @@ -4,7 +4,21 @@ namespace SharpInference.Pipeline; /// Segmented LRU (SLRU) cache with probationary and protected segments. /// New items enter the probationary segment. Items accessed in probationary /// are promoted to the protected segment, exploiting temporal locality. -/// Eviction always targets the tail of the probationary segment. +/// Eviction targets the probationary segment. +/// +/// Two optional refinements (both default-off, so behaviour is plain SLRU +/// unless configured): +/// +/// Frequency-aware eviction — when a frequencyOf accessor is +/// supplied, the probationary victim is the least-frequently-accessed +/// entry (recency breaks ties), rather than the strict LRU tail. This biases +/// the cache toward keeping hot experts resident under MoE routing skew. The +/// most-recently-inserted entry is never chosen, avoiding the LFU cold-start +/// trap of evicting the item we just loaded. +/// Pinning — pinned keys live in the protected segment and are +/// never evicted or demoted while resident, so a warm set of hot experts can +/// be guaranteed in fast memory. +/// /// public sealed class SlruCache where TKey : notnull { @@ -12,24 +26,33 @@ public sealed class SlruCache where TKey : notnull private readonly int _probCapacity; private readonly int _protCapacity; + private readonly Func? 
_frequencyOf; private readonly LinkedList _prob = new(); private readonly LinkedList _prot = new(); private readonly Dictionary> _probIndex = new(); private readonly Dictionary> _protIndex = new(); + private readonly HashSet _pinned = new(); public int Count => _prob.Count + _prot.Count; public int ProbationaryCount => _prob.Count; public int ProtectedCount => _prot.Count; + public int PinnedCount => _pinned.Count; /// Slots reserved for newly-inserted (cold) items. /// Slots reserved for promoted (hot) items. - public SlruCache(int probationaryCapacity, int protectedCapacity) + /// + /// Optional access-count accessor enabling frequency-aware eviction. When null, + /// eviction is plain LRU (probationary tail). + /// + public SlruCache(int probationaryCapacity, int protectedCapacity, + Func? frequencyOf = null) { if (probationaryCapacity <= 0) throw new ArgumentOutOfRangeException(nameof(probationaryCapacity)); if (protectedCapacity <= 0) throw new ArgumentOutOfRangeException(nameof(protectedCapacity)); _probCapacity = probationaryCapacity; _protCapacity = protectedCapacity; + _frequencyOf = frequencyOf; } /// Look up . Promotes probationary hits to protected. @@ -46,24 +69,44 @@ public bool TryGet(TKey key, out TValue value) if (_probIndex.TryGetValue(key, out var probNode)) { - _prob.Remove(probNode); - _probIndex.Remove(key); + value = probNode.Value.Value; - // If protected is full, demote its tail to the head of probationary. - // The probationary count went down by one (we removed probNode), so - // adding the demoted entry keeps probationary ≤ _probCapacity. - if (_prot.Count >= _protCapacity) + if (_prot.Count < _protCapacity) + { + // Room in protected: promote directly. + _prob.Remove(probNode); + _probIndex.Remove(key); + _prot.AddFirst(probNode.Value); + _protIndex[key] = _prot.First!; + return true; + } + + // Protected is full: demote the tail-most UNPINNED protected entry to + // make room. 
The probationary count goes -1 (remove probNode) +1 (demoted + // in) = unchanged, so it stays ≤ _probCapacity. + var demoteNode = LastUnpinnedProtected(); + if (demoteNode is not null) { - var demoted = _prot.Last!.Value; - _prot.RemoveLast(); + _prob.Remove(probNode); + _probIndex.Remove(key); + + var demoted = demoteNode.Value; + _prot.Remove(demoteNode); _protIndex.Remove(demoted.Key); _prob.AddFirst(demoted); _probIndex[demoted.Key] = _prob.First!; - } - _prot.AddFirst(probNode.Value); - _protIndex[key] = _prot.First!; - value = probNode.Value.Value; + _prot.AddFirst(probNode.Value); + _protIndex[key] = _prot.First!; + } + else + { + // Every protected slot is pinned — cannot promote. Refresh recency + // within probationary instead. + _prob.Remove(probNode); + _prob.AddFirst(probNode.Value); + _probIndex[key] = _prob.First!; + } return true; } @@ -71,6 +114,14 @@ public bool TryGet(TKey key, out TValue value) return false; } + /// Tail-most protected node that is not pinned, or null if all are pinned. + private LinkedListNode? LastUnpinnedProtected() + { + for (var node = _prot.Last; node is not null; node = node.Previous) + if (!_pinned.Contains(node.Value.Key)) return node; + return null; + } + /// /// Insert into the probationary segment. 
/// If insertion causes the probationary segment to exceed capacity, the LRU tail is evicted @@ -84,11 +135,12 @@ public bool Put(TKey key, TValue value, out TKey evictedKey, out TValue evictedV if (_prob.Count > _probCapacity) { - var victim = _prob.Last!.Value; - _probIndex.Remove(victim.Key); - _prob.RemoveLast(); - evictedKey = victim.Key; - evictedValue = victim.Value; + var victim = SelectProbationaryVictim(); + _probIndex.Remove(victim.Value.Key); + _prob.Remove(victim); + _pinned.Remove(victim.Value.Key); // defensive; pinned entries live in protected + evictedKey = victim.Value.Key; + evictedValue = victim.Value.Value; return true; } @@ -97,6 +149,80 @@ public bool Put(TKey key, TValue value, out TKey evictedKey, out TValue evictedV return false; } + /// + /// Choose which probationary entry to evict. Never the most-recently-inserted + /// entry () and never a pinned entry. With a + /// frequency accessor, picks the least-frequently-accessed candidate (older + /// entry breaks ties); otherwise the LRU tail. + /// + private LinkedListNode SelectProbationaryVictim() + { + // Walk from tail (oldest) toward head, skipping the just-inserted head and + // any pinned entries. + LinkedListNode? best = null; + long bestFreq = long.MaxValue; + for (var node = _prob.Last; node is not null && node != _prob.First; node = node.Previous) + { + if (_pinned.Contains(node.Value.Key)) continue; + if (_frequencyOf is null) + return node; // first unpinned from the tail == LRU victim + + long freq = _frequencyOf(node.Value.Key); + if (freq < bestFreq) + { + bestFreq = freq; + best = node; + } + } + // Fallback when every other entry is pinned: evict the tail regardless. + return best ?? _prob.Last!; + } + + /// + /// Pin so it is never evicted or demoted while resident. + /// Pinned entries are moved into the protected segment. No-op if the key is not + /// currently resident (load it via first) or already pinned. 
+ /// + public void Pin(TKey key) + { + if (_pinned.Contains(key)) return; + + if (_protIndex.ContainsKey(key)) + { + _pinned.Add(key); + return; + } + + if (_probIndex.TryGetValue(key, out var node)) + { + _prob.Remove(node); + _probIndex.Remove(key); + // Demote an unpinned protected tail entry if protected is full so the + // pinned entry has a home; if all are pinned it simply grows protected + // (bounded by the caller pinning ≤ protected capacity). + if (_prot.Count >= _protCapacity) + { + var demote = LastUnpinnedProtected(); + if (demote is not null) + { + var demoted = demote.Value; + _prot.Remove(demote); + _protIndex.Remove(demoted.Key); + _prob.AddFirst(demoted); + _probIndex[demoted.Key] = _prob.First!; + } + } + _prot.AddFirst(node.Value); + _protIndex[key] = _prot.First!; + _pinned.Add(key); + } + } + + /// Remove the pin on (entry stays resident in protected). + public void Unpin(TKey key) => _pinned.Remove(key); + + public bool IsPinned(TKey key) => _pinned.Contains(key); + public bool Contains(TKey key) => _protIndex.ContainsKey(key) || _probIndex.ContainsKey(key); @@ -110,5 +236,6 @@ public void Clear() _probIndex.Clear(); _prot.Clear(); _protIndex.Clear(); + _pinned.Clear(); } } diff --git a/tests/SharpInference.Tests.Pipeline/PipelineTests.cs b/tests/SharpInference.Tests.Pipeline/PipelineTests.cs index 98469b8..bf04ef7 100644 --- a/tests/SharpInference.Tests.Pipeline/PipelineTests.cs +++ b/tests/SharpInference.Tests.Pipeline/PipelineTests.cs @@ -101,5 +101,299 @@ public void ExpertAccessProfiler_TopExperts_OrderedByAccess() Assert.Equal(3, top[0]); // most accesses (3) Assert.Equal(2, top[1]); // second most accesses (2) } + + [Fact] + public void ExpertAccessProfiler_GetAccessCount_SumsHitsAndMisses() + { + var profiler = new SharpInference.Pipeline.ExpertAccessProfiler(numLayers: 2, numExperts: 4); + profiler.RecordHit(1, 2); + profiler.RecordHit(1, 2); + profiler.RecordMiss(1, 2); + Assert.Equal(3, profiler.GetAccessCount(1, 2)); + 
Assert.Equal(0, profiler.GetAccessCount(0, 2)); // different layer untouched + } + + // ── Frequency-aware eviction ─────────────────────────────────────────── + + [Fact] + public void SlruCache_FrequencyAware_EvictsLeastAccessed_NotLruTail() + { + // freq accessor: key 1 is hot, keys 2 and 3 are cold. + var freq = new System.Collections.Generic.Dictionary { [1] = 100, [2] = 1, [3] = 1 }; + var cache = new SharpInference.Pipeline.SlruCache( + probationaryCapacity: 3, protectedCapacity: 1, frequencyOf: k => freq.GetValueOrDefault(k)); + cache.Put(1, "hot", out _, out _); // tail-most (oldest) but highest freq + cache.Put(2, "cold-a", out _, out _); + cache.Put(3, "cold-b", out _, out _); + // Insert a 4th → probationary overflows. Plain LRU would evict key 1 (oldest); + // frequency-aware keeps the hot key 1 and evicts the least-accessed older entry (key 2). + bool evicted = cache.Put(4, "new", out int evKey, out _); + Assert.True(evicted); + Assert.Equal(2, evKey); + Assert.True(cache.Contains(1)); // hot survived despite being oldest + } + + [Fact] + public void SlruCache_FrequencyAware_NeverEvictsJustInsertedEntry() + { + // The just-inserted entry has frequency 0 (coldest) but must not be evicted. 
+ var freq = new System.Collections.Generic.Dictionary { [1] = 5, [2] = 5 }; + var cache = new SharpInference.Pipeline.SlruCache( + probationaryCapacity: 2, protectedCapacity: 1, frequencyOf: k => freq.GetValueOrDefault(k)); + cache.Put(1, "a", out _, out _); + cache.Put(2, "b", out _, out _); + bool evicted = cache.Put(99, "fresh", out int evKey, out _); // freq(99)=0 + Assert.True(evicted); + Assert.NotEqual(99, evKey); // the fresh insert is protected from immediate eviction + Assert.True(cache.Contains(99)); + } + + // ── Pinning ──────────────────────────────────────────────────────────── + + [Fact] + public void SlruCache_Pin_MovesToProtectedAndSurvivesEviction() + { + var cache = new SharpInference.Pipeline.SlruCache(probationaryCapacity: 2, protectedCapacity: 2); + cache.Put(1, "pinme", out _, out _); + cache.Pin(1); + Assert.True(cache.IsPinned(1)); + Assert.Equal(1, cache.ProtectedCount); // pinning moved it to protected + Assert.Equal(1, cache.PinnedCount); + + // Churn probationary hard; pinned key 1 must never be evicted. 
+ for (int k = 10; k < 30; k++) + cache.Put(k, $"v{k}", out _, out _); + Assert.True(cache.Contains(1)); + } + + [Fact] + public void SlruCache_Pin_NotEvictedAndNotChosenAsVictim() + { + var cache = new SharpInference.Pipeline.SlruCache(probationaryCapacity: 2, protectedCapacity: 1); + cache.Put(1, "a", out _, out _); + cache.Pin(1); // → protected, pinned + cache.Put(2, "b", out _, out _); + cache.Put(3, "c", out _, out _); + bool evicted = cache.Put(4, "d", out int evKey, out _); // prob overflow (2,3,4) → evict one + Assert.True(evicted); + Assert.NotEqual(1, evKey); // pinned never the victim + Assert.True(cache.Contains(1)); + } + + [Fact] + public void SlruCache_Unpin_AllowsEvictionAgain() + { + var cache = new SharpInference.Pipeline.SlruCache(probationaryCapacity: 1, protectedCapacity: 1); + cache.Put(1, "a", out _, out _); + cache.Pin(1); + Assert.True(cache.IsPinned(1)); + cache.Unpin(1); + Assert.False(cache.IsPinned(1)); + Assert.Equal(0, cache.PinnedCount); + } + + [Fact] + public void SlruCache_Pin_NonResidentKey_IsNoOp() + { + var cache = new SharpInference.Pipeline.SlruCache(probationaryCapacity: 2, protectedCapacity: 2); + cache.Pin(42); // not resident + Assert.False(cache.IsPinned(42)); + Assert.Equal(0, cache.PinnedCount); + } + + [Fact] + public void ExpertCache_Pin_KeepsHotExpertResidentUnderChurn() + { + var evicted = new System.Collections.Generic.List(); + var cache = new SharpInference.Pipeline.ExpertCache(capacity: 4, onEvict: evicted.Add); + cache.Put(0, 7, "hot"); + cache.Pin(0, 7); + Assert.True(cache.IsPinned(0, 7)); + for (int e = 100; e < 130; e++) + cache.Put(0, e, $"e{e}"); + Assert.True(cache.TryGet(0, 7, out var v)); + Assert.Equal("hot", v); + Assert.DoesNotContain("hot", evicted); + cache.Dispose(); + } + + [Fact] + public void ExpertCache_FrequencyAware_EvictsLeastAccessedExpert() + { + var accesses = new System.Collections.Generic.Dictionary<(int, int), long> + { + [(0, 1)] = 50, [(0, 2)] = 1, [(0, 3)] = 1, + }; + string? 
evicted = null; + var cache = new SharpInference.Pipeline.ExpertCache( + capacity: 8, onEvict: v => evicted = v, + frequencyOf: (l, e) => accesses.GetValueOrDefault((l, e))); + // capacity 8 → probCap=2, protCap=6, so "hot" and "cold-a" both fit in + // probationary; inserting "cold-b" overflows and must evict the least-accessed + // non-head entry — "cold-a" (freq 1), not "hot" (freq 50). With probCap=1 the + // head-exclusion alone would force the eviction, never exercising the freq path. + cache.Put(0, 1, "hot"); + cache.Put(0, 2, "cold-a"); + cache.Put(0, 3, "cold-b"); + Assert.Equal("cold-a", evicted); // least-accessed evicted, not the hot expert + Assert.True(cache.Contains(0, 1)); // hot is retained + } + + // ── Predictive prefetch: ExpertRoutePredictor ─────────────────────────── + + [Fact] + public void ExpertRoutePredictor_UnseenLayer_PredictsNothing() + { + var p = new SharpInference.Pipeline.ExpertRoutePredictor(numLayers: 4, maxActiveExperts: 8); + Assert.False(p.TryPredict(0, out _)); + } + + [Fact] + public void ExpertRoutePredictor_RecallsLastSelection() + { + var p = new SharpInference.Pipeline.ExpertRoutePredictor(numLayers: 4, maxActiveExperts: 8); + p.Record(2, stackalloc int[] { 5, 9, 13 }); + Assert.True(p.TryPredict(2, out var pred)); + Assert.Equal(new[] { 5, 9, 13 }, pred.ToArray()); + Assert.False(p.TryPredict(3, out _)); // independent per layer + } + + [Fact] + public void ExpertRoutePredictor_LatestRecordWins() + { + var p = new SharpInference.Pipeline.ExpertRoutePredictor(numLayers: 2, maxActiveExperts: 4); + p.Record(0, stackalloc int[] { 1, 2 }); + p.Record(0, stackalloc int[] { 7, 8, 9 }); // next token's selection replaces + Assert.True(p.TryPredict(0, out var pred)); + Assert.Equal(new[] { 7, 8, 9 }, pred.ToArray()); + } + + [Fact] + public void ExpertRoutePredictor_Reset_ClearsHistory() + { + var p = new SharpInference.Pipeline.ExpertRoutePredictor(numLayers: 2, maxActiveExperts: 4); + p.Record(1, stackalloc int[] { 3 }); + 
p.Reset(); + Assert.False(p.TryPredict(1, out _)); + } + + [Fact] + public void ExpertRoutePredictor_ClampsToMaxActive() + { + var p = new SharpInference.Pipeline.ExpertRoutePredictor(numLayers: 1, maxActiveExperts: 2); + p.Record(0, stackalloc int[] { 4, 5, 6, 7 }); // more than maxActive + Assert.True(p.TryPredict(0, out var pred)); + Assert.Equal(2, pred.Length); + Assert.Equal(new[] { 4, 5 }, pred.ToArray()); + } + + // ── Model-aware cache sizing: MoeCacheSizing ──────────────────────────── + + [Fact] + public void MoeCacheSizing_AmpleVram_CapsAtTotalExperts() + { + // 8 layers × 64 experts = 512; ample free VRAM → capacity == total. + var plan = SharpInference.Pipeline.MoeCacheSizing.Plan( + gpuLayers: 8, numExperts: 64, numActiveExperts: 8, + freeVramBytes: 100L << 30, perExpertBytes: 2L << 20, reserveBytes: 512L << 20); + Assert.Equal(512, plan.Slots); + Assert.True(plan.BudgetSlots >= 512); + } + + [Fact] + public void MoeCacheSizing_TightVram_NeverExceedsBudget() + { + // Only ~50 experts' worth of VRAM free → capacity bounded by budget, not total. 
+ long perExpert = 2L << 20; + long free = (512L << 20) + 50 * perExpert; // reserve + 50 experts + var plan = SharpInference.Pipeline.MoeCacheSizing.Plan( + gpuLayers: 8, numExperts: 64, numActiveExperts: 8, + freeVramBytes: free, perExpertBytes: perExpert, reserveBytes: 512L << 20); + Assert.Equal(50, plan.Slots); // exactly what fits + Assert.True(plan.Slots <= plan.BudgetSlots); + } + + [Fact] + public void MoeCacheSizing_RecommendedIsTwiceActivePerLayer() + { + var plan = SharpInference.Pipeline.MoeCacheSizing.Plan( + gpuLayers: 8, numExperts: 64, numActiveExperts: 8, + freeVramBytes: 100L << 30, perExpertBytes: 2L << 20, reserveBytes: 512L << 20); + Assert.Equal(8 * 2 * 8, plan.RecommendedSlots); // 8 layers × 2×8 active = 128 + } + + [Fact] + public void MoeCacheSizing_TightVram_FlagsBelowRecommended() + { + long perExpert = 2L << 20; + long free = (512L << 20) + 50 * perExpert; + var plan = SharpInference.Pipeline.MoeCacheSizing.Plan( + gpuLayers: 8, numExperts: 64, numActiveExperts: 8, + freeVramBytes: free, perExpertBytes: perExpert, reserveBytes: 512L << 20); + Assert.True(plan.Slots < plan.RecommendedSlots); // 50 < 128 → caller warns + } + + [Fact] + public void MoeCacheSizing_RecommendedCappedAtTotal_WhenFewExperts() + { + // 2× active (16) exceeds numExperts (8) → recommended per layer capped at numExperts. 
+ var plan = SharpInference.Pipeline.MoeCacheSizing.Plan( + gpuLayers: 4, numExperts: 8, numActiveExperts: 8, + freeVramBytes: 100L << 30, perExpertBytes: 1L << 20, reserveBytes: 0); + Assert.Equal(4 * 8, plan.RecommendedSlots); // capped at total, not 4×16 + } + + [Fact] + public void MoeCacheSizing_ZeroLayers_ReturnsZero() + { + var plan = SharpInference.Pipeline.MoeCacheSizing.Plan(0, 64, 8, 1L << 30, 1L << 20, 0); + Assert.Equal(0, plan.Slots); + Assert.Equal(SharpInference.Pipeline.MoeCacheSizingStatus.Empty, plan.Status); + } + + [Fact] + public void MoeCacheSizing_AmpleVram_StatusIsOk() + { + var plan = SharpInference.Pipeline.MoeCacheSizing.Plan( + gpuLayers: 8, numExperts: 64, numActiveExperts: 8, + freeVramBytes: 100L << 30, perExpertBytes: 2L << 20, reserveBytes: 512L << 20); + Assert.Equal(SharpInference.Pipeline.MoeCacheSizingStatus.Ok, plan.Status); + } + + [Fact] + public void MoeCacheSizing_TightVram_StatusIsBelowRecommended() + { + long perExpert = 2L << 20; + long free = (512L << 20) + 50 * perExpert; + var plan = SharpInference.Pipeline.MoeCacheSizing.Plan( + gpuLayers: 8, numExperts: 64, numActiveExperts: 8, + freeVramBytes: free, perExpertBytes: perExpert, reserveBytes: 512L << 20); + Assert.Equal(SharpInference.Pipeline.MoeCacheSizingStatus.BelowRecommended, plan.Status); + } + + [Fact] + public void MoeCacheSizing_BudgetExhausted_ClampsToOneAndFlagsStatus() + { + // Reserve consumes the entire free VRAM → budget = 0 → clamped to 1. + // Caller must see the BudgetExhausted status to act on it. 
+ var plan = SharpInference.Pipeline.MoeCacheSizing.Plan( + gpuLayers: 8, numExperts: 64, numActiveExperts: 8, + freeVramBytes: 512L << 20, perExpertBytes: 2L << 20, reserveBytes: 512L << 20); + Assert.Equal(1, plan.Slots); + Assert.Equal(0, plan.BudgetSlots); + Assert.Equal(SharpInference.Pipeline.MoeCacheSizingStatus.BudgetExhausted, plan.Status); + } + + [Fact] + public void MoeCacheSizing_UnknownExpertSize_FlagsStatus() + { + // perExpertBytes == 0 (caller couldn't measure) → cache falls back to total. + // Status flag is the only way the caller can distinguish this from Ok. + var plan = SharpInference.Pipeline.MoeCacheSizing.Plan( + gpuLayers: 8, numExperts: 64, numActiveExperts: 8, + freeVramBytes: 100L << 30, perExpertBytes: 0, reserveBytes: 0); + Assert.Equal(8 * 64, plan.Slots); + Assert.Equal(SharpInference.Pipeline.MoeCacheSizingStatus.UnknownExpertSize, plan.Status); + } }