From 0d3a7a71d337aad51a09c6b0d992750e0ddb79b7 Mon Sep 17 00:00:00 2001
From: Pekka Heikura <pekkah@gmail.com>
Date: Mon, 15 Jun 2026 15:22:57 +0300
Subject: [PATCH 1/2] perf(engine): real per-page mmap pre-fault for
 CPU-resident weights (#221)

The hybrid "pre-fault" only touched 2 pages per tensor (DataPtr[0] +
DataPtr[size-1]), and CudaHybridGdnForwardPass had none -- so the first
request on CPU-MoE configs faulted every expert page on the critical
path, ~5x slower than warm (the #210 bench protocol's "run each cell
twice" workaround exists for exactly this).

New shared MmapPrefault helper: a pure, testable gate (SHARPI_PREFAULT
0=off / 1=force, plus an 80%-of-RAM fit heuristic), best-effort OS
read-ahead (PrefetchVirtualMemory on Windows, posix_madvise(WILLNEED)
on Linux, both LibraryImport), then a parallel per-page stride read
(32 MiB chunks) that guarantees residency and reports GiB/s.

Wired into the three hybrid classes via BuildCpuPrefaultRegions(), now
covering the full CPU-resident set the old gate missed -- the CPU-MoE
routed experts and Gemma 4 PLE table, which are mmap-resident even at
-g -1 (_nCpuLayers == 0). Embedding/output are skipped when GPU-resident.
The pre-existing full-page prefaults in ForwardPass and
HybridGdnForwardPass now honour the same SHARPI_PREFAULT=0 kill switch.

Verified on a 4070 Ti: CudaHybridForwardPass warms 16.39 GiB of Coder-30B
CPU-MoE experts; CudaHybridGdnForwardPass (no prefault before) warms
14.66 GiB of Carnice experts (0.5 GiB/s cold read = ~29s of fault I/O
moved off the request path). Decode unchanged (prefault is read-only).
13 unit tests in MmapPrefaultTests.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../CudaHybridForwardPass.cs                  | 101 +++++----
 .../CudaHybridGdnForwardPass.cs               |  41 ++++
 src/SharpInference.Engine/ForwardPass.cs      |   3 +-
 .../HybridForwardPass.cs                      |  97 +++++----
 .../HybridGdnForwardPass.cs                   |   3 +-
 src/SharpInference.Engine/MmapPrefault.cs     | 204 ++++++++++++++++++
 .../MmapPrefaultTests.cs                      | 202 +++++++++++++++++
 7 files changed, 568 insertions(+), 83 deletions(-)
 create mode 100644 src/SharpInference.Engine/MmapPrefault.cs
 create mode 100644 tests/SharpInference.Tests.ForwardPass/MmapPrefaultTests.cs
diff --git a/src/SharpInference.Engine/CudaHybridForwardPass.cs b/src/SharpInference.Engine/CudaHybridForwardPass.cs
index fcea42b..14479ba 100644
--- a/src/SharpInference.Engine/CudaHybridForwardPass.cs
+++ b/src/SharpInference.Engine/CudaHybridForwardPass.cs
@@ -930,48 +930,12 @@ void TraceVram(string label)
             _gpuRopeFreqs = UploadWeight("rope_freqs.weight");
         }
 
-        // Pre-fault mmap pages for CPU layers: touch the first byte of each weight tensor
-        // to ensure OS pages them into RAM before the first forward pass.
-        if (_nCpuLayers > 0)
-        {
-            Console.Error.Write($"[HybridForwardPass] Pre-faulting CPU weight pages...");
-            long touchSum = 0;
-            IEnumerable<CpuWeightRef> weightsToTouch = _cpuWq.Concat(_cpuWk).Concat(_cpuWv).Concat(_cpuWo);
-            if (_isMoE)
-            {
-                weightsToTouch = weightsToTouch
-                    .Concat(_cpuWGateInp!)
-                    .Concat(_cpuWGateExps!)
-                    .Concat(_cpuWUpExps!)
-                    .Concat(_cpuWDownExps!);
-                if (_hasSharedExpert)
-                {
-                    weightsToTouch = weightsToTouch
-                        .Concat(_cpuWGateShexp!)
-                        .Concat(_cpuWUpShexp!)
-                        .Concat(_cpuWDownShexp!);
-                }
-            }
-            else
-            {
-                weightsToTouch = weightsToTouch
-                    .Concat(_cpuWGate)
-                    .Concat(_cpuWUp)
-                    .Concat(_cpuWDown);
-            }
-
-            foreach (var wRef in weightsToTouch)
-            {
-                // Skip un-resolved slots — KV-share layers on Gemma 4 leave attn_k /
-                // attn_v unresolved by design (the source layer's projections are
-                // reused via the alias dispatch).
-                if (wRef.DataPtr == null) continue;
-                touchSum += wRef.DataPtr[0];
-                long size = wRef.Info.ByteSize;
-                if (size > 64) touchSum += wRef.DataPtr[size - 1];
-            }
-            Console.Error.WriteLine($" done. (touch={touchSum})");
-        }
+        // Pre-fault every CPU-resident mmap weight page so the first request doesn't
+        // stall on demand paging (issue #221). NOT gated on _nCpuLayers > 0: the
+        // CPU-MoE routed experts and the Gemma 4 PLE table are CPU-resident even when
+        // every transformer layer is GPU-offloaded (-g -1), and those are the dominant
+        // cold-start cost. MmapPrefault filters empty configs and honours SHARPI_PREFAULT.
+        MmapPrefault.Run("CudaHybridForwardPass", BuildCpuPrefaultRegions());
 
         if (_tqEnabled && _nCpuLayers > 0)
         {
@@ -2783,6 +2747,59 @@ private CpuWeightRef ResolveCpuWeight(string name)
         return new CpuWeightRef(name, info, info.DType, _model.GetTensorDataPtr(info));
     }
 
+    /// <summary>Collect every CPU-resident mmap weight region for the issue #221
+    /// pre-fault sweep. Covers per-CPU-layer weights, the CPU-MoE routed experts (the
+    /// big cold-start cost, present even with all layers GPU-offloaded), and the
+    /// Gemma 4 PLE table. Unresolved slots (null <c>DataPtr</c>) — e.g. Gemma 4
+    /// KV-share / k==v layers — are skipped. Biases and QK-norms are excluded: they're
+    /// dequantized into separate buffers, not read from the mmap at inference time.</summary>
+    private List<(nint Ptr, long Bytes)> BuildCpuPrefaultRegions()
+    {
+        var regions = new List<(nint, long)>();
+        void Add1(CpuWeightRef w)
+        {
+            if (w.DataPtr != null) regions.Add(((nint)w.DataPtr, w.Info.ByteSize));
+        }
+        void Add(CpuWeightRef[]? arr)
+        {
+            if (arr is null) return;
+            foreach (var w in arr) Add1(w);
+        }
+
+        // Embedding/output mmap refs are read at inference only when NOT uploaded to VRAM
+        // (_gpu* null); skipping a GPU-resident token_embd avoids a GiB-scale pointless read.
+        // Tied weights alias one region: skip the output only if the embedding was queued.
+        if (_gpuEmbedding is null) Add1(_cpuEmbedding);
+        if (_gpuOutputWeight is null)
+        {
+            Add1(_cpuOutputNorm);
+            if (_gpuEmbedding is not null || _cpuOutputWeight.DataPtr != _cpuEmbedding.DataPtr) Add1(_cpuOutputWeight);
+        }
+
+        Add(_cpuAttnNorm); Add(_cpuWq); Add(_cpuWk); Add(_cpuWv); Add(_cpuWo);
+        Add(_cpuFfnNorm); Add(_cpuPostAttnNorm); Add(_cpuPostFfwNorm);
+
+        if (_isMoE)
+        {
+            Add(_cpuWGateInp); Add(_cpuWGateExps); Add(_cpuWUpExps); Add(_cpuWDownExps);
+            if (_hasSharedExpert) { Add(_cpuWGateShexp); Add(_cpuWUpShexp); Add(_cpuWDownShexp); }
+        }
+        else
+        {
+            Add(_cpuWGate); Add(_cpuWUp); Add(_cpuWDown);
+        }
+
+        // CPU-MoE routed experts for the GPU-trunk layers (the -g -1 cold-start cost).
+        Add(_cpuMoeGateInp); Add(_cpuMoeGateExps); Add(_cpuMoeUpExps); Add(_cpuMoeDownExps);
+
+        // Gemma 4 PLE: the GiB-scale per-layer token-embedding table + per-layer projections.
+        if (_pleTokenEmbed is { } ple) Add1(ple);
+        if (_perLayerProjNorm is { } pln) Add1(pln);
+        Add(_cpuInpGate); Add(_cpuPleProj); Add(_cpuPlePostNorm);
+
+        return regions;
+    }
+
     private float* LoadCpuBias(string name, int count)
     {
         var info = _model.FindTensor(name)
diff --git a/src/SharpInference.Engine/CudaHybridGdnForwardPass.cs b/src/SharpInference.Engine/CudaHybridGdnForwardPass.cs
index c5e4794..e97d939 100644
--- a/src/SharpInference.Engine/CudaHybridGdnForwardPass.cs
+++ b/src/SharpInference.Engine/CudaHybridGdnForwardPass.cs
@@ -1414,6 +1414,13 @@ void TraceVram(string label)
             _logitsBuf2 = Array.Empty<float>();
             _cpuNormBuf2 = _cpuMoeHidden2 = _lastHiddenT1 = null;
         }
+
+        // Pre-fault CPU-resident mmap weight pages (issue #221). On the CPU-MoE config
+        // (the auto-selected winner on 12 GB) the routed experts / dense FFN weights are
+        // paged in lazily; without this the first request faults them all on the critical
+        // path, ~5× slower than warm. MmapPrefault honours SHARPI_PREFAULT and the
+        // RAM-fit heuristic, and no-ops when nothing is CPU-resident (full-GPU GDN).
+        MmapPrefault.Run("CudaHybridGdnForwardPass", BuildCpuPrefaultRegions());
     }
 
     // =================================================================
@@ -5231,6 +5238,40 @@ private CpuWeightRef ResolveCpuWeight(string name)
         return new CpuWeightRef(name, info, info.DType, _model.GetTensorDataPtr(info));
     }
 
+    /// <summary>Gathers the mmap-backed weight regions this pass reads on the CPU so
+    /// the issue #221 pre-fault sweep can warm them: the trunk's CPU-MoE routed experts
+    /// (or dense FFN weights), the SHARPI_CPU_GDN debug GDN weights, and the MoE-MTP
+    /// head experts. A weight array that is null for this config is ignored, as is any
+    /// slot whose <c>DataPtr</c> is null (e.g. the GDN arrays outside CPU-GDN mode).
+    /// Tensors dequantized through LoadF32Tensor/LoadConv1d live in separate buffers
+    /// rather than the mmap, so they are left out.</summary>
+    private List<(nint Ptr, long Bytes)> BuildCpuPrefaultRegions()
+    {
+        var spans = new List<(nint, long)>();
+        void AddOne(CpuWeightRef weight)
+        {
+            if (weight.DataPtr == null) return;
+            spans.Add(((nint)weight.DataPtr, weight.Info.ByteSize));
+        }
+        void AddAll(CpuWeightRef[]? weights)
+        {
+            if (weights is not null) foreach (var item in weights) AddOne(item);
+        }
+
+        // Trunk weights: routed CPU-MoE experts, or the dense FFN (Qwen3.6-27B-MTP).
+        AddAll(_cpuFfnGateInp); AddAll(_cpuFfnGateExps); AddAll(_cpuFfnUpExps); AddAll(_cpuFfnDownExps);
+        AddAll(_cpuWFfnGate); AddAll(_cpuWFfnUp); AddAll(_cpuWFfnDown);
+
+        // SHARPI_CPU_GDN=1 debug path: the arrays always exist, slots are filled only then.
+        AddAll(_cpuWQkv); AddAll(_cpuWZGate); AddAll(_cpuSsmOut); AddAll(_cpuSsmAlpha); AddAll(_cpuSsmBeta);
+
+        // Routed experts of the MoE-MTP head (a single extra layer; DataPtr is null if absent).
+        AddOne(_cpuMtpFfnGateInp); AddOne(_cpuMtpFfnGateExps);
+        AddOne(_cpuMtpFfnUpExps); AddOne(_cpuMtpFfnDownExps);
+
+        return spans;
+    }
+
     private Tensor UploadWeight(string name)
     {
         var info = _model.FindTensor(name)
diff --git a/src/SharpInference.Engine/ForwardPass.cs b/src/SharpInference.Engine/ForwardPass.cs
index 3735318..c5d099f 100644
--- a/src/SharpInference.Engine/ForwardPass.cs
+++ b/src/SharpInference.Engine/ForwardPass.cs
@@ -468,7 +468,8 @@ static long F32Bytes(in TensorRef t) =>
         _dequantCacheCovers = _dequantCacheEnabled && fullF32Bytes > 0
             && _dequantCacheBudgetBytes >= fullF32Bytes;
 
-        PrefaultWeights();
+        // SHARPI_PREFAULT=0 is the global kill switch (issue #221).
+        if (!MmapPrefault.IsDisabled()) PrefaultWeights();
     }
 
     /// <summary>
diff --git a/src/SharpInference.Engine/HybridForwardPass.cs b/src/SharpInference.Engine/HybridForwardPass.cs
index d0f8c0e..7ec3712 100644
--- a/src/SharpInference.Engine/HybridForwardPass.cs
+++ b/src/SharpInference.Engine/HybridForwardPass.cs
@@ -461,45 +461,11 @@ public HybridForwardPass(GgufModel model, VulkanBackend gpu, ModelHyperparams hp
 
         _cpuKvCache = new KvCache(_nCpuLayers, _maxSeqLen, _numKvHeads, _headDim);
 
-        // Pre-fault mmap pages for CPU layers: touch the first byte of each weight tensor
-        // to ensure OS pages them into RAM before the first forward pass.
-        if (_nCpuLayers > 0)
-        {
-            Console.Error.Write($"[HybridForwardPass] Pre-faulting CPU weight pages...");
-            long touchSum = 0;
-            IEnumerable<CpuWeightRef> weightsToTouch = _cpuWq.Concat(_cpuWk).Concat(_cpuWv).Concat(_cpuWo);
-            if (_isMoE)
-            {
-                weightsToTouch = weightsToTouch
-                    .Concat(_cpuWGateInp!)
-                    .Concat(_cpuWGateExps!)
-                    .Concat(_cpuWUpExps!)
-                    .Concat(_cpuWDownExps!);
-                if (_hasSharedExpert)
-                {
-                    weightsToTouch = weightsToTouch
-                        .Concat(_cpuWGateShexp!)
-                        .Concat(_cpuWUpShexp!)
-                        .Concat(_cpuWDownShexp!);
-                }
-            }
-            else
-            {
-                weightsToTouch = weightsToTouch
-                    .Concat(_cpuWGate)
-                    .Concat(_cpuWUp)
-                    .Concat(_cpuWDown);
-            }
-
-            foreach (var wRef in weightsToTouch)
-            {
-                // Touch first and last cache line of each weight tensor
-                touchSum += wRef.DataPtr[0];
-                long size = wRef.Info.ByteSize;
-                if (size > 64) touchSum += wRef.DataPtr[size - 1];
-            }
-            Console.Error.WriteLine($" done. (touch={touchSum})");
-        }
+        // Pre-fault every CPU-resident mmap weight page so the first request doesn't
+        // stall on demand paging (issue #221). The CPU embedding/output tensors are
+        // resolved even on a pure-GPU split (CPU embed lookup / lm_head), so the sweep
+        // is not gated on _nCpuLayers > 0. MmapPrefault honours SHARPI_PREFAULT.
+        MmapPrefault.Run("HybridForwardPass", BuildCpuPrefaultRegions());
 
         if (_tqEnabled && _nCpuLayers > 0)
         {
@@ -1165,6 +1131,59 @@ private CpuWeightRef ResolveCpuWeight(string name)
         return new CpuWeightRef(name, info, info.DType, _model.GetTensorDataPtr(info));
     }
 
+    /// <summary>Collect every CPU-resident mmap weight region for the issue #221
+    /// pre-fault sweep: the CPU embedding/output tensors plus the per-CPU-layer
+    /// attention/FFN weights (dense or MoE + shared experts). Biases and QK-norms are
+    /// excluded — they're dequantized into separate buffers, not read from the mmap.</summary>
+    private List<(nint Ptr, long Bytes)> BuildCpuPrefaultRegions()
+    {
+        var regions = new List<(nint, long)>();
+        void Add1(CpuWeightRef w)
+        {
+            if (w.DataPtr != null) regions.Add(((nint)w.DataPtr, w.Info.ByteSize));
+        }
+        void Add(CpuWeightRef[]? arr)
+        {
+            if (arr is null) return;
+            foreach (var w in arr) Add1(w);
+        }
+
+        // Embedding/output mmap refs are read at inference only when NOT uploaded to VRAM
+        // (_gpu* null); skipping a GPU-resident token_embd avoids a GiB-scale pointless read.
+        // Tied weights alias one region: skip the output only if the embedding was queued.
+        if (_gpuEmbedding is null) Add1(_cpuEmbedding);
+        if (_gpuOutputWeight is null)
+        {
+            Add1(_cpuOutputNorm);
+            if (_gpuEmbedding is not null || _cpuOutputWeight.DataPtr != _cpuEmbedding.DataPtr) Add1(_cpuOutputWeight);
+        }
+
+        Add(_cpuAttnNorm); Add(_cpuWq); Add(_cpuWk); Add(_cpuWv); Add(_cpuWo); Add(_cpuFfnNorm);
+
+        if (_isMoE)
+        {
+            Add(_cpuWGateInp); Add(_cpuWGateExps); Add(_cpuWUpExps); Add(_cpuWDownExps);
+            if (_hasSharedExpert) { Add(_cpuWGateShexp); Add(_cpuWUpShexp); Add(_cpuWDownShexp); }
+
+            // GPU-trunk routed experts live in the GPU SLRU cache, but every cache miss
+            // spills to GpuMoeFfnCpuFallback, which reads blk.{0..nGpu-1}.ffn_*_exps
+            // straight from the mmap on the CPU. Fault those too so first-token misses
+            // don't stall (mirrors the CUDA class's _cpuMoe* coverage).
+            for (int li = 0; li < _nGpuLayers; li++)
+            {
+                Add1(ResolveCpuWeight($"blk.{li}.ffn_gate_exps.weight"));
+                Add1(ResolveCpuWeight($"blk.{li}.ffn_up_exps.weight"));
+                Add1(ResolveCpuWeight($"blk.{li}.ffn_down_exps.weight"));
+            }
+        }
+        else
+        {
+            Add(_cpuWGate); Add(_cpuWUp); Add(_cpuWDown);
+        }
+
+        return regions;
+    }
+
     private float* LoadCpuBias(string name, int count)
     {
         var info = _model.FindTensor(name)
diff --git a/src/SharpInference.Engine/HybridGdnForwardPass.cs b/src/SharpInference.Engine/HybridGdnForwardPass.cs
index e7bdaba..0532fd8 100644
--- a/src/SharpInference.Engine/HybridGdnForwardPass.cs
+++ b/src/SharpInference.Engine/HybridGdnForwardPass.cs
@@ -673,7 +673,8 @@ public HybridGdnForwardPass(GgufModel model, IComputeBackend backend, ModelHyper
             }
         }
 
-        PrefaultWeights();
+        // SHARPI_PREFAULT=0 is the global kill switch (issue #221).
+        if (!MmapPrefault.IsDisabled()) PrefaultWeights();
     }
 
     // ============================================================
diff --git a/src/SharpInference.Engine/MmapPrefault.cs b/src/SharpInference.Engine/MmapPrefault.cs
new file mode 100644
index 0000000..f333a44
--- /dev/null
+++ b/src/SharpInference.Engine/MmapPrefault.cs
@@ -0,0 +1,204 @@
+using System.Diagnostics;
+using System.Runtime.InteropServices;
+
+namespace SharpInference.Engine;
+
+/// <summary>
+/// Forces memory-mapped, CPU-resident weight pages resident <i>before</i> the first
+/// forward pass, so the first request does not stall on demand paging (issue #221).
+///
+/// <para>The mmap'd GGUF is paged in lazily by the OS: without a warm-up, the first
+/// request faults every weight page one at a time on the critical path, running ~5×
+/// slower than warm steady-state on CPU-MoE configs (the same penalty #210's bench
+/// protocol works around by running every cell twice). A parallel sequential sweep
+/// pulls the same bytes in at memory/SSD bandwidth — a 15-20 GiB expert region warms
+/// in seconds at NVMe speeds.</para>
+///
+/// <para>Two-step warm-up: (1) a best-effort OS read-ahead hint
+/// (<c>PrefetchVirtualMemory</c> on Windows, <c>posix_madvise(WILLNEED)</c> on Linux),
+/// which coalesces the I/O; (2) a parallel per-page stride read that <i>guarantees</i>
+/// residency and gives the wall-clock measurement. Correctness never depends on the
+/// hint succeeding — the stride read is the source of truth.</para>
+///
+/// <para>Tunable via <c>SHARPI_PREFAULT</c>: <c>0</c> disables all prefaulting,
+/// <c>1</c> forces it on (bypassing the RAM-fit heuristic). Unset = auto.</para>
+/// </summary>
+internal static unsafe partial class MmapPrefault
+{
+    /// <summary>RAM-fit policy applied in auto mode (env unset).</summary>
+    internal enum RamGate
+    {
+        /// <summary>Skip when the mapped set exceeds ~80% of available RAM — it would
+        /// just thrash. Used by the GPU-offload hybrid passes, whose CPU-resident
+        /// weights are a subset of the model that should comfortably fit.</summary>
+        FitsInRam,
+
+        /// <summary>Always sweep (subject only to the <c>SHARPI_PREFAULT=0</c> kill
+        /// switch). Used by the fully-CPU-resident passes, where the user has already
+        /// chosen to run the whole model from RAM and prefaulting is the point.</summary>
+        Always,
+    }
+
+    internal readonly record struct Result(bool Ran, long Bytes, double Seconds, string Reason); // outcome of Run; Seconds is 0 when Ran is false
+
+    /// <summary>Reports whether the global kill switch is set, i.e. the
+    /// <c>SHARPI_PREFAULT</c> environment variable is exactly <c>"0"</c>. Lets callers
+    /// that do their own prefaulting honour the switch without going via <see cref="Run"/>.</summary>
+    internal static bool IsDisabled()
+        => string.Equals(Environment.GetEnvironmentVariable("SHARPI_PREFAULT"), "0", StringComparison.Ordinal);
+
+    /// <summary>
+    /// Decides whether a sweep should happen, with no memory or environment access so
+    /// the policy can be unit-tested directly. <paramref name="mode"/> carries the raw
+    /// <c>SHARPI_PREFAULT</c> value (null when unset); <paramref name="reason"/> explains the verdict.
+    /// </summary>
+    internal static bool ShouldRun(string? mode, long totalBytes, long availRamBytes,
+        RamGate gate, out string reason)
+    {
+        if (totalBytes <= 0) { reason = "no mapped weights"; return false; }
+        switch (mode)
+        {
+            case "0": reason = "disabled (SHARPI_PREFAULT=0)"; return false;
+            case "1": reason = "forced (SHARPI_PREFAULT=1)"; return true;
+        }
+        bool ramKnown = availRamBytes > 0;
+        bool overBudget = gate == RamGate.FitsInRam && ramKnown && totalBytes > availRamBytes / 10 * 8;
+        reason = !overBudget ? "auto" : $"skipped: {totalBytes >> 20} MiB mapped exceeds 80% of "
+               + $"{availRamBytes >> 20} MiB RAM (set SHARPI_PREFAULT=1 to force)";
+        return !overBudget;
+    }
+
+    /// <summary>
+    /// Warms the supplied CPU-resident mmap regions. Entries with a null pointer or a
+    /// non-positive size are ignored, so a caller may hand over its entire weight set.
+    /// The warmed size and rate (or the skip reason) go to <see cref="Console.Error"/>.
+    /// </summary>
+    internal static Result Run(string label, List<(nint Ptr, long Bytes)> regions,
+        RamGate gate = RamGate.FitsInRam)
+    {
+        long mappedBytes = 0;
+        foreach (var region in regions)
+            mappedBytes += region.Ptr != 0 && region.Bytes > 0 ? region.Bytes : 0;
+
+        string? envMode = Environment.GetEnvironmentVariable("SHARPI_PREFAULT");
+        long ramBytes = GC.GetGCMemoryInfo().TotalAvailableMemoryBytes;
+        if (!ShouldRun(envMode, mappedBytes, ramBytes, gate, out string why))
+        {
+            // A skip is only worth logging when there was something to warm.
+            if (mappedBytes > 0) Console.Error.WriteLine($"[{label}] Pre-fault {why}.");
+            return new Result(false, mappedBytes, 0, why);
+        }
+
+        long started = Stopwatch.GetTimestamp();
+        TryAdvise(regions);                    // OS read-ahead hint; failures are swallowed
+        long checksum = StrideRead(regions);   // the reads that actually force residency
+        double elapsed = (Stopwatch.GetTimestamp() - started) / (double)Stopwatch.Frequency;
+
+        double gibWarmed = mappedBytes / (1024.0 * 1024 * 1024);
+        double gibPerSec = elapsed > 1e-3 ? gibWarmed / elapsed : 0;
+        Console.Error.WriteLine(
+            $"[{label}] Pre-faulted {gibWarmed:F2} GiB of CPU-resident weights in {elapsed:F1}s ({gibPerSec:F1} GiB/s).");
+        // Consume the checksum so the page reads cannot be treated as dead code.
+        if (checksum == long.MinValue) Console.Error.Write(checksum);
+        return new Result(true, mappedBytes, elapsed, why);
+    }
+
+    /// <summary>Reads one byte per page across every region, in parallel. Regions are
+    /// cut into 32 MiB jobs first so that a few multi-GiB tensors are shared between
+    /// workers instead of each tensor being walked start-to-end as a single job.</summary>
+    private static long StrideRead(List<(nint Ptr, long Bytes)> regions)
+    {
+        const long JobBytes = 32L * 1024 * 1024;
+        int step = Environment.SystemPageSize;
+        var work = new List<(nint Base, long From, long To)>();
+        foreach (var (basePtr, length) in regions)
+        {
+            if (basePtr == 0 || length <= 0) continue;
+            for (long from = 0; from < length; from += JobBytes)
+                work.Add((basePtr, from, Math.Min(from + JobBytes, length)));
+        }
+
+        long total = 0;
+        Parallel.ForEach(work, item =>
+        {
+            byte* cursor = (byte*)item.Base;
+            long partial = 0;
+            for (long at = item.From; at < item.To; at += step)
+                partial += cursor[at];
+            partial += cursor[item.To - 1]; // last byte, so the job's tail page is read too
+            Interlocked.Add(ref total, partial);
+        });
+        return total;
+    }
+
+    /// <summary>Issues the platform read-ahead hint, if there is one. Every exception
+    /// is swallowed on purpose: the hint is optional and must not block the stride read.</summary>
+    private static void TryAdvise(List<(nint Ptr, long Bytes)> regions)
+    {
+        try
+        {
+            if (OperatingSystem.IsLinux()) AdviseLinux(regions);
+            else if (OperatingSystem.IsWindows()) AdviseWindows(regions);
+        }
+        catch
+        {
+            // Deliberately empty — e.g. a missing native symbol must not abort warm-up.
+        }
+    }
+
+    // ── Windows: PrefetchVirtualMemory ──────────────────────────────────────
+    private static void AdviseWindows(List<(nint Ptr, long Bytes)> regions)
+    {
+        // Gather the valid ranges into one table so a single call covers them all.
+        var table = new List<Win32MemoryRangeEntry>(regions.Count);
+        foreach (var (start, length) in regions)
+        {
+            if (start == 0 || length <= 0) continue;
+            table.Add(new Win32MemoryRangeEntry
+            {
+                VirtualAddress = start,
+                NumberOfBytes = (nuint)length,
+            });
+        }
+        if (table.Count == 0) return;
+
+        // The result is discarded on purpose: the prefetch is an asynchronous,
+        // best-effort hint, and residency is guaranteed by the stride read instead.
+        Win32MemoryRangeEntry[] ranges = table.ToArray();
+        fixed (Win32MemoryRangeEntry* first = ranges)
+            _ = PrefetchVirtualMemory(GetCurrentProcess(), (nuint)ranges.Length, first, 0);
+    }
+
+    [StructLayout(LayoutKind.Sequential)]
+    private struct Win32MemoryRangeEntry // one (address, length) range handed to PrefetchVirtualMemory
+    {
+        public nint VirtualAddress; // start of the range
+        public nuint NumberOfBytes; // length of the range in bytes
+    }
+
+    [LibraryImport("kernel32.dll", EntryPoint = "PrefetchVirtualMemory")]
+    private static partial int PrefetchVirtualMemory(nint hProcess, nuint numberOfEntries,
+        Win32MemoryRangeEntry* virtualAddresses, uint flags); // AdviseWindows passes flags = 0 and ignores the result
+
+    [LibraryImport("kernel32.dll", EntryPoint = "GetCurrentProcess")]
+    private static partial nint GetCurrentProcess(); // supplies the hProcess argument for PrefetchVirtualMemory
+
+    // ── Linux: posix_madvise(WILLNEED) ──────────────────────────────────────
+    private static void AdviseLinux(List<(nint Ptr, long Bytes)> regions)
+    {
+        const int WillNeed = 3; // value passed as the posix_madvise "advice" argument
+        long pageMask = ~((long)Environment.SystemPageSize - 1);
+        foreach (var region in regions)
+        {
+            if (region.Ptr == 0 || region.Bytes <= 0) continue;
+            // Needs a page-aligned start: snap down to the page and lengthen to compensate.
+            long start = region.Ptr;
+            long pageStart = start & pageMask;
+            nuint span = (nuint)(region.Bytes + (start - pageStart));
+            _ = posix_madvise((nint)pageStart, span, WillNeed);
+        }
+    }
+
+    [LibraryImport("libc", EntryPoint = "posix_madvise")]
+    private static partial int posix_madvise(nint addr, nuint length, int advice); // called once per region by AdviseLinux; result ignored
+}
diff --git a/tests/SharpInference.Tests.ForwardPass/MmapPrefaultTests.cs b/tests/SharpInference.Tests.ForwardPass/MmapPrefaultTests.cs
new file mode 100644
index 0000000..93c290b
--- /dev/null
+++ b/tests/SharpInference.Tests.ForwardPass/MmapPrefaultTests.cs
@@ -0,0 +1,202 @@
+using System.Runtime.InteropServices;
+using SharpInference.Engine;
+
+namespace SharpInference.Tests.ForwardPass;
+
+/// <summary>
+/// Unit tests for the issue #221 mmap pre-fault helper. The gating decision is tested
+/// as a pure function (no memory, no environment); a handful of integration tests
+/// exercise the real sweep over small <see cref="NativeMemory"/> buffers and the
+/// SHARPI_PREFAULT kill switch / RAM-fit skip (which must bail out before touching the
+/// claimed bytes — verified by claiming far more than is actually allocated).
+/// </summary>
+public sealed class MmapPrefaultTests
+{
+    private const long Gib = 1L << 30;
+
+    // ── Pure gating decision ────────────────────────────────────────────────
+
+    [Fact]
+    public void ShouldRun_NoBytes_IsFalse()
+    {
+        Assert.False(MmapPrefault.ShouldRun(null, 0, 16 * Gib, MmapPrefault.RamGate.FitsInRam, out var reason));
+        Assert.Contains("no mapped weights", reason);
+    }
+
+    [Fact]
+    public void ShouldRun_ModeZero_IsDisabled()
+    {
+        Assert.False(MmapPrefault.ShouldRun("0", 4 * Gib, 64 * Gib, MmapPrefault.RamGate.FitsInRam, out var reason));
+        Assert.Contains("disabled", reason);
+    }
+
+    [Fact]
+    public void ShouldRun_ModeOne_ForcesEvenWhenOverRam()
+    {
+        // Force bypasses the RAM-fit heuristic entirely.
+        Assert.True(MmapPrefault.ShouldRun("1", 100 * Gib, 8 * Gib, MmapPrefault.RamGate.FitsInRam, out var reason));
+        Assert.Contains("forced", reason);
+    }
+
+    [Fact]
+    public void ShouldRun_Auto_FitsInRam_Runs()
+    {
+        Assert.True(MmapPrefault.ShouldRun(null, 4 * Gib, 16 * Gib, MmapPrefault.RamGate.FitsInRam, out _));
+    }
+
+    [Fact]
+    public void ShouldRun_Auto_ExceedsEightyPercent_Skips()
+    {
+        // 14 GiB mapped > 80% of 16 GiB (= 12.8 GiB) → skip rather than thrash.
+        Assert.False(MmapPrefault.ShouldRun(null, 14 * Gib, 16 * Gib, MmapPrefault.RamGate.FitsInRam, out var reason));
+        Assert.Contains("exceeds", reason);
+    }
+
+    [Fact]
+    public void ShouldRun_Auto_ExactlyEightyPercent_Runs()
+    {
+        // Boundary: the gate uses a strict '>' so exactly 80% still runs.
+        long avail = 16 * Gib;
+        Assert.True(MmapPrefault.ShouldRun(null, avail / 10 * 8, avail, MmapPrefault.RamGate.FitsInRam, out _));
+    }
+
+    [Fact]
+    public void ShouldRun_AlwaysGate_IgnoresRamHeuristic()
+    {
+        // The fully-CPU-resident passes prefault regardless of the 80% threshold.
+        Assert.True(MmapPrefault.ShouldRun(null, 100 * Gib, 16 * Gib, MmapPrefault.RamGate.Always, out _));
+    }
+
+    [Fact]
+    public void ShouldRun_UnknownRam_DoesNotSkip()
+    {
+        // availRamBytes <= 0 means "couldn't measure" — don't skip on a guess.
+        Assert.True(MmapPrefault.ShouldRun(null, 100 * Gib, 0, MmapPrefault.RamGate.FitsInRam, out _));
+    }
+
+    // ── IsDisabled / env kill switch ────────────────────────────────────────
+
+    [Fact]
+    public void IsDisabled_ReflectsEnv()
+    {
+        var prev = Environment.GetEnvironmentVariable("SHARPI_PREFAULT");
+        try
+        {
+            Environment.SetEnvironmentVariable("SHARPI_PREFAULT", "0");
+            Assert.True(MmapPrefault.IsDisabled());
+
+            Environment.SetEnvironmentVariable("SHARPI_PREFAULT", "1");
+            Assert.False(MmapPrefault.IsDisabled());
+
+            Environment.SetEnvironmentVariable("SHARPI_PREFAULT", null);
+            Assert.False(MmapPrefault.IsDisabled());
+        }
+        finally
+        {
+            Environment.SetEnvironmentVariable("SHARPI_PREFAULT", prev);
+        }
+    }
+
+    // ── Integration: real sweep over small buffers ──────────────────────────
+
+    [Fact]
+    public unsafe void Run_SmallBuffers_FaultsAndReportsBytes()
+    {
+        var prev = Environment.GetEnvironmentVariable("SHARPI_PREFAULT");
+        Environment.SetEnvironmentVariable("SHARPI_PREFAULT", null); // auto
+        const long sizeA = 1 << 20; // 1 MiB: many pages, but below the 32 MiB chunk size, so one stride job
+        const long sizeB = 64 << 10; // 64 KiB
+        void* a = NativeMemory.Alloc((nuint)sizeA);
+        void* b = NativeMemory.Alloc((nuint)sizeB);
+        try
+        {
+            new Span<byte>(a, (int)sizeA).Fill(1);
+            new Span<byte>(b, (int)sizeB).Fill(2);
+
+            var regions = new List<(nint, long)> { ((nint)a, sizeA), ((nint)b, sizeB) };
+            var result = MmapPrefault.Run("test", regions, MmapPrefault.RamGate.Always);
+
+            Assert.True(result.Ran);
+            Assert.Equal(sizeA + sizeB, result.Bytes);
+        }
+        finally
+        {
+            NativeMemory.Free(a);
+            NativeMemory.Free(b);
+            Environment.SetEnvironmentVariable("SHARPI_PREFAULT", prev);
+        }
+    }
+
+    [Fact]
+    public unsafe void Run_NullAndZeroRegions_AreSkipped()
+    {
+        var prev = Environment.GetEnvironmentVariable("SHARPI_PREFAULT");
+        const long size = 4096;
+        void* a = NativeMemory.Alloc((nuint)size); // before the env change: if this throws, nothing needs undoing
+        try
+        {
+            Environment.SetEnvironmentVariable("SHARPI_PREFAULT", null); // auto; set inside try so finally always restores it
+            new Span<byte>(a, (int)size).Clear();
+            var regions = new List<(nint, long)>
+            {
+                (0, size),        // null ptr → ignored
+                ((nint)a, 0),     // zero bytes → ignored
+                ((nint)a, size),  // the only real region
+            };
+            var result = MmapPrefault.Run("test", regions, MmapPrefault.RamGate.Always);
+
+            Assert.True(result.Ran);
+            Assert.Equal(size, result.Bytes); // only the valid region counts
+        }
+        finally
+        {
+            NativeMemory.Free(a);
+            Environment.SetEnvironmentVariable("SHARPI_PREFAULT", prev);
+        }
+    }
+
+    [Fact]
+    public unsafe void Run_Disabled_SkipsWithoutTouching()
+    {
+        var prev = Environment.GetEnvironmentVariable("SHARPI_PREFAULT");
+        // Tiny real allocation, but the region claims 1 TiB: if Run tried to stride-read
+        // it the process would fault. The kill switch must bail out before any access.
+        void* a = NativeMemory.Alloc(16);
+        try
+        {
+            Environment.SetEnvironmentVariable("SHARPI_PREFAULT", "0"); // set inside try so finally always restores it
+            var regions = new List<(nint, long)> { ((nint)a, 1L << 40) };
+            var result = MmapPrefault.Run("test", regions, MmapPrefault.RamGate.Always);
+
+            Assert.False(result.Ran);
+            Assert.Contains("disabled", result.Reason);
+        }
+        finally
+        {
+            NativeMemory.Free(a);
+            Environment.SetEnvironmentVariable("SHARPI_PREFAULT", prev);
+        }
+    }
+
+    [Fact]
+    public unsafe void Run_AutoExceedsRam_SkipsWithoutTouching()
+    {
+        var prev = Environment.GetEnvironmentVariable("SHARPI_PREFAULT");
+        void* a = NativeMemory.Alloc(16); // before the env change: if this throws, nothing needs undoing
+        try
+        {
+            Environment.SetEnvironmentVariable("SHARPI_PREFAULT", null); // auto; set inside try so finally always restores it
+            // 1 PiB claimed > 80% of any real machine's RAM → skipped before reading.
+            var regions = new List<(nint, long)> { ((nint)a, 1L << 50) };
+            var result = MmapPrefault.Run("test", regions, MmapPrefault.RamGate.FitsInRam);
+
+            Assert.False(result.Ran);
+            Assert.Contains("exceeds", result.Reason);
+        }
+        finally
+        {
+            NativeMemory.Free(a);
+            Environment.SetEnvironmentVariable("SHARPI_PREFAULT", prev);
+        }
+    }
+}

From 761c917be586e0ffb9946c1d4b80e35b0bd9cbfe Mon Sep 17 00:00:00 2001
From: Pekka Heikura <pekkah@gmail.com>
Date: Mon, 15 Jun 2026 15:41:39 +0300
Subject: [PATCH 2/2] review: address code-review findings on PR #257

- Migrate ForwardPass + HybridGdnForwardPass off their hand-rolled per-page
  sweeps onto MmapPrefault.Run(..., RamGate.Always). Makes RamGate.Always a
  live (not test-only) code path, unifies the SHARPI_PREFAULT kill switch +
  logging + OS read-ahead across all five passes, and removes the now-unused
  MmapPrefault.IsDisabled() (+ its test). Gating behaviour is unchanged: Always
  still sweeps the whole model unless SHARPI_PREFAULT=0.
- Resolve GPU-trunk routed experts tolerantly (FindTensor, skip if absent)
  instead of ResolveCpuWeight, which threw -- a pre-fault must never make an
  otherwise-loadable model fail to load.
- Cover the CUDA GPU-SLRU MoE path symmetrically with Vulkan: when experts
  stream through the SLRU (not CPU-MoE), fault ffn_{gate,up,down}_exps.weight
  for blk.0 .. blk.(nGpu-1) so the first request's cache fills don't
  demand-page off disk.

Smoke-tested: pure-CPU ForwardPass logs "Pre-faulted 0.74 GiB" and decodes
coherently. 12 unit tests green; full Release build clean.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../CudaHybridForwardPass.cs                  | 30 ++++++++-
 src/SharpInference.Engine/ForwardPass.cs      | 55 +++++++---------
 .../HybridForwardPass.cs                      | 13 +++-
 .../HybridGdnForwardPass.cs                   | 63 +++++++++----------
 src/SharpInference.Engine/MmapPrefault.cs     |  6 --
 .../MmapPrefaultTests.cs                      | 23 -------
 6 files changed, 93 insertions(+), 97 deletions(-)

diff --git a/src/SharpInference.Engine/CudaHybridForwardPass.cs b/src/SharpInference.Engine/CudaHybridForwardPass.cs
index 14479ba..8b3ccca 100644
--- a/src/SharpInference.Engine/CudaHybridForwardPass.cs
+++ b/src/SharpInference.Engine/CudaHybridForwardPass.cs
@@ -2765,6 +2765,13 @@ void Add(CpuWeightRef[]? arr)
             if (arr is null) return;
             foreach (var w in arr) Add1(w);
         }
+        // Resolve a tensor by name straight from the mmap, tolerating absence — a
+        // pre-fault must never make an otherwise-loadable model fail to load.
+        void AddByName(string name)
+        {
+            if (_model.FindTensor(name) is { } info)
+                regions.Add(((nint)_model.GetTensorDataPtr(info), info.ByteSize));
+        }
 
         // Embedding/output mmap refs are read at inference only when they're NOT
         // uploaded to VRAM (_gpu* null == cpuEmbeddingOutputOnly). Skipping the
@@ -2789,8 +2796,27 @@ void Add(CpuWeightRef[]? arr)
             Add(_cpuWGate); Add(_cpuWUp); Add(_cpuWDown);
         }
 
-        // CPU-MoE routed experts for the GPU-trunk layers (the -g -1 cold-start cost).
-        Add(_cpuMoeGateInp); Add(_cpuMoeGateExps); Add(_cpuMoeUpExps); Add(_cpuMoeDownExps);
+        if (_isMoE && _nGpuLayers > 0)
+        {
+            if (_cpuMoe)
+            {
+                // CPU-MoE: GPU-trunk routed experts run on the CPU from these cached
+                // mmap refs every token (the -g -1 cold-start cost).
+                Add(_cpuMoeGateInp); Add(_cpuMoeGateExps); Add(_cpuMoeUpExps); Add(_cpuMoeDownExps);
+            }
+            else
+            {
+                // GPU-SLRU MoE: the routed experts aren't cached on the host, but the
+                // SLRU streams each one from the mmap on first use. Fault them so the
+                // first request's cache fills don't stall (mirrors the Vulkan path).
+                for (int li = 0; li < _nGpuLayers; li++)
+                {
+                    AddByName($"blk.{li}.ffn_gate_exps.weight");
+                    AddByName($"blk.{li}.ffn_up_exps.weight");
+                    AddByName($"blk.{li}.ffn_down_exps.weight");
+                }
+            }
+        }
 
         // Gemma 4 PLE: the GiB-scale per-layer token-embedding table + per-layer projections.
         if (_pleTokenEmbed is { } ple) Add1(ple);
diff --git a/src/SharpInference.Engine/ForwardPass.cs b/src/SharpInference.Engine/ForwardPass.cs
index c5d099f..ed0e303 100644
--- a/src/SharpInference.Engine/ForwardPass.cs
+++ b/src/SharpInference.Engine/ForwardPass.cs
@@ -468,8 +468,7 @@ static long F32Bytes(in TensorRef t) =>
         _dequantCacheCovers = _dequantCacheEnabled && fullF32Bytes > 0
             && _dequantCacheBudgetBytes >= fullF32Bytes;
 
-        // SHARPI_PREFAULT=0 is the global kill switch (issue #221).
-        if (!MmapPrefault.IsDisabled()) PrefaultWeights();
+        PrefaultWeights();
     }
 
     /// <summary>
@@ -508,54 +507,48 @@ public static long MbToBudgetBytes(long mb) =>
         : mb * 1024 * 1024;
 
     /// <summary>
-    /// Touch every 4KB page of all weight tensors to force OS page-in,
-    /// eliminating soft page faults during inference.
+    /// Pre-fault every weight page so the first request doesn't stall on demand paging
+    /// (issue #221). This is the fully-CPU pass — the whole model is mmap-resident, the
+    /// user chose to run it from RAM, so <see cref="MmapPrefault.RamGate.Always"/> skips
+    /// the RAM-fit heuristic (subject only to the <c>SHARPI_PREFAULT=0</c> kill switch).
     /// </summary>
     private void PrefaultWeights()
     {
-        var tensors = new List<TensorRef> { _embTensor, _outputNorm, _outputWeight };
+        var regions = new List<(nint, long)>();
+        void Add(TensorRef t)
+        {
+            if (t.DataPtr != null) regions.Add(((nint)t.DataPtr, t.Info.ByteSize));
+        }
+
+        Add(_embTensor); Add(_outputNorm); Add(_outputWeight);
         int L = _hp.NumLayers;
         for (int i = 0; i < L; i++)
         {
             bool kvShared = _layerKvSrc is not null && _layerKvSrc[i] >= 0;
-            tensors.Add(_attnNorm[i]);
-            tensors.Add(_wq[i]); tensors.Add(_wo[i]);
-            // k_eq_v global layers have no attn_v (_wv[i] is default/unset).
-            if (!kvShared) { tensors.Add(_wk[i]); if (_wv[i].DataPtr is not null) tensors.Add(_wv[i]); }
-            tensors.Add(_ffnNorm[i]);
-            if (_postAttnNorm is not null) tensors.Add(_postAttnNorm[i]);
-            if (_postFfwNorm is not null) tensors.Add(_postFfwNorm[i]);
+            Add(_attnNorm[i]);
+            Add(_wq[i]); Add(_wo[i]);
+            // k_eq_v global layers have no attn_v (_wv[i] is default/unset; Add skips null).
+            if (!kvShared) { Add(_wk[i]); Add(_wv[i]); }
+            Add(_ffnNorm[i]);
+            if (_postAttnNorm is not null) Add(_postAttnNorm[i]);
+            if (_postFfwNorm is not null) Add(_postFfwNorm[i]);
 
             if (_hp.IsMoE)
             {
-                tensors.Add(_wGateInp![i]);
-                tensors.Add(_wGateExps![i]); tensors.Add(_wUpExps![i]); tensors.Add(_wDownExps![i]);
+                Add(_wGateInp![i]);
+                Add(_wGateExps![i]); Add(_wUpExps![i]); Add(_wDownExps![i]);
                 if (_hp.HasSharedExpert)
                 {
-                    tensors.Add(_wGateShexp![i]); tensors.Add(_wUpShexp![i]); tensors.Add(_wDownShexp![i]);
+                    Add(_wGateShexp![i]); Add(_wUpShexp![i]); Add(_wDownShexp![i]);
                 }
             }
             else
             {
-                tensors.Add(_wGate[i]); tensors.Add(_wUp[i]); tensors.Add(_wDown[i]);
+                Add(_wGate[i]); Add(_wUp[i]); Add(_wDown[i]);
             }
         }
 
-        long touchSum = 0;
-        Parallel.ForEach(tensors, tensor =>
-        {
-            long size = tensor.Info.ByteSize;
-            byte* ptr = tensor.DataPtr;
-            long localSum = 0;
-            for (long off = 0; off < size; off += 4096)
-                localSum += ptr[off];
-            if (size > 0)
-                localSum += ptr[size - 1];
-            Interlocked.Add(ref touchSum, localSum);
-        });
-
-        // Prevent dead-code elimination
-        if (touchSum == long.MinValue) Console.Write(touchSum);
+        MmapPrefault.Run("ForwardPass", regions, MmapPrefault.RamGate.Always);
     }
 
     public PagedKvCache Cache => _kvCache;
diff --git a/src/SharpInference.Engine/HybridForwardPass.cs b/src/SharpInference.Engine/HybridForwardPass.cs
index 7ec3712..c1d5e4a 100644
--- a/src/SharpInference.Engine/HybridForwardPass.cs
+++ b/src/SharpInference.Engine/HybridForwardPass.cs
@@ -1147,6 +1147,13 @@ void Add(CpuWeightRef[]? arr)
             if (arr is null) return;
             foreach (var w in arr) Add1(w);
         }
+        // Resolve a tensor by name straight from the mmap, tolerating absence — a
+        // pre-fault must never make an otherwise-loadable model fail to load.
+        void AddByName(string name)
+        {
+            if (_model.FindTensor(name) is { } info)
+                regions.Add(((nint)_model.GetTensorDataPtr(info), info.ByteSize));
+        }
 
         // Embedding/output mmap refs are read at inference only when they're NOT
         // uploaded to VRAM (_gpu* null == cpuEmbeddingOutputOnly). Skipping the
@@ -1171,9 +1178,9 @@ void Add(CpuWeightRef[]? arr)
             // don't stall (mirrors the CUDA class's _cpuMoe* coverage).
             for (int li = 0; li < _nGpuLayers; li++)
             {
-                Add1(ResolveCpuWeight($"blk.{li}.ffn_gate_exps.weight"));
-                Add1(ResolveCpuWeight($"blk.{li}.ffn_up_exps.weight"));
-                Add1(ResolveCpuWeight($"blk.{li}.ffn_down_exps.weight"));
+                AddByName($"blk.{li}.ffn_gate_exps.weight");
+                AddByName($"blk.{li}.ffn_up_exps.weight");
+                AddByName($"blk.{li}.ffn_down_exps.weight");
             }
         }
         else
diff --git a/src/SharpInference.Engine/HybridGdnForwardPass.cs b/src/SharpInference.Engine/HybridGdnForwardPass.cs
index 0532fd8..6af6890 100644
--- a/src/SharpInference.Engine/HybridGdnForwardPass.cs
+++ b/src/SharpInference.Engine/HybridGdnForwardPass.cs
@@ -673,8 +673,7 @@ public HybridGdnForwardPass(GgufModel model, IComputeBackend backend, ModelHyper
             }
         }
 
-        // SHARPI_PREFAULT=0 is the global kill switch (issue #221).
-        if (!MmapPrefault.IsDisabled()) PrefaultWeights();
+        PrefaultWeights();
     }
 
     // ============================================================
@@ -2837,65 +2836,65 @@ private TensorRef ResolveTensor(string name)
         return new TensorRef(name, info, info.DType, _model.GetTensorDataPtr(info));
     }
 
+    /// <summary>
+    /// Pre-fault every weight page so the first request doesn't stall on demand paging
+    /// (issue #221). The whole model is mmap-resident on this CPU/Vulkan GDN pass, so
+    /// <see cref="MmapPrefault.RamGate.Always"/> skips the RAM-fit heuristic (subject only
+    /// to the <c>SHARPI_PREFAULT=0</c> kill switch).
+    /// </summary>
     private void PrefaultWeights()
     {
-        var tensors = new List<TensorRef> { _embTensor, _outputNorm, _outputWeight };
+        var regions = new List<(nint, long)>();
+        void Add(TensorRef t)
+        {
+            if (t.DataPtr != null) regions.Add(((nint)t.DataPtr, t.Info.ByteSize));
+        }
+
+        Add(_embTensor); Add(_outputNorm); Add(_outputWeight);
         int L = _hp.NumLayers;
         for (int i = 0; i < L; i++)
         {
-            tensors.Add(_attnNorm[i]);
-            tensors.Add(_postAttnNorm[i]);
+            Add(_attnNorm[i]);
+            Add(_postAttnNorm[i]);
             if (_hp.IsMoE)
             {
-                tensors.Add(_wGateInp[i]);
-                tensors.Add(_wGateShexp[i]); tensors.Add(_wUpShexp[i]); tensors.Add(_wDownShexp[i]);
-                tensors.Add(_wGateExps[i]); tensors.Add(_wUpExps[i]); tensors.Add(_wDownExps[i]);
+                Add(_wGateInp[i]);
+                Add(_wGateShexp[i]); Add(_wUpShexp[i]); Add(_wDownShexp[i]);
+                Add(_wGateExps[i]); Add(_wUpExps[i]); Add(_wDownExps[i]);
             }
             else
             {
-                tensors.Add(_wFfnGate[i]); tensors.Add(_wFfnUp[i]); tensors.Add(_wFfnDown[i]);
+                Add(_wFfnGate[i]); Add(_wFfnUp[i]); Add(_wFfnDown[i]);
             }
             if (_hp.LayerTypes![i] == LayerType.Attention)
             {
-                tensors.Add(_wQGate[i]); tensors.Add(_wK[i]); tensors.Add(_wV[i]); tensors.Add(_wO[i]);
+                Add(_wQGate[i]); Add(_wK[i]); Add(_wV[i]); Add(_wO[i]);
             }
             else
             {
-                tensors.Add(_wQkv[i]); tensors.Add(_wZGate[i]); tensors.Add(_ssmOut[i]);
-                tensors.Add(_ssmAlpha[i]); tensors.Add(_ssmBeta[i]);
+                Add(_wQkv[i]); Add(_wZGate[i]); Add(_ssmOut[i]);
+                Add(_ssmAlpha[i]); Add(_ssmBeta[i]);
             }
         }
 
         if (_hasMtp)
         {
-            tensors.Add(_mtpAttnNorm);
-            tensors.Add(_mtpWQGate); tensors.Add(_mtpWK); tensors.Add(_mtpWV); tensors.Add(_mtpWO);
-            tensors.Add(_mtpPostAttnNorm);
+            Add(_mtpAttnNorm);
+            Add(_mtpWQGate); Add(_mtpWK); Add(_mtpWV); Add(_mtpWO);
+            Add(_mtpPostAttnNorm);
             if (_mtpIsMoE)
             {
-                tensors.Add(_mtpWGateInp);
-                tensors.Add(_mtpWGateShexp); tensors.Add(_mtpWUpShexp); tensors.Add(_mtpWDownShexp);
-                tensors.Add(_mtpWGateExps); tensors.Add(_mtpWUpExps); tensors.Add(_mtpWDownExps);
+                Add(_mtpWGateInp);
+                Add(_mtpWGateShexp); Add(_mtpWUpShexp); Add(_mtpWDownShexp);
+                Add(_mtpWGateExps); Add(_mtpWUpExps); Add(_mtpWDownExps);
             }
             else
             {
-                tensors.Add(_mtpFfnGate); tensors.Add(_mtpFfnUp); tensors.Add(_mtpFfnDown);
+                Add(_mtpFfnGate); Add(_mtpFfnUp); Add(_mtpFfnDown);
             }
         }
 
-        long touchSum = 0;
-        Parallel.ForEach(tensors, tensor =>
-        {
-            long size = tensor.Info.ByteSize;
-            byte* ptr = tensor.DataPtr;
-            long localSum = 0;
-            for (long off = 0; off < size; off += 4096)
-                localSum += ptr[off];
-            if (size > 0)
-                localSum += ptr[size - 1];
-            Interlocked.Add(ref touchSum, localSum);
-        });
-        if (touchSum == long.MinValue) Console.Write(touchSum);
+        MmapPrefault.Run("HybridGdnForwardPass", regions, MmapPrefault.RamGate.Always);
     }
 
     // ============================================================
diff --git a/src/SharpInference.Engine/MmapPrefault.cs b/src/SharpInference.Engine/MmapPrefault.cs
index f333a44..d3dcc37 100644
--- a/src/SharpInference.Engine/MmapPrefault.cs
+++ b/src/SharpInference.Engine/MmapPrefault.cs
@@ -41,12 +41,6 @@ internal enum RamGate
 
     internal readonly record struct Result(bool Ran, long Bytes, double Seconds, string Reason);
 
-    /// <summary>True when <c>SHARPI_PREFAULT=0</c> disables all prefaulting. Lets the
-    /// fully-CPU-resident passes share the one global kill switch without going through
-    /// <see cref="Run"/>.</summary>
-    internal static bool IsDisabled() =>
-        Environment.GetEnvironmentVariable("SHARPI_PREFAULT") == "0";
-
     /// <summary>
     /// Pure gating decision, factored out so it can be unit-tested without touching
     /// memory or the environment. <paramref name="mode"/> is the raw
diff --git a/tests/SharpInference.Tests.ForwardPass/MmapPrefaultTests.cs b/tests/SharpInference.Tests.ForwardPass/MmapPrefaultTests.cs
index 93c290b..09b3456 100644
--- a/tests/SharpInference.Tests.ForwardPass/MmapPrefaultTests.cs
+++ b/tests/SharpInference.Tests.ForwardPass/MmapPrefaultTests.cs
@@ -74,29 +74,6 @@ public void ShouldRun_UnknownRam_DoesNotSkip()
         Assert.True(MmapPrefault.ShouldRun(null, 100 * Gib, 0, MmapPrefault.RamGate.FitsInRam, out _));
     }
 
-    // ── IsDisabled / env kill switch ────────────────────────────────────────
-
-    [Fact]
-    public void IsDisabled_ReflectsEnv()
-    {
-        var prev = Environment.GetEnvironmentVariable("SHARPI_PREFAULT");
-        try
-        {
-            Environment.SetEnvironmentVariable("SHARPI_PREFAULT", "0");
-            Assert.True(MmapPrefault.IsDisabled());
-
-            Environment.SetEnvironmentVariable("SHARPI_PREFAULT", "1");
-            Assert.False(MmapPrefault.IsDisabled());
-
-            Environment.SetEnvironmentVariable("SHARPI_PREFAULT", null);
-            Assert.False(MmapPrefault.IsDisabled());
-        }
-        finally
-        {
-            Environment.SetEnvironmentVariable("SHARPI_PREFAULT", prev);
-        }
-    }
-
     // ── Integration: real sweep over small buffers ──────────────────────────
 
     [Fact]