pekkah · pekkah · Jun 15, 2026 · Jun 15, 2026 · Jun 15, 2026
diff --git a/src/SharpInference.Engine/CudaHybridForwardPass.cs b/src/SharpInference.Engine/CudaHybridForwardPass.cs
@@ -930,48 +930,12 @@ void TraceVram(string label)
             _gpuRopeFreqs = UploadWeight("rope_freqs.weight");
         }
 
-        // Pre-fault mmap pages for CPU layers: touch the first byte of each weight tensor
-        // to ensure OS pages them into RAM before the first forward pass.
-        if (_nCpuLayers > 0)
-        {
-            Console.Error.Write($"[HybridForwardPass] Pre-faulting CPU weight pages...");
-            long touchSum = 0;
-            IEnumerable<CpuWeightRef> weightsToTouch = _cpuWq.Concat(_cpuWk).Concat(_cpuWv).Concat(_cpuWo);
-            if (_isMoE)
-            {
-                weightsToTouch = weightsToTouch
-                    .Concat(_cpuWGateInp!)
-                    .Concat(_cpuWGateExps!)
-                    .Concat(_cpuWUpExps!)
-                    .Concat(_cpuWDownExps!);
-                if (_hasSharedExpert)
-                {
-                    weightsToTouch = weightsToTouch
-                        .Concat(_cpuWGateShexp!)
-                        .Concat(_cpuWUpShexp!)
-                        .Concat(_cpuWDownShexp!);
-                }
-            }
-            else
-            {
-                weightsToTouch = weightsToTouch
-                    .Concat(_cpuWGate)
-                    .Concat(_cpuWUp)
-                    .Concat(_cpuWDown);
-            }
-
-            foreach (var wRef in weightsToTouch)
-            {
-                // Skip un-resolved slots — KV-share layers on Gemma 4 leave attn_k /
-                // attn_v unresolved by design (the source layer's projections are
-                // reused via the alias dispatch).
-                if (wRef.DataPtr == null) continue;
-                touchSum += wRef.DataPtr[0];
-                long size = wRef.Info.ByteSize;
-                if (size > 64) touchSum += wRef.DataPtr[size - 1];
-            }
-            Console.Error.WriteLine($" done. (touch={touchSum})");
-        }
+        // Pre-fault every CPU-resident mmap weight page so the first request doesn't
+        // stall on demand paging (issue #221). NOT gated on _nCpuLayers > 0: the
+        // CPU-MoE routed experts and the Gemma 4 PLE table are CPU-resident even when
+        // every transformer layer is GPU-offloaded (-g -1), and those are the dominant
+        // cold-start cost. MmapPrefault filters empty configs and honours SHARPI_PREFAULT.
+        MmapPrefault.Run("CudaHybridForwardPass", BuildCpuPrefaultRegions());
 
         if (_tqEnabled && _nCpuLayers > 0)
         {
@@ -2783,6 +2747,85 @@ private CpuWeightRef ResolveCpuWeight(string name)
         return new CpuWeightRef(name, info, info.DType, _model.GetTensorDataPtr(info));
     }
 
+    /// <summary>Collect every CPU-resident mmap weight region for the issue #221
+    /// pre-fault sweep. Covers the embedding/output tensors when they are not uploaded
+    /// to VRAM, per-CPU-layer weights, the GPU-trunk routed experts (cached CPU-MoE refs,
+    /// or looked up by name on the GPU-SLRU path), and the Gemma 4 PLE table. Unresolved
+    /// slots (null <c>DataPtr</c>) — e.g. Gemma 4 KV-share / k==v layers — are skipped.
+    /// Biases and QK-norms are excluded: they're dequantized into separate buffers.</summary>
+    private List<(nint Ptr, long Bytes)> BuildCpuPrefaultRegions()
+    {
+        var regions = new List<(nint, long)>();
+        void Add1(CpuWeightRef w)
+        {
+            if (w.DataPtr != null) regions.Add(((nint)w.DataPtr, w.Info.ByteSize));
+        }
+        void Add(CpuWeightRef[]? arr)
+        {
+            if (arr is null) return;
+            foreach (var w in arr) Add1(w);
+        }
+        // Resolve a tensor by name straight from the mmap, tolerating absence — a
+        // pre-fault must never make an otherwise-loadable model fail to load.
+        void AddByName(string name)
+        {
+            if (_model.FindTensor(name) is { } info)
+                regions.Add(((nint)_model.GetTensorDataPtr(info), info.ByteSize));
+        }
+
+        // Embedding/output mmap refs are read at inference only when they're NOT uploaded
+        // to VRAM (_gpu* null); skipping a GPU-resident GiB-scale token_embd avoids a pointless read.
+        if (_gpuEmbedding is null) Add1(_cpuEmbedding);
+        if (_gpuOutputWeight is null)
+        {
+            Add1(_cpuOutputNorm);
+            // Tied lm_head aliases token_embd: drop the duplicate only if token_embd was queued above.
+            if (_gpuEmbedding is not null || _cpuOutputWeight.DataPtr != _cpuEmbedding.DataPtr) Add1(_cpuOutputWeight);
+        }
+
+        Add(_cpuAttnNorm); Add(_cpuWq); Add(_cpuWk); Add(_cpuWv); Add(_cpuWo);
+        Add(_cpuFfnNorm); Add(_cpuPostAttnNorm); Add(_cpuPostFfwNorm);
+
+        if (_isMoE)
+        {
+            Add(_cpuWGateInp); Add(_cpuWGateExps); Add(_cpuWUpExps); Add(_cpuWDownExps);
+            if (_hasSharedExpert) { Add(_cpuWGateShexp); Add(_cpuWUpShexp); Add(_cpuWDownShexp); }
+        }
+        else
+        {
+            Add(_cpuWGate); Add(_cpuWUp); Add(_cpuWDown);
+        }
+
+        if (_isMoE && _nGpuLayers > 0)
+        {
+            if (_cpuMoe)
+            {
+                // CPU-MoE: GPU-trunk routed experts run on the CPU from these cached
+                // mmap refs every token (the -g -1 cold-start cost).
+                Add(_cpuMoeGateInp); Add(_cpuMoeGateExps); Add(_cpuMoeUpExps); Add(_cpuMoeDownExps);
+            }
+            else
+            {
+                // GPU-SLRU MoE: the routed experts aren't cached on the host, but the
+                // SLRU streams each one from the mmap on first use. Fault them so the
+                // first request's cache fills don't stall (mirrors the Vulkan path).
+                for (int li = 0; li < _nGpuLayers; li++)
+                {
+                    AddByName($"blk.{li}.ffn_gate_exps.weight");
+                    AddByName($"blk.{li}.ffn_up_exps.weight");
+                    AddByName($"blk.{li}.ffn_down_exps.weight");
+                }
+            }
+        }
+
+        // Gemma 4 PLE: the GiB-scale per-layer token-embedding table + per-layer projections.
+        if (_pleTokenEmbed is { } ple) Add1(ple);
+        if (_perLayerProjNorm is { } pln) Add1(pln);
+        Add(_cpuInpGate); Add(_cpuPleProj); Add(_cpuPlePostNorm);
+
+        return regions;
+    }
+
     private float* LoadCpuBias(string name, int count)
     {
         var info = _model.FindTensor(name)

diff --git a/src/SharpInference.Engine/CudaHybridGdnForwardPass.cs b/src/SharpInference.Engine/CudaHybridGdnForwardPass.cs
@@ -1414,6 +1414,13 @@ void TraceVram(string label)
             _logitsBuf2 = Array.Empty<float>();
             _cpuNormBuf2 = _cpuMoeHidden2 = _lastHiddenT1 = null;
         }
+
+        // Pre-fault CPU-resident mmap weight pages (issue #221). On the CPU-MoE config
+        // (the auto-selected winner on 12 GB) the routed experts / dense FFN weights are
+        // paged in lazily; without this the first request faults them all on the critical
+        // path, ~5× slower than warm. MmapPrefault honours SHARPI_PREFAULT and the
+        // RAM-fit heuristic, and no-ops when nothing is CPU-resident (full-GPU GDN).
+        MmapPrefault.Run("CudaHybridGdnForwardPass", BuildCpuPrefaultRegions());
     }
 
     // =================================================================
@@ -5231,6 +5238,40 @@ private CpuWeightRef ResolveCpuWeight(string name)
         return new CpuWeightRef(name, info, info.DType, _model.GetTensorDataPtr(info));
     }
 
+    /// <summary>Gathers the mmap-backed weight regions that remain CPU-resident, for the
+    /// issue #221 pre-fault sweep: the trunk's CPU-MoE routed experts (or dense FFN
+    /// weights), the SHARPI_CPU_GDN debug GDN weights, and the MoE-MTP head experts.
+    /// A null array means it was never allocated for this config; a slot whose
+    /// <c>DataPtr</c> is null (e.g. the GDN arrays outside CPU-GDN mode) is unpopulated.
+    /// Both are skipped. Anything dequantized via LoadF32Tensor/LoadConv1d lives in
+    /// separate buffers rather than the mmap, so it is not collected.</summary>
+    private List<(nint Ptr, long Bytes)> BuildCpuPrefaultRegions()
+    {
+        var found = new List<(nint, long)>();
+        void AddRef(CpuWeightRef weight)
+        {
+            if (weight.DataPtr == null) return; // unpopulated slot
+            found.Add(((nint)weight.DataPtr, weight.Info.ByteSize));
+        }
+        void AddAll(CpuWeightRef[]? refs)
+        {
+            foreach (var weight in refs ?? Array.Empty<CpuWeightRef>()) AddRef(weight);
+        }
+
+        // Trunk: routed experts under CPU-MoE, or the dense FFN weights (Qwen3.6-27B-MTP).
+        AddAll(_cpuFfnGateInp); AddAll(_cpuFfnGateExps); AddAll(_cpuFfnUpExps); AddAll(_cpuFfnDownExps);
+        AddAll(_cpuWFfnGate); AddAll(_cpuWFfnUp); AddAll(_cpuWFfnDown);
+
+        // Debug path (SHARPI_CPU_GDN=1): these arrays always exist but are only filled then.
+        AddAll(_cpuWQkv); AddAll(_cpuWZGate); AddAll(_cpuSsmOut); AddAll(_cpuSsmAlpha); AddAll(_cpuSsmBeta);
+
+        // Routed experts of the MoE-MTP head (a single extra layer; DataPtr is null if absent).
+        AddRef(_cpuMtpFfnGateInp); AddRef(_cpuMtpFfnGateExps);
+        AddRef(_cpuMtpFfnUpExps); AddRef(_cpuMtpFfnDownExps);
+
+        return found;
+    }
+
     private Tensor UploadWeight(string name)
     {
         var info = _model.FindTensor(name)

diff --git a/src/SharpInference.Engine/ForwardPass.cs b/src/SharpInference.Engine/ForwardPass.cs
@@ -507,54 +507,48 @@ public static long MbToBudgetBytes(long mb) =>
         : mb * 1024 * 1024;
 
     /// <summary>
-    /// Touch every 4KB page of all weight tensors to force OS page-in,
-    /// eliminating soft page faults during inference.
+    /// Pre-fault every weight page so the first request doesn't stall on demand paging
+    /// (issue #221). This is the fully-CPU pass — the whole model is mmap-resident and the
+    /// user chose to run it from RAM — so <see cref="MmapPrefault.RamGate.Always"/> skips
+    /// the RAM-fit heuristic (subject only to the <c>SHARPI_PREFAULT=0</c> kill switch).
     /// </summary>
     private void PrefaultWeights()
     {
-        var tensors = new List<TensorRef> { _embTensor, _outputNorm, _outputWeight };
+        var regions = new List<(nint, long)>();
+        void Add(TensorRef t) // queues (pointer, byte size) for one tensor; a null DataPtr is skipped
+        {
+            if (t.DataPtr != null) regions.Add(((nint)t.DataPtr, t.Info.ByteSize));
+        }
+
+        Add(_embTensor); Add(_outputNorm); Add(_outputWeight);
         int L = _hp.NumLayers;
         for (int i = 0; i < L; i++)
         {
             bool kvShared = _layerKvSrc is not null && _layerKvSrc[i] >= 0;
-            tensors.Add(_attnNorm[i]);
-            tensors.Add(_wq[i]); tensors.Add(_wo[i]);
-            // k_eq_v global layers have no attn_v (_wv[i] is default/unset).
-            if (!kvShared) { tensors.Add(_wk[i]); if (_wv[i].DataPtr is not null) tensors.Add(_wv[i]); }
-            tensors.Add(_ffnNorm[i]);
-            if (_postAttnNorm is not null) tensors.Add(_postAttnNorm[i]);
-            if (_postFfwNorm is not null) tensors.Add(_postFfwNorm[i]);
+            Add(_attnNorm[i]);
+            Add(_wq[i]); Add(_wo[i]);
+            // KV-shared layers queue no K/V here; k_eq_v global layers have no attn_v (_wv[i] is default/unset; Add skips null).
+            if (!kvShared) { Add(_wk[i]); Add(_wv[i]); }
+            Add(_ffnNorm[i]);
+            if (_postAttnNorm is not null) Add(_postAttnNorm[i]);
+            if (_postFfwNorm is not null) Add(_postFfwNorm[i]);
 
             if (_hp.IsMoE)
             {
-                tensors.Add(_wGateInp![i]);
-                tensors.Add(_wGateExps![i]); tensors.Add(_wUpExps![i]); tensors.Add(_wDownExps![i]);
+                Add(_wGateInp![i]);
+                Add(_wGateExps![i]); Add(_wUpExps![i]); Add(_wDownExps![i]);
                 if (_hp.HasSharedExpert)
                 {
-                    tensors.Add(_wGateShexp![i]); tensors.Add(_wUpShexp![i]); tensors.Add(_wDownShexp![i]);
+                    Add(_wGateShexp![i]); Add(_wUpShexp![i]); Add(_wDownShexp![i]);
                 }
             }
             else
             {
-                tensors.Add(_wGate[i]); tensors.Add(_wUp[i]); tensors.Add(_wDown[i]);
+                Add(_wGate[i]); Add(_wUp[i]); Add(_wDown[i]);
             }
         }
 
-        long touchSum = 0;
-        Parallel.ForEach(tensors, tensor =>
-        {
-            long size = tensor.Info.ByteSize;
-            byte* ptr = tensor.DataPtr;
-            long localSum = 0;
-            for (long off = 0; off < size; off += 4096)
-                localSum += ptr[off];
-            if (size > 0)
-                localSum += ptr[size - 1];
-            Interlocked.Add(ref touchSum, localSum);
-        });
-
-        // Prevent dead-code elimination
-        if (touchSum == long.MinValue) Console.Write(touchSum);
+        MmapPrefault.Run("ForwardPass", regions, MmapPrefault.RamGate.Always);
     }
 
     public PagedKvCache Cache => _kvCache;

diff --git a/src/SharpInference.Engine/HybridForwardPass.cs b/src/SharpInference.Engine/HybridForwardPass.cs
@@ -461,45 +461,11 @@ public HybridForwardPass(GgufModel model, VulkanBackend gpu, ModelHyperparams hp
 
         _cpuKvCache = new KvCache(_nCpuLayers, _maxSeqLen, _numKvHeads, _headDim);
 
-        // Pre-fault mmap pages for CPU layers: touch the first byte of each weight tensor
-        // to ensure OS pages them into RAM before the first forward pass.
-        if (_nCpuLayers > 0)
-        {
-            Console.Error.Write($"[HybridForwardPass] Pre-faulting CPU weight pages...");
-            long touchSum = 0;
-            IEnumerable<CpuWeightRef> weightsToTouch = _cpuWq.Concat(_cpuWk).Concat(_cpuWv).Concat(_cpuWo);
-            if (_isMoE)
-            {
-                weightsToTouch = weightsToTouch
-                    .Concat(_cpuWGateInp!)
-                    .Concat(_cpuWGateExps!)
-                    .Concat(_cpuWUpExps!)
-                    .Concat(_cpuWDownExps!);
-                if (_hasSharedExpert)
-                {
-                    weightsToTouch = weightsToTouch
-                        .Concat(_cpuWGateShexp!)
-                        .Concat(_cpuWUpShexp!)
-                        .Concat(_cpuWDownShexp!);
-                }
-            }
-            else
-            {
-                weightsToTouch = weightsToTouch
-                    .Concat(_cpuWGate)
-                    .Concat(_cpuWUp)
-                    .Concat(_cpuWDown);
-            }
-
-            foreach (var wRef in weightsToTouch)
-            {
-                // Touch first and last cache line of each weight tensor
-                touchSum += wRef.DataPtr[0];
-                long size = wRef.Info.ByteSize;
-                if (size > 64) touchSum += wRef.DataPtr[size - 1];
-            }
-            Console.Error.WriteLine($" done. (touch={touchSum})");
-        }
+        // Pre-fault every CPU-resident mmap weight page so the first request doesn't
+        // stall on demand paging (issue #221). The CPU embedding/output tensors are
+        // resolved even on a pure-GPU split (CPU embed lookup / lm_head), so the sweep
+        // is not gated on _nCpuLayers > 0. MmapPrefault honours SHARPI_PREFAULT.
+        MmapPrefault.Run("HybridForwardPass", BuildCpuPrefaultRegions());
 
         if (_tqEnabled && _nCpuLayers > 0)
         {
@@ -1165,6 +1131,66 @@ private CpuWeightRef ResolveCpuWeight(string name)
         return new CpuWeightRef(name, info, info.DType, _model.GetTensorDataPtr(info));
     }
 
+    /// <summary>Collect every CPU-resident mmap weight region for the issue #221
+    /// pre-fault sweep: the CPU embedding/output tensors (when not uploaded to VRAM), the
+    /// per-CPU-layer attention/FFN weights (dense or MoE + shared experts) and, on MoE, the
+    /// GPU-layer routed experts. Biases/QK-norms are excluded: they're dequantized into separate buffers.</summary>
+    private List<(nint Ptr, long Bytes)> BuildCpuPrefaultRegions()
+    {
+        var regions = new List<(nint, long)>();
+        void Add1(CpuWeightRef w)
+        {
+            if (w.DataPtr != null) regions.Add(((nint)w.DataPtr, w.Info.ByteSize));
+        }
+        void Add(CpuWeightRef[]? arr)
+        {
+            if (arr is null) return;
+            foreach (var w in arr) Add1(w);
+        }
+        // Resolve a tensor by name straight from the mmap, tolerating absence — a
+        // pre-fault must never make an otherwise-loadable model fail to load.
+        void AddByName(string name)
+        {
+            if (_model.FindTensor(name) is { } info)
+                regions.Add(((nint)_model.GetTensorDataPtr(info), info.ByteSize));
+        }
+
+        // Embedding/output mmap refs are read at inference only when they're NOT uploaded
+        // to VRAM (_gpu* null); skipping a GPU-resident GiB-scale token_embd avoids a pointless read.
+        if (_gpuEmbedding is null) Add1(_cpuEmbedding);
+        if (_gpuOutputWeight is null)
+        {
+            Add1(_cpuOutputNorm);
+            // Tied lm_head aliases token_embd: drop the duplicate only if token_embd was queued above.
+            if (_gpuEmbedding is not null || _cpuOutputWeight.DataPtr != _cpuEmbedding.DataPtr) Add1(_cpuOutputWeight);
+        }
+
+        Add(_cpuAttnNorm); Add(_cpuWq); Add(_cpuWk); Add(_cpuWv); Add(_cpuWo); Add(_cpuFfnNorm);
+
+        if (_isMoE)
+        {
+            Add(_cpuWGateInp); Add(_cpuWGateExps); Add(_cpuWUpExps); Add(_cpuWDownExps);
+            if (_hasSharedExpert) { Add(_cpuWGateShexp); Add(_cpuWUpShexp); Add(_cpuWDownShexp); }
+
+            // GPU-trunk routed experts live in the GPU SLRU cache, but every cache miss
+            // spills to GpuMoeFfnCpuFallback, which reads blk.{0..nGpu-1}.ffn_*_exps
+            // straight from the mmap on the CPU. Fault those too so first-token misses
+            // don't stall (mirrors the CUDA class's _cpuMoe* coverage).
+            for (int li = 0; li < _nGpuLayers; li++)
+            {
+                AddByName($"blk.{li}.ffn_gate_exps.weight");
+                AddByName($"blk.{li}.ffn_up_exps.weight");
+                AddByName($"blk.{li}.ffn_down_exps.weight");
+            }
+        }
+        else
+        {
+            Add(_cpuWGate); Add(_cpuWUp); Add(_cpuWDown);
+        }
+
+        return regions;
+    }
+
     private float* LoadCpuBias(string name, int count)
     {
         var info = _model.FindTensor(name)