From 0d3a7a71d337aad51a09c6b0d992750e0ddb79b7 Mon Sep 17 00:00:00 2001 From: Pekka Heikura Date: Mon, 15 Jun 2026 15:22:57 +0300 Subject: [PATCH 1/2] perf(engine): real per-page mmap pre-fault for CPU-resident weights (#221) The hybrid "pre-fault" only touched 2 pages per tensor (DataPtr[0] + DataPtr[size-1]), and CudaHybridGdnForwardPass had none -- so the first request on CPU-MoE configs faulted every expert page on the critical path, ~5x slower than warm (the #210 bench protocol's "run each cell twice" workaround exists for exactly this). New shared MmapPrefault helper: a pure, testable gate (SHARPI_PREFAULT 0=off / 1=force, plus an 80%-of-RAM fit heuristic), best-effort OS read-ahead (PrefetchVirtualMemory on Windows, posix_madvise(WILLNEED) on Linux, both LibraryImport), then a parallel per-page stride read (32 MiB chunks) that guarantees residency and reports GiB/s. Wired into the three hybrid classes via BuildCpuPrefaultRegions(), now covering the full CPU-resident set the old gate missed -- the CPU-MoE routed experts and Gemma 4 PLE table, which are mmap-resident even at -g -1 (_nCpuLayers == 0). Embedding/output are skipped when GPU-resident. The pre-existing full-page prefaults in ForwardPass and HybridGdnForwardPass now honour the same SHARPI_PREFAULT=0 kill switch. Verified on a 4070 Ti: CudaHybridForwardPass warms 16.39 GiB of Coder-30B CPU-MoE experts; CudaHybridGdnForwardPass (no prefault before) warms 14.66 GiB of Carnice experts (0.5 GiB/s cold read = ~29s of fault I/O moved off the request path). Decode unchanged (prefault is read-only). 13 unit tests in MmapPrefaultTests.
Co-Authored-By: Claude Opus 4.8 --- .../CudaHybridForwardPass.cs | 101 +++++---- .../CudaHybridGdnForwardPass.cs | 41 ++++ src/SharpInference.Engine/ForwardPass.cs | 3 +- .../HybridForwardPass.cs | 97 +++++---- .../HybridGdnForwardPass.cs | 3 +- src/SharpInference.Engine/MmapPrefault.cs | 204 ++++++++++++++++++ .../MmapPrefaultTests.cs | 202 +++++++++++++++++ 7 files changed, 568 insertions(+), 83 deletions(-) create mode 100644 src/SharpInference.Engine/MmapPrefault.cs create mode 100644 tests/SharpInference.Tests.ForwardPass/MmapPrefaultTests.cs diff --git a/src/SharpInference.Engine/CudaHybridForwardPass.cs b/src/SharpInference.Engine/CudaHybridForwardPass.cs index fcea42b..14479ba 100644 --- a/src/SharpInference.Engine/CudaHybridForwardPass.cs +++ b/src/SharpInference.Engine/CudaHybridForwardPass.cs @@ -930,48 +930,12 @@ void TraceVram(string label) _gpuRopeFreqs = UploadWeight("rope_freqs.weight"); } - // Pre-fault mmap pages for CPU layers: touch the first byte of each weight tensor - // to ensure OS pages them into RAM before the first forward pass. - if (_nCpuLayers > 0) - { - Console.Error.Write($"[HybridForwardPass] Pre-faulting CPU weight pages..."); - long touchSum = 0; - IEnumerable weightsToTouch = _cpuWq.Concat(_cpuWk).Concat(_cpuWv).Concat(_cpuWo); - if (_isMoE) - { - weightsToTouch = weightsToTouch - .Concat(_cpuWGateInp!) - .Concat(_cpuWGateExps!) - .Concat(_cpuWUpExps!) - .Concat(_cpuWDownExps!); - if (_hasSharedExpert) - { - weightsToTouch = weightsToTouch - .Concat(_cpuWGateShexp!) - .Concat(_cpuWUpShexp!) - .Concat(_cpuWDownShexp!); - } - } - else - { - weightsToTouch = weightsToTouch - .Concat(_cpuWGate) - .Concat(_cpuWUp) - .Concat(_cpuWDown); - } - - foreach (var wRef in weightsToTouch) - { - // Skip un-resolved slots — KV-share layers on Gemma 4 leave attn_k / - // attn_v unresolved by design (the source layer's projections are - // reused via the alias dispatch). 
- if (wRef.DataPtr == null) continue; - touchSum += wRef.DataPtr[0]; - long size = wRef.Info.ByteSize; - if (size > 64) touchSum += wRef.DataPtr[size - 1]; - } - Console.Error.WriteLine($" done. (touch={touchSum})"); - } + // Pre-fault every CPU-resident mmap weight page so the first request doesn't + // stall on demand paging (issue #221). NOT gated on _nCpuLayers > 0: the + // CPU-MoE routed experts and the Gemma 4 PLE table are CPU-resident even when + // every transformer layer is GPU-offloaded (-g -1), and those are the dominant + // cold-start cost. MmapPrefault filters empty configs and honours SHARPI_PREFAULT. + MmapPrefault.Run("CudaHybridForwardPass", BuildCpuPrefaultRegions()); if (_tqEnabled && _nCpuLayers > 0) { @@ -2783,6 +2747,59 @@ private CpuWeightRef ResolveCpuWeight(string name) return new CpuWeightRef(name, info, info.DType, _model.GetTensorDataPtr(info)); } + /// Collect every CPU-resident mmap weight region for the issue #221 + /// pre-fault sweep. Covers per-CPU-layer weights, the CPU-MoE routed experts (the + /// big cold-start cost, present even with all layers GPU-offloaded), and the + /// Gemma 4 PLE table. Unresolved slots (null DataPtr) — e.g. Gemma 4 + /// KV-share / k==v layers — are skipped. Biases and QK-norms are excluded: they're + /// dequantized into separate buffers, not read from the mmap at inference time. + private List<(nint Ptr, long Bytes)> BuildCpuPrefaultRegions() + { + var regions = new List<(nint, long)>(); + void Add1(CpuWeightRef w) + { + if (w.DataPtr != null) regions.Add(((nint)w.DataPtr, w.Info.ByteSize)); + } + void Add(CpuWeightRef[]? arr) + { + if (arr is null) return; + foreach (var w in arr) Add1(w); + } + + // Embedding/output mmap refs are read at inference only when they're NOT + // uploaded to VRAM (_gpu* null == cpuEmbeddingOutputOnly). Skipping the + // GiB-scale token_embd when it lives on the GPU avoids a large pointless read. 
+ if (_gpuEmbedding is null) Add1(_cpuEmbedding); + if (_gpuOutputWeight is null) + { + Add1(_cpuOutputNorm); + if (_cpuOutputWeight.DataPtr != _cpuEmbedding.DataPtr) Add1(_cpuOutputWeight); // tied weights alias + } + + Add(_cpuAttnNorm); Add(_cpuWq); Add(_cpuWk); Add(_cpuWv); Add(_cpuWo); + Add(_cpuFfnNorm); Add(_cpuPostAttnNorm); Add(_cpuPostFfwNorm); + + if (_isMoE) + { + Add(_cpuWGateInp); Add(_cpuWGateExps); Add(_cpuWUpExps); Add(_cpuWDownExps); + if (_hasSharedExpert) { Add(_cpuWGateShexp); Add(_cpuWUpShexp); Add(_cpuWDownShexp); } + } + else + { + Add(_cpuWGate); Add(_cpuWUp); Add(_cpuWDown); + } + + // CPU-MoE routed experts for the GPU-trunk layers (the -g -1 cold-start cost). + Add(_cpuMoeGateInp); Add(_cpuMoeGateExps); Add(_cpuMoeUpExps); Add(_cpuMoeDownExps); + + // Gemma 4 PLE: the GiB-scale per-layer token-embedding table + per-layer projections. + if (_pleTokenEmbed is { } ple) Add1(ple); + if (_perLayerProjNorm is { } pln) Add1(pln); + Add(_cpuInpGate); Add(_cpuPleProj); Add(_cpuPlePostNorm); + + return regions; + } + private float* LoadCpuBias(string name, int count) { var info = _model.FindTensor(name) diff --git a/src/SharpInference.Engine/CudaHybridGdnForwardPass.cs b/src/SharpInference.Engine/CudaHybridGdnForwardPass.cs index c5e4794..e97d939 100644 --- a/src/SharpInference.Engine/CudaHybridGdnForwardPass.cs +++ b/src/SharpInference.Engine/CudaHybridGdnForwardPass.cs @@ -1414,6 +1414,13 @@ void TraceVram(string label) _logitsBuf2 = Array.Empty(); _cpuNormBuf2 = _cpuMoeHidden2 = _lastHiddenT1 = null; } + + // Pre-fault CPU-resident mmap weight pages (issue #221). On the CPU-MoE config + // (the auto-selected winner on 12 GB) the routed experts / dense FFN weights are + // paged in lazily; without this the first request faults them all on the critical + // path, ~5× slower than warm. MmapPrefault honours SHARPI_PREFAULT and the + // RAM-fit heuristic, and no-ops when nothing is CPU-resident (full-GPU GDN). 
+ MmapPrefault.Run("CudaHybridGdnForwardPass", BuildCpuPrefaultRegions()); } // ================================================================= @@ -5231,6 +5238,40 @@ private CpuWeightRef ResolveCpuWeight(string name) return new CpuWeightRef(name, info, info.DType, _model.GetTensorDataPtr(info)); } + /// Collect every CPU-resident mmap weight region for the issue #221 + /// pre-fault sweep: the CPU-MoE routed experts (or dense FFN weights), the + /// SHARPI_CPU_GDN debug GDN weights, and the MoE-MTP head experts. Arrays that + /// aren't allocated for this config are null; unpopulated slots (e.g. the GDN + /// arrays when not in CPU-GDN mode) have a null DataPtr and are skipped. + /// Everything dequantized via LoadF32Tensor/LoadConv1d lives in separate buffers, + /// not the mmap, and is excluded. + private List<(nint Ptr, long Bytes)> BuildCpuPrefaultRegions() + { + var regions = new List<(nint, long)>(); + void Add1(CpuWeightRef w) + { + if (w.DataPtr != null) regions.Add(((nint)w.DataPtr, w.Info.ByteSize)); + } + void Add(CpuWeightRef[]? arr) + { + if (arr is null) return; + foreach (var w in arr) Add1(w); + } + + // Trunk: CPU-MoE routed experts, or dense FFN weights (Qwen3.6-27B-MTP). + Add(_cpuFfnGateInp); Add(_cpuFfnGateExps); Add(_cpuFfnUpExps); Add(_cpuFfnDownExps); + Add(_cpuWFfnGate); Add(_cpuWFfnUp); Add(_cpuWFfnDown); + + // SHARPI_CPU_GDN=1 debug path (arrays always allocated, populated only then). + Add(_cpuWQkv); Add(_cpuWZGate); Add(_cpuSsmOut); Add(_cpuSsmAlpha); Add(_cpuSsmBeta); + + // MoE-MTP head routed experts (one extra layer; null DataPtr when absent). 
+ Add1(_cpuMtpFfnGateInp); Add1(_cpuMtpFfnGateExps); + Add1(_cpuMtpFfnUpExps); Add1(_cpuMtpFfnDownExps); + + return regions; + } + private Tensor UploadWeight(string name) { var info = _model.FindTensor(name) diff --git a/src/SharpInference.Engine/ForwardPass.cs b/src/SharpInference.Engine/ForwardPass.cs index 3735318..c5d099f 100644 --- a/src/SharpInference.Engine/ForwardPass.cs +++ b/src/SharpInference.Engine/ForwardPass.cs @@ -468,7 +468,8 @@ static long F32Bytes(in TensorRef t) => _dequantCacheCovers = _dequantCacheEnabled && fullF32Bytes > 0 && _dequantCacheBudgetBytes >= fullF32Bytes; - PrefaultWeights(); + // SHARPI_PREFAULT=0 is the global kill switch (issue #221). + if (!MmapPrefault.IsDisabled()) PrefaultWeights(); } /// diff --git a/src/SharpInference.Engine/HybridForwardPass.cs b/src/SharpInference.Engine/HybridForwardPass.cs index d0f8c0e..7ec3712 100644 --- a/src/SharpInference.Engine/HybridForwardPass.cs +++ b/src/SharpInference.Engine/HybridForwardPass.cs @@ -461,45 +461,11 @@ public HybridForwardPass(GgufModel model, VulkanBackend gpu, ModelHyperparams hp _cpuKvCache = new KvCache(_nCpuLayers, _maxSeqLen, _numKvHeads, _headDim); - // Pre-fault mmap pages for CPU layers: touch the first byte of each weight tensor - // to ensure OS pages them into RAM before the first forward pass. - if (_nCpuLayers > 0) - { - Console.Error.Write($"[HybridForwardPass] Pre-faulting CPU weight pages..."); - long touchSum = 0; - IEnumerable weightsToTouch = _cpuWq.Concat(_cpuWk).Concat(_cpuWv).Concat(_cpuWo); - if (_isMoE) - { - weightsToTouch = weightsToTouch - .Concat(_cpuWGateInp!) - .Concat(_cpuWGateExps!) - .Concat(_cpuWUpExps!) - .Concat(_cpuWDownExps!); - if (_hasSharedExpert) - { - weightsToTouch = weightsToTouch - .Concat(_cpuWGateShexp!) - .Concat(_cpuWUpShexp!) 
- .Concat(_cpuWDownShexp!); - } - } - else - { - weightsToTouch = weightsToTouch - .Concat(_cpuWGate) - .Concat(_cpuWUp) - .Concat(_cpuWDown); - } - - foreach (var wRef in weightsToTouch) - { - // Touch first and last cache line of each weight tensor - touchSum += wRef.DataPtr[0]; - long size = wRef.Info.ByteSize; - if (size > 64) touchSum += wRef.DataPtr[size - 1]; - } - Console.Error.WriteLine($" done. (touch={touchSum})"); - } + // Pre-fault every CPU-resident mmap weight page so the first request doesn't + // stall on demand paging (issue #221). The CPU embedding/output tensors are + // resolved even on a pure-GPU split (CPU embed lookup / lm_head), so the sweep + // is not gated on _nCpuLayers > 0. MmapPrefault honours SHARPI_PREFAULT. + MmapPrefault.Run("HybridForwardPass", BuildCpuPrefaultRegions()); if (_tqEnabled && _nCpuLayers > 0) { @@ -1165,6 +1131,59 @@ private CpuWeightRef ResolveCpuWeight(string name) return new CpuWeightRef(name, info, info.DType, _model.GetTensorDataPtr(info)); } + /// Collect every CPU-resident mmap weight region for the issue #221 + /// pre-fault sweep: the CPU embedding/output tensors plus the per-CPU-layer + /// attention/FFN weights (dense or MoE + shared experts). Biases and QK-norms are + /// excluded — they're dequantized into separate buffers, not read from the mmap. + private List<(nint Ptr, long Bytes)> BuildCpuPrefaultRegions() + { + var regions = new List<(nint, long)>(); + void Add1(CpuWeightRef w) + { + if (w.DataPtr != null) regions.Add(((nint)w.DataPtr, w.Info.ByteSize)); + } + void Add(CpuWeightRef[]? arr) + { + if (arr is null) return; + foreach (var w in arr) Add1(w); + } + + // Embedding/output mmap refs are read at inference only when they're NOT + // uploaded to VRAM (_gpu* null == cpuEmbeddingOutputOnly). Skipping the + // GiB-scale token_embd when it lives on the GPU avoids a large pointless read. 
+ if (_gpuEmbedding is null) Add1(_cpuEmbedding); + if (_gpuOutputWeight is null) + { + Add1(_cpuOutputNorm); + if (_cpuOutputWeight.DataPtr != _cpuEmbedding.DataPtr) Add1(_cpuOutputWeight); // tied weights alias + } + + Add(_cpuAttnNorm); Add(_cpuWq); Add(_cpuWk); Add(_cpuWv); Add(_cpuWo); Add(_cpuFfnNorm); + + if (_isMoE) + { + Add(_cpuWGateInp); Add(_cpuWGateExps); Add(_cpuWUpExps); Add(_cpuWDownExps); + if (_hasSharedExpert) { Add(_cpuWGateShexp); Add(_cpuWUpShexp); Add(_cpuWDownShexp); } + + // GPU-trunk routed experts live in the GPU SLRU cache, but every cache miss + // spills to GpuMoeFfnCpuFallback, which reads blk.{0..nGpu-1}.ffn_*_exps + // straight from the mmap on the CPU. Fault those too so first-token misses + // don't stall (mirrors the CUDA class's _cpuMoe* coverage). + for (int li = 0; li < _nGpuLayers; li++) + { + Add1(ResolveCpuWeight($"blk.{li}.ffn_gate_exps.weight")); + Add1(ResolveCpuWeight($"blk.{li}.ffn_up_exps.weight")); + Add1(ResolveCpuWeight($"blk.{li}.ffn_down_exps.weight")); + } + } + else + { + Add(_cpuWGate); Add(_cpuWUp); Add(_cpuWDown); + } + + return regions; + } + private float* LoadCpuBias(string name, int count) { var info = _model.FindTensor(name) diff --git a/src/SharpInference.Engine/HybridGdnForwardPass.cs b/src/SharpInference.Engine/HybridGdnForwardPass.cs index e7bdaba..0532fd8 100644 --- a/src/SharpInference.Engine/HybridGdnForwardPass.cs +++ b/src/SharpInference.Engine/HybridGdnForwardPass.cs @@ -673,7 +673,8 @@ public HybridGdnForwardPass(GgufModel model, IComputeBackend backend, ModelHyper } } - PrefaultWeights(); + // SHARPI_PREFAULT=0 is the global kill switch (issue #221). 
+ if (!MmapPrefault.IsDisabled()) PrefaultWeights(); } // ============================================================ diff --git a/src/SharpInference.Engine/MmapPrefault.cs b/src/SharpInference.Engine/MmapPrefault.cs new file mode 100644 index 0000000..f333a44 --- /dev/null +++ b/src/SharpInference.Engine/MmapPrefault.cs @@ -0,0 +1,204 @@ +using System.Diagnostics; +using System.Runtime.InteropServices; + +namespace SharpInference.Engine; + +/// +/// Forces memory-mapped, CPU-resident weight pages resident before the first +/// forward pass, so the first request does not stall on demand paging (issue #221). +/// +/// The mmap'd GGUF is paged in lazily by the OS: without a warm-up, the first +/// request faults every weight page one at a time on the critical path, running ~5× +/// slower than warm steady-state on CPU-MoE configs (the same penalty #210's bench +/// protocol works around by running every cell twice). A parallel sequential sweep +/// pulls the same bytes in at memory/SSD bandwidth — a 15-20 GiB expert region warms +/// in seconds at NVMe speeds. +/// +/// Two-step warm-up: (1) a best-effort OS read-ahead hint +/// (PrefetchVirtualMemory on Windows, posix_madvise(WILLNEED) on Linux), +/// which coalesces the I/O; (2) a parallel per-page stride read that guarantees +/// residency and gives the wall-clock measurement. Correctness never depends on the +/// hint succeeding — the stride read is the source of truth. +/// +/// Tunable via SHARPI_PREFAULT: 0 disables all prefaulting, +/// 1 forces it on (bypassing the RAM-fit heuristic). Unset = auto. +/// +internal static unsafe partial class MmapPrefault +{ + /// RAM-fit policy applied in auto mode (env unset). + internal enum RamGate + { + /// Skip when the mapped set exceeds ~80% of available RAM — it would + /// just thrash. Used by the GPU-offload hybrid passes, whose CPU-resident + /// weights are a subset of the model that should comfortably fit. 
+ FitsInRam, + + /// Always sweep (subject only to the SHARPI_PREFAULT=0 kill + /// switch). Used by the fully-CPU-resident passes, where the user has already + /// chosen to run the whole model from RAM and prefaulting is the point. + Always, + } + + internal readonly record struct Result(bool Ran, long Bytes, double Seconds, string Reason); + + /// True when SHARPI_PREFAULT=0 disables all prefaulting. Lets the + /// fully-CPU-resident passes share the one global kill switch without going through + /// . + internal static bool IsDisabled() => + Environment.GetEnvironmentVariable("SHARPI_PREFAULT") == "0"; + + /// + /// Pure gating decision, factored out so it can be unit-tested without touching + /// memory or the environment. is the raw + /// SHARPI_PREFAULT value (null = unset). + /// + internal static bool ShouldRun(string? mode, long totalBytes, long availRamBytes, + RamGate gate, out string reason) + { + if (totalBytes <= 0) { reason = "no mapped weights"; return false; } + if (mode == "0") { reason = "disabled (SHARPI_PREFAULT=0)"; return false; } + if (mode == "1") { reason = "forced (SHARPI_PREFAULT=1)"; return true; } + if (gate == RamGate.FitsInRam && availRamBytes > 0 && totalBytes > availRamBytes / 10 * 8) + { + reason = $"skipped: {totalBytes >> 20} MiB mapped exceeds 80% of " + + $"{availRamBytes >> 20} MiB RAM (set SHARPI_PREFAULT=1 to force)"; + return false; + } + reason = "auto"; + return true; + } + + /// + /// Pre-fault the given CPU-resident mmap regions. Null/zero-size regions are + /// ignored, so callers can pass their whole weight set and let the helper filter. + /// Logs the warmed size and rate (or the skip reason) to . + /// + internal static Result Run(string label, List<(nint Ptr, long Bytes)> regions, + RamGate gate = RamGate.FitsInRam) + { + long total = 0; + foreach (var (ptr, bytes) in regions) + if (ptr != 0 && bytes > 0) total += bytes; + + string? 
mode = Environment.GetEnvironmentVariable("SHARPI_PREFAULT"); + long avail = GC.GetGCMemoryInfo().TotalAvailableMemoryBytes; + if (!ShouldRun(mode, total, avail, gate, out string reason)) + { + // Only announce a deliberate skip — staying silent on "nothing to do". + if (total > 0) Console.Error.WriteLine($"[{label}] Pre-fault {reason}."); + return new Result(false, total, 0, reason); + } + + long t0 = Stopwatch.GetTimestamp(); + TryAdvise(regions); // best-effort OS read-ahead hint + long touchSum = StrideRead(regions); // guaranteed residency + measurement + double secs = (Stopwatch.GetTimestamp() - t0) / (double)Stopwatch.Frequency; + + double gib = total / (1024.0 * 1024 * 1024); + double rate = secs > 1e-3 ? gib / secs : 0; + Console.Error.WriteLine( + $"[{label}] Pre-faulted {gib:F2} GiB of CPU-resident weights in {secs:F1}s ({rate:F1} GiB/s)."); + // Defeat dead-code elimination of the reads (touchSum is otherwise unused). + if (touchSum == long.MinValue) Console.Error.Write(touchSum); + return new Result(true, total, secs, reason); + } + + /// Parallel per-page stride read. Large regions are split into fixed-size + /// chunks so a handful of multi-GiB expert tensors still spread across all cores + /// rather than bottlenecking one thread per tensor. 
+ private static long StrideRead(List<(nint Ptr, long Bytes)> regions) + { + int pageSize = Environment.SystemPageSize; + const long chunk = 32L * 1024 * 1024; + var jobs = new List<(nint Ptr, long Start, long End)>(); + foreach (var (ptr, bytes) in regions) + { + if (ptr == 0 || bytes <= 0) continue; + for (long s = 0; s < bytes; s += chunk) + jobs.Add((ptr, s, Math.Min(s + chunk, bytes))); + } + + long touchSum = 0; + Parallel.ForEach(jobs, job => + { + byte* p = (byte*)job.Ptr; + long localSum = 0; + for (long off = job.Start; off < job.End; off += pageSize) + localSum += p[off]; + localSum += p[job.End - 1]; // tail page of this chunk + Interlocked.Add(ref touchSum, localSum); + }); + return touchSum; + } + + /// Best-effort OS read-ahead hint. Wrapped so a P/Invoke failure (missing + /// symbol, unsupported platform) never derails the guaranteed stride read. + private static void TryAdvise(List<(nint Ptr, long Bytes)> regions) + { + try + { + if (OperatingSystem.IsWindows()) AdviseWindows(regions); + else if (OperatingSystem.IsLinux()) AdviseLinux(regions); + } + catch + { + // Ignored: the stride read below still forces residency. + } + } + + // ── Windows: PrefetchVirtualMemory ────────────────────────────────────── + private static void AdviseWindows(List<(nint Ptr, long Bytes)> regions) + { + int n = 0; + foreach (var (ptr, bytes) in regions) + if (ptr != 0 && bytes > 0) n++; + if (n == 0) return; + + var entries = new Win32MemoryRangeEntry[n]; + int j = 0; + foreach (var (ptr, bytes) in regions) + { + if (ptr == 0 || bytes <= 0) continue; + entries[j].VirtualAddress = ptr; + entries[j].NumberOfBytes = (nuint)bytes; + j++; + } + // Return value is intentionally ignored: this is an asynchronous, best-effort + // hint; the stride read is the residency guarantee. 
+ fixed (Win32MemoryRangeEntry* e = entries) + _ = PrefetchVirtualMemory(GetCurrentProcess(), (nuint)n, e, 0); + } + + [StructLayout(LayoutKind.Sequential)] + private struct Win32MemoryRangeEntry + { + public nint VirtualAddress; + public nuint NumberOfBytes; + } + + [LibraryImport("kernel32.dll", EntryPoint = "PrefetchVirtualMemory")] + private static partial int PrefetchVirtualMemory(nint hProcess, nuint numberOfEntries, + Win32MemoryRangeEntry* virtualAddresses, uint flags); + + [LibraryImport("kernel32.dll", EntryPoint = "GetCurrentProcess")] + private static partial nint GetCurrentProcess(); + + // ── Linux: posix_madvise(WILLNEED) ────────────────────────────────────── + private static void AdviseLinux(List<(nint Ptr, long Bytes)> regions) + { + const int posixMadvWillNeed = 3; + long pageSize = Environment.SystemPageSize; + foreach (var (ptr, bytes) in regions) + { + if (ptr == 0 || bytes <= 0) continue; + // posix_madvise wants a page-aligned address; round down and extend length. + long addr = ptr; + long aligned = addr & ~(pageSize - 1); + long len = bytes + (addr - aligned); + _ = posix_madvise((nint)aligned, (nuint)len, posixMadvWillNeed); + } + } + + [LibraryImport("libc", EntryPoint = "posix_madvise")] + private static partial int posix_madvise(nint addr, nuint length, int advice); +} diff --git a/tests/SharpInference.Tests.ForwardPass/MmapPrefaultTests.cs b/tests/SharpInference.Tests.ForwardPass/MmapPrefaultTests.cs new file mode 100644 index 0000000..93c290b --- /dev/null +++ b/tests/SharpInference.Tests.ForwardPass/MmapPrefaultTests.cs @@ -0,0 +1,202 @@ +using System.Runtime.InteropServices; +using SharpInference.Engine; + +namespace SharpInference.Tests.ForwardPass; + +/// +/// Unit tests for the issue #221 mmap pre-fault helper. 
The gating decision is tested +/// as a pure function (no memory, no environment); a handful of integration tests +/// exercise the real sweep over small buffers and the +/// SHARPI_PREFAULT kill switch / RAM-fit skip (which must bail out before touching the +/// claimed bytes — verified by claiming far more than is actually allocated). +/// +public sealed class MmapPrefaultTests +{ + private const long Gib = 1L << 30; + + // ── Pure gating decision ──────────────────────────────────────────────── + + [Fact] + public void ShouldRun_NoBytes_IsFalse() + { + Assert.False(MmapPrefault.ShouldRun(null, 0, 16 * Gib, MmapPrefault.RamGate.FitsInRam, out var reason)); + Assert.Contains("no mapped weights", reason); + } + + [Fact] + public void ShouldRun_ModeZero_IsDisabled() + { + Assert.False(MmapPrefault.ShouldRun("0", 4 * Gib, 64 * Gib, MmapPrefault.RamGate.FitsInRam, out var reason)); + Assert.Contains("disabled", reason); + } + + [Fact] + public void ShouldRun_ModeOne_ForcesEvenWhenOverRam() + { + // Force bypasses the RAM-fit heuristic entirely. + Assert.True(MmapPrefault.ShouldRun("1", 100 * Gib, 8 * Gib, MmapPrefault.RamGate.FitsInRam, out var reason)); + Assert.Contains("forced", reason); + } + + [Fact] + public void ShouldRun_Auto_FitsInRam_Runs() + { + Assert.True(MmapPrefault.ShouldRun(null, 4 * Gib, 16 * Gib, MmapPrefault.RamGate.FitsInRam, out _)); + } + + [Fact] + public void ShouldRun_Auto_ExceedsEightyPercent_Skips() + { + // 14 GiB mapped > 80% of 16 GiB (= 12.8 GiB) → skip rather than thrash. + Assert.False(MmapPrefault.ShouldRun(null, 14 * Gib, 16 * Gib, MmapPrefault.RamGate.FitsInRam, out var reason)); + Assert.Contains("exceeds", reason); + } + + [Fact] + public void ShouldRun_Auto_ExactlyEightyPercent_Runs() + { + // Boundary: the gate uses a strict '>' so exactly 80% still runs. 
+ long avail = 16 * Gib; + Assert.True(MmapPrefault.ShouldRun(null, avail / 10 * 8, avail, MmapPrefault.RamGate.FitsInRam, out _)); + } + + [Fact] + public void ShouldRun_AlwaysGate_IgnoresRamHeuristic() + { + // The fully-CPU-resident passes prefault regardless of the 80% threshold. + Assert.True(MmapPrefault.ShouldRun(null, 100 * Gib, 16 * Gib, MmapPrefault.RamGate.Always, out _)); + } + + [Fact] + public void ShouldRun_UnknownRam_DoesNotSkip() + { + // availRamBytes <= 0 means "couldn't measure" — don't skip on a guess. + Assert.True(MmapPrefault.ShouldRun(null, 100 * Gib, 0, MmapPrefault.RamGate.FitsInRam, out _)); + } + + // ── IsDisabled / env kill switch ──────────────────────────────────────── + + [Fact] + public void IsDisabled_ReflectsEnv() + { + var prev = Environment.GetEnvironmentVariable("SHARPI_PREFAULT"); + try + { + Environment.SetEnvironmentVariable("SHARPI_PREFAULT", "0"); + Assert.True(MmapPrefault.IsDisabled()); + + Environment.SetEnvironmentVariable("SHARPI_PREFAULT", "1"); + Assert.False(MmapPrefault.IsDisabled()); + + Environment.SetEnvironmentVariable("SHARPI_PREFAULT", null); + Assert.False(MmapPrefault.IsDisabled()); + } + finally + { + Environment.SetEnvironmentVariable("SHARPI_PREFAULT", prev); + } + } + + // ── Integration: real sweep over small buffers ────────────────────────── + + [Fact] + public unsafe void Run_SmallBuffers_FaultsAndReportsBytes() + { + var prev = Environment.GetEnvironmentVariable("SHARPI_PREFAULT"); + Environment.SetEnvironmentVariable("SHARPI_PREFAULT", null); // auto + const long sizeA = 1 << 20; // 1 MiB (spans many pages + a chunk boundary downstream) + const long sizeB = 64 << 10; // 64 KiB + void* a = NativeMemory.Alloc((nuint)sizeA); + void* b = NativeMemory.Alloc((nuint)sizeB); + try + { + new Span(a, (int)sizeA).Fill(1); + new Span(b, (int)sizeB).Fill(2); + + var regions = new List<(nint, long)> { ((nint)a, sizeA), ((nint)b, sizeB) }; + var result = MmapPrefault.Run("test", regions, 
MmapPrefault.RamGate.Always); + + Assert.True(result.Ran); + Assert.Equal(sizeA + sizeB, result.Bytes); + } + finally + { + NativeMemory.Free(a); + NativeMemory.Free(b); + Environment.SetEnvironmentVariable("SHARPI_PREFAULT", prev); + } + } + + [Fact] + public unsafe void Run_NullAndZeroRegions_AreSkipped() + { + var prev = Environment.GetEnvironmentVariable("SHARPI_PREFAULT"); + Environment.SetEnvironmentVariable("SHARPI_PREFAULT", null); + const long size = 4096; + void* a = NativeMemory.Alloc((nuint)size); + try + { + new Span(a, (int)size).Clear(); + var regions = new List<(nint, long)> + { + (0, size), // null ptr → ignored + ((nint)a, 0), // zero bytes → ignored + ((nint)a, size), // the only real region + }; + var result = MmapPrefault.Run("test", regions, MmapPrefault.RamGate.Always); + + Assert.True(result.Ran); + Assert.Equal(size, result.Bytes); // only the valid region counts + } + finally + { + NativeMemory.Free(a); + Environment.SetEnvironmentVariable("SHARPI_PREFAULT", prev); + } + } + + [Fact] + public unsafe void Run_Disabled_SkipsWithoutTouching() + { + var prev = Environment.GetEnvironmentVariable("SHARPI_PREFAULT"); + Environment.SetEnvironmentVariable("SHARPI_PREFAULT", "0"); + // Tiny real allocation, but the region claims 1 TiB: if Run tried to stride-read + // it the process would fault. The kill switch must bail out before any access. 
+ void* a = NativeMemory.Alloc(16); + try + { + var regions = new List<(nint, long)> { ((nint)a, 1L << 40) }; + var result = MmapPrefault.Run("test", regions, MmapPrefault.RamGate.Always); + + Assert.False(result.Ran); + Assert.Contains("disabled", result.Reason); + } + finally + { + NativeMemory.Free(a); + Environment.SetEnvironmentVariable("SHARPI_PREFAULT", prev); + } + } + + [Fact] + public unsafe void Run_AutoExceedsRam_SkipsWithoutTouching() + { + var prev = Environment.GetEnvironmentVariable("SHARPI_PREFAULT"); + Environment.SetEnvironmentVariable("SHARPI_PREFAULT", null); // auto + void* a = NativeMemory.Alloc(16); + try + { + // 1 PiB claimed > 80% of any real machine's RAM → skipped before reading. + var regions = new List<(nint, long)> { ((nint)a, 1L << 50) }; + var result = MmapPrefault.Run("test", regions, MmapPrefault.RamGate.FitsInRam); + + Assert.False(result.Ran); + Assert.Contains("exceeds", result.Reason); + } + finally + { + NativeMemory.Free(a); + Environment.SetEnvironmentVariable("SHARPI_PREFAULT", prev); + } + } +} From 761c917be586e0ffb9946c1d4b80e35b0bd9cbfe Mon Sep 17 00:00:00 2001 From: Pekka Heikura Date: Mon, 15 Jun 2026 15:41:39 +0300 Subject: [PATCH 2/2] review: address code-review findings on PR #257 - Migrate ForwardPass + HybridGdnForwardPass off their hand-rolled per-page sweeps onto MmapPrefault.Run(..., RamGate.Always). Makes RamGate.Always a live (not test-only) code path, unifies the SHARPI_PREFAULT kill switch + logging + OS read-ahead across all five passes, and removes the now-unused MmapPrefault.IsDisabled() (+ its test). Behaviour is unchanged: Always still sweeps the whole model unless SHARPI_PREFAULT=0. - Resolve GPU-trunk routed experts tolerantly (FindTensor, skip if absent) instead of ResolveCpuWeight, which threw -- a pre-fault must never make an otherwise-loadable model fail to load. 
- Cover the CUDA GPU-SLRU MoE path symmetrically with Vulkan: when experts stream through the SLRU (not CPU-MoE), fault blk.{0..nGpu}.ffn_*_exps so the first request's cache fills don't demand-page off disk. Smoke-tested: pure-CPU ForwardPass logs "Pre-faulted 0.74 GiB" and decodes coherently. 12 unit tests green; full Release build clean. Co-Authored-By: Claude Opus 4.8 --- .../CudaHybridForwardPass.cs | 30 ++++++++- src/SharpInference.Engine/ForwardPass.cs | 55 +++++++--------- .../HybridForwardPass.cs | 13 +++- .../HybridGdnForwardPass.cs | 63 +++++++++---------- src/SharpInference.Engine/MmapPrefault.cs | 6 -- .../MmapPrefaultTests.cs | 23 ------- 6 files changed, 93 insertions(+), 97 deletions(-) diff --git a/src/SharpInference.Engine/CudaHybridForwardPass.cs b/src/SharpInference.Engine/CudaHybridForwardPass.cs index 14479ba..8b3ccca 100644 --- a/src/SharpInference.Engine/CudaHybridForwardPass.cs +++ b/src/SharpInference.Engine/CudaHybridForwardPass.cs @@ -2765,6 +2765,13 @@ void Add(CpuWeightRef[]? arr) if (arr is null) return; foreach (var w in arr) Add1(w); } + // Resolve a tensor by name straight from the mmap, tolerating absence — a + // pre-fault must never make an otherwise-loadable model fail to load. + void AddByName(string name) + { + if (_model.FindTensor(name) is { } info) + regions.Add(((nint)_model.GetTensorDataPtr(info), info.ByteSize)); + } // Embedding/output mmap refs are read at inference only when they're NOT // uploaded to VRAM (_gpu* null == cpuEmbeddingOutputOnly). Skipping the @@ -2789,8 +2796,27 @@ void Add(CpuWeightRef[]? arr) Add(_cpuWGate); Add(_cpuWUp); Add(_cpuWDown); } - // CPU-MoE routed experts for the GPU-trunk layers (the -g -1 cold-start cost). - Add(_cpuMoeGateInp); Add(_cpuMoeGateExps); Add(_cpuMoeUpExps); Add(_cpuMoeDownExps); + if (_isMoE && _nGpuLayers > 0) + { + if (_cpuMoe) + { + // CPU-MoE: GPU-trunk routed experts run on the CPU from these cached + // mmap refs every token (the -g -1 cold-start cost). 
+ Add(_cpuMoeGateInp); Add(_cpuMoeGateExps); Add(_cpuMoeUpExps); Add(_cpuMoeDownExps); + } + else + { + // GPU-SLRU MoE: the routed experts aren't cached on the host, but the + // SLRU streams each one from the mmap on first use. Fault them so the + // first request's cache fills don't stall (mirrors the Vulkan path). + for (int li = 0; li < _nGpuLayers; li++) + { + AddByName($"blk.{li}.ffn_gate_exps.weight"); + AddByName($"blk.{li}.ffn_up_exps.weight"); + AddByName($"blk.{li}.ffn_down_exps.weight"); + } + } + } // Gemma 4 PLE: the GiB-scale per-layer token-embedding table + per-layer projections. if (_pleTokenEmbed is { } ple) Add1(ple); diff --git a/src/SharpInference.Engine/ForwardPass.cs b/src/SharpInference.Engine/ForwardPass.cs index c5d099f..ed0e303 100644 --- a/src/SharpInference.Engine/ForwardPass.cs +++ b/src/SharpInference.Engine/ForwardPass.cs @@ -468,8 +468,7 @@ static long F32Bytes(in TensorRef t) => _dequantCacheCovers = _dequantCacheEnabled && fullF32Bytes > 0 && _dequantCacheBudgetBytes >= fullF32Bytes; - // SHARPI_PREFAULT=0 is the global kill switch (issue #221). - if (!MmapPrefault.IsDisabled()) PrefaultWeights(); + PrefaultWeights(); } /// @@ -508,54 +507,48 @@ public static long MbToBudgetBytes(long mb) => : mb * 1024 * 1024; /// - /// Touch every 4KB page of all weight tensors to force OS page-in, - /// eliminating soft page faults during inference. + /// Pre-fault every weight page so the first request doesn't stall on demand paging + /// (issue #221). This is the fully-CPU pass — the whole model is mmap-resident, the + /// user chose to run it from RAM, so skips + /// the RAM-fit heuristic (subject only to the SHARPI_PREFAULT=0 kill switch). 
/// private void PrefaultWeights() { - var tensors = new List { _embTensor, _outputNorm, _outputWeight }; + var regions = new List<(nint, long)>(); + void Add(TensorRef t) + { + if (t.DataPtr != null) regions.Add(((nint)t.DataPtr, t.Info.ByteSize)); + } + + Add(_embTensor); Add(_outputNorm); Add(_outputWeight); int L = _hp.NumLayers; for (int i = 0; i < L; i++) { bool kvShared = _layerKvSrc is not null && _layerKvSrc[i] >= 0; - tensors.Add(_attnNorm[i]); - tensors.Add(_wq[i]); tensors.Add(_wo[i]); - // k_eq_v global layers have no attn_v (_wv[i] is default/unset). - if (!kvShared) { tensors.Add(_wk[i]); if (_wv[i].DataPtr is not null) tensors.Add(_wv[i]); } - tensors.Add(_ffnNorm[i]); - if (_postAttnNorm is not null) tensors.Add(_postAttnNorm[i]); - if (_postFfwNorm is not null) tensors.Add(_postFfwNorm[i]); + Add(_attnNorm[i]); + Add(_wq[i]); Add(_wo[i]); + // k_eq_v global layers have no attn_v (_wv[i] is default/unset; Add skips null). + if (!kvShared) { Add(_wk[i]); Add(_wv[i]); } + Add(_ffnNorm[i]); + if (_postAttnNorm is not null) Add(_postAttnNorm[i]); + if (_postFfwNorm is not null) Add(_postFfwNorm[i]); if (_hp.IsMoE) { - tensors.Add(_wGateInp![i]); - tensors.Add(_wGateExps![i]); tensors.Add(_wUpExps![i]); tensors.Add(_wDownExps![i]); + Add(_wGateInp![i]); + Add(_wGateExps![i]); Add(_wUpExps![i]); Add(_wDownExps![i]); if (_hp.HasSharedExpert) { - tensors.Add(_wGateShexp![i]); tensors.Add(_wUpShexp![i]); tensors.Add(_wDownShexp![i]); + Add(_wGateShexp![i]); Add(_wUpShexp![i]); Add(_wDownShexp![i]); } } else { - tensors.Add(_wGate[i]); tensors.Add(_wUp[i]); tensors.Add(_wDown[i]); + Add(_wGate[i]); Add(_wUp[i]); Add(_wDown[i]); } } - long touchSum = 0; - Parallel.ForEach(tensors, tensor => - { - long size = tensor.Info.ByteSize; - byte* ptr = tensor.DataPtr; - long localSum = 0; - for (long off = 0; off < size; off += 4096) - localSum += ptr[off]; - if (size > 0) - localSum += ptr[size - 1]; - Interlocked.Add(ref touchSum, localSum); - }); - - // Prevent 
dead-code elimination - if (touchSum == long.MinValue) Console.Write(touchSum); + MmapPrefault.Run("ForwardPass", regions, MmapPrefault.RamGate.Always); } public PagedKvCache Cache => _kvCache; diff --git a/src/SharpInference.Engine/HybridForwardPass.cs b/src/SharpInference.Engine/HybridForwardPass.cs index 7ec3712..c1d5e4a 100644 --- a/src/SharpInference.Engine/HybridForwardPass.cs +++ b/src/SharpInference.Engine/HybridForwardPass.cs @@ -1147,6 +1147,13 @@ void Add(CpuWeightRef[]? arr) if (arr is null) return; foreach (var w in arr) Add1(w); } + // Resolve a tensor by name straight from the mmap, tolerating absence — a + // pre-fault must never make an otherwise-loadable model fail to load. + void AddByName(string name) + { + if (_model.FindTensor(name) is { } info) + regions.Add(((nint)_model.GetTensorDataPtr(info), info.ByteSize)); + } // Embedding/output mmap refs are read at inference only when they're NOT // uploaded to VRAM (_gpu* null == cpuEmbeddingOutputOnly). Skipping the @@ -1171,9 +1178,9 @@ void Add(CpuWeightRef[]? arr) // don't stall (mirrors the CUDA class's _cpuMoe* coverage). for (int li = 0; li < _nGpuLayers; li++) { - Add1(ResolveCpuWeight($"blk.{li}.ffn_gate_exps.weight")); - Add1(ResolveCpuWeight($"blk.{li}.ffn_up_exps.weight")); - Add1(ResolveCpuWeight($"blk.{li}.ffn_down_exps.weight")); + AddByName($"blk.{li}.ffn_gate_exps.weight"); + AddByName($"blk.{li}.ffn_up_exps.weight"); + AddByName($"blk.{li}.ffn_down_exps.weight"); } } else diff --git a/src/SharpInference.Engine/HybridGdnForwardPass.cs b/src/SharpInference.Engine/HybridGdnForwardPass.cs index 0532fd8..6af6890 100644 --- a/src/SharpInference.Engine/HybridGdnForwardPass.cs +++ b/src/SharpInference.Engine/HybridGdnForwardPass.cs @@ -673,8 +673,7 @@ public HybridGdnForwardPass(GgufModel model, IComputeBackend backend, ModelHyper } } - // SHARPI_PREFAULT=0 is the global kill switch (issue #221). 
- if (!MmapPrefault.IsDisabled()) PrefaultWeights(); + PrefaultWeights(); } // ============================================================ @@ -2837,65 +2836,65 @@ private TensorRef ResolveTensor(string name) return new TensorRef(name, info, info.DType, _model.GetTensorDataPtr(info)); } + /// + /// Pre-fault every weight page so the first request doesn't stall on demand paging + /// (issue #221). The whole model is mmap-resident on this CPU/Vulkan GDN pass, so + /// MmapPrefault.RamGate.Always skips the RAM-fit heuristic (subject only + /// to the SHARPI_PREFAULT=0 kill switch). + /// private void PrefaultWeights() { - var tensors = new List { _embTensor, _outputNorm, _outputWeight }; + var regions = new List<(nint, long)>(); + void Add(TensorRef t) + { + if (t.DataPtr != null) regions.Add(((nint)t.DataPtr, t.Info.ByteSize)); + } + + Add(_embTensor); Add(_outputNorm); Add(_outputWeight); int L = _hp.NumLayers; for (int i = 0; i < L; i++) { - tensors.Add(_attnNorm[i]); - tensors.Add(_postAttnNorm[i]); + Add(_attnNorm[i]); + Add(_postAttnNorm[i]); if (_hp.IsMoE) { - tensors.Add(_wGateInp[i]); - tensors.Add(_wGateShexp[i]); tensors.Add(_wUpShexp[i]); tensors.Add(_wDownShexp[i]); - tensors.Add(_wGateExps[i]); tensors.Add(_wUpExps[i]); tensors.Add(_wDownExps[i]); + Add(_wGateInp[i]); + Add(_wGateShexp[i]); Add(_wUpShexp[i]); Add(_wDownShexp[i]); + Add(_wGateExps[i]); Add(_wUpExps[i]); Add(_wDownExps[i]); } else { - tensors.Add(_wFfnGate[i]); tensors.Add(_wFfnUp[i]); tensors.Add(_wFfnDown[i]); + Add(_wFfnGate[i]); Add(_wFfnUp[i]); Add(_wFfnDown[i]); } if (_hp.LayerTypes![i] == LayerType.Attention) { - tensors.Add(_wQGate[i]); tensors.Add(_wK[i]); tensors.Add(_wV[i]); tensors.Add(_wO[i]); + Add(_wQGate[i]); Add(_wK[i]); Add(_wV[i]); Add(_wO[i]); } else { - tensors.Add(_wQkv[i]); tensors.Add(_wZGate[i]); tensors.Add(_ssmOut[i]); - tensors.Add(_ssmAlpha[i]); tensors.Add(_ssmBeta[i]); + Add(_wQkv[i]); Add(_wZGate[i]); Add(_ssmOut[i]); + Add(_ssmAlpha[i]); Add(_ssmBeta[i]); } } if (_hasMtp) { -
tensors.Add(_mtpAttnNorm); - tensors.Add(_mtpWQGate); tensors.Add(_mtpWK); tensors.Add(_mtpWV); tensors.Add(_mtpWO); - tensors.Add(_mtpPostAttnNorm); + Add(_mtpAttnNorm); + Add(_mtpWQGate); Add(_mtpWK); Add(_mtpWV); Add(_mtpWO); + Add(_mtpPostAttnNorm); if (_mtpIsMoE) { - tensors.Add(_mtpWGateInp); - tensors.Add(_mtpWGateShexp); tensors.Add(_mtpWUpShexp); tensors.Add(_mtpWDownShexp); - tensors.Add(_mtpWGateExps); tensors.Add(_mtpWUpExps); tensors.Add(_mtpWDownExps); + Add(_mtpWGateInp); + Add(_mtpWGateShexp); Add(_mtpWUpShexp); Add(_mtpWDownShexp); + Add(_mtpWGateExps); Add(_mtpWUpExps); Add(_mtpWDownExps); } else { - tensors.Add(_mtpFfnGate); tensors.Add(_mtpFfnUp); tensors.Add(_mtpFfnDown); + Add(_mtpFfnGate); Add(_mtpFfnUp); Add(_mtpFfnDown); } } - long touchSum = 0; - Parallel.ForEach(tensors, tensor => - { - long size = tensor.Info.ByteSize; - byte* ptr = tensor.DataPtr; - long localSum = 0; - for (long off = 0; off < size; off += 4096) - localSum += ptr[off]; - if (size > 0) - localSum += ptr[size - 1]; - Interlocked.Add(ref touchSum, localSum); - }); - if (touchSum == long.MinValue) Console.Write(touchSum); + MmapPrefault.Run("HybridGdnForwardPass", regions, MmapPrefault.RamGate.Always); } // ============================================================ diff --git a/src/SharpInference.Engine/MmapPrefault.cs b/src/SharpInference.Engine/MmapPrefault.cs index f333a44..d3dcc37 100644 --- a/src/SharpInference.Engine/MmapPrefault.cs +++ b/src/SharpInference.Engine/MmapPrefault.cs @@ -41,12 +41,6 @@ internal enum RamGate internal readonly record struct Result(bool Ran, long Bytes, double Seconds, string Reason); - /// True when SHARPI_PREFAULT=0 disables all prefaulting. Lets the - /// fully-CPU-resident passes share the one global kill switch without going through - /// . 
- internal static bool IsDisabled() => - Environment.GetEnvironmentVariable("SHARPI_PREFAULT") == "0"; - /// /// Pure gating decision, factored out so it can be unit-tested without touching /// memory or the environment. is the raw diff --git a/tests/SharpInference.Tests.ForwardPass/MmapPrefaultTests.cs b/tests/SharpInference.Tests.ForwardPass/MmapPrefaultTests.cs index 93c290b..09b3456 100644 --- a/tests/SharpInference.Tests.ForwardPass/MmapPrefaultTests.cs +++ b/tests/SharpInference.Tests.ForwardPass/MmapPrefaultTests.cs @@ -74,29 +74,6 @@ public void ShouldRun_UnknownRam_DoesNotSkip() Assert.True(MmapPrefault.ShouldRun(null, 100 * Gib, 0, MmapPrefault.RamGate.FitsInRam, out _)); } - // ── IsDisabled / env kill switch ──────────────────────────────────────── - - [Fact] - public void IsDisabled_ReflectsEnv() - { - var prev = Environment.GetEnvironmentVariable("SHARPI_PREFAULT"); - try - { - Environment.SetEnvironmentVariable("SHARPI_PREFAULT", "0"); - Assert.True(MmapPrefault.IsDisabled()); - - Environment.SetEnvironmentVariable("SHARPI_PREFAULT", "1"); - Assert.False(MmapPrefault.IsDisabled()); - - Environment.SetEnvironmentVariable("SHARPI_PREFAULT", null); - Assert.False(MmapPrefault.IsDisabled()); - } - finally - { - Environment.SetEnvironmentVariable("SHARPI_PREFAULT", prev); - } - } - // ── Integration: real sweep over small buffers ────────────────────────── [Fact]