Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 85 additions & 42 deletions src/SharpInference.Engine/CudaHybridForwardPass.cs
Original file line number Diff line number Diff line change
Expand Up @@ -930,48 +930,12 @@ void TraceVram(string label)
_gpuRopeFreqs = UploadWeight("rope_freqs.weight");
}

// Pre-fault mmap pages for CPU layers: touch the first byte of each weight tensor
// to ensure OS pages them into RAM before the first forward pass.
if (_nCpuLayers > 0)
{
Console.Error.Write($"[HybridForwardPass] Pre-faulting CPU weight pages...");
long touchSum = 0;
IEnumerable<CpuWeightRef> weightsToTouch = _cpuWq.Concat(_cpuWk).Concat(_cpuWv).Concat(_cpuWo);
if (_isMoE)
{
weightsToTouch = weightsToTouch
.Concat(_cpuWGateInp!)
.Concat(_cpuWGateExps!)
.Concat(_cpuWUpExps!)
.Concat(_cpuWDownExps!);
if (_hasSharedExpert)
{
weightsToTouch = weightsToTouch
.Concat(_cpuWGateShexp!)
.Concat(_cpuWUpShexp!)
.Concat(_cpuWDownShexp!);
}
}
else
{
weightsToTouch = weightsToTouch
.Concat(_cpuWGate)
.Concat(_cpuWUp)
.Concat(_cpuWDown);
}

foreach (var wRef in weightsToTouch)
{
// Skip un-resolved slots — KV-share layers on Gemma 4 leave attn_k /
// attn_v unresolved by design (the source layer's projections are
// reused via the alias dispatch).
if (wRef.DataPtr == null) continue;
touchSum += wRef.DataPtr[0];
long size = wRef.Info.ByteSize;
if (size > 64) touchSum += wRef.DataPtr[size - 1];
}
Console.Error.WriteLine($" done. (touch={touchSum})");
}
// Pre-fault every CPU-resident mmap weight page so the first request doesn't
// stall on demand paging (issue #221). NOT gated on _nCpuLayers > 0: the
// CPU-MoE routed experts and the Gemma 4 PLE table are CPU-resident even when
// every transformer layer is GPU-offloaded (-g -1), and those are the dominant
// cold-start cost. MmapPrefault filters empty configs and honours SHARPI_PREFAULT.
MmapPrefault.Run("CudaHybridForwardPass", BuildCpuPrefaultRegions());

if (_tqEnabled && _nCpuLayers > 0)
{
Expand Down Expand Up @@ -2783,6 +2747,85 @@ private CpuWeightRef ResolveCpuWeight(string name)
return new CpuWeightRef(name, info, info.DType, _model.GetTensorDataPtr(info));
}

/// <summary>
/// Collects every CPU-resident mmap weight region for the issue #221 pre-fault sweep.
/// Covers the embedding/output tensors (only when they are not uploaded to VRAM), the
/// per-CPU-layer attention/FFN weights, the MoE routed experts that back GPU-trunk
/// layers (cached CPU-MoE refs, or resolved by name for the GPU-SLRU path), and the
/// Gemma 4 PLE table with its per-layer projections.
/// </summary>
/// <remarks>
/// Unresolved slots (null <c>DataPtr</c>) — e.g. Gemma 4 KV-share / k==v layers — are
/// skipped, and the same null-pointer guard is applied to tensors resolved by name so
/// both registration paths behave consistently. Biases and QK-norms are excluded:
/// they're dequantized into separate buffers, not read from the mmap at inference time.
/// </remarks>
/// <returns>
/// The (base pointer, byte length) pairs to pre-fault, in registration order. The list
/// may be empty when nothing is CPU-resident.
/// </returns>
private List<(nint Ptr, long Bytes)> BuildCpuPrefaultRegions()
{
    var regions = new List<(nint, long)>();

    // Register a single weight ref. The DataPtr check comes first so an unset ref is
    // skipped without its Info ever being dereferenced.
    void Add1(CpuWeightRef w)
    {
        if (w.DataPtr != null) regions.Add(((nint)w.DataPtr, w.Info.ByteSize));
    }

    // Register every ref of a per-layer array; arrays not allocated for this config are null.
    void Add(CpuWeightRef[]? arr)
    {
        if (arr is null) return;
        foreach (var w in arr) Add1(w);
    }

    // Resolve a tensor by name straight from the mmap, tolerating absence — a
    // pre-fault must never make an otherwise-loadable model fail to load.
    void AddByName(string name)
    {
        if (_model.FindTensor(name) is { } info)
        {
            // Same guard as Add1: never hand a null base address to the pre-fault sweep.
            nint ptr = (nint)_model.GetTensorDataPtr(info);
            if (ptr != nint.Zero) regions.Add((ptr, info.ByteSize));
        }
    }

    // Embedding/output mmap refs are read at inference only when they're NOT
    // uploaded to VRAM (_gpu* null == cpuEmbeddingOutputOnly). Skipping the
    // GiB-scale token_embd when it lives on the GPU avoids a large pointless read.
    if (_gpuEmbedding is null) Add1(_cpuEmbedding);
    if (_gpuOutputWeight is null)
    {
        Add1(_cpuOutputNorm);
        // Tied weights alias the embedding's mmap region; don't register it twice.
        if (_cpuOutputWeight.DataPtr != _cpuEmbedding.DataPtr) Add1(_cpuOutputWeight);
    }

    // Per-CPU-layer norms and attention projections.
    Add(_cpuAttnNorm); Add(_cpuWq); Add(_cpuWk); Add(_cpuWv); Add(_cpuWo);
    Add(_cpuFfnNorm); Add(_cpuPostAttnNorm); Add(_cpuPostFfwNorm);

    // Per-CPU-layer FFN weights: MoE router + experts (+ shared expert), or dense FFN.
    if (_isMoE)
    {
        Add(_cpuWGateInp); Add(_cpuWGateExps); Add(_cpuWUpExps); Add(_cpuWDownExps);
        if (_hasSharedExpert) { Add(_cpuWGateShexp); Add(_cpuWUpShexp); Add(_cpuWDownShexp); }
    }
    else
    {
        Add(_cpuWGate); Add(_cpuWUp); Add(_cpuWDown);
    }

    if (_isMoE && _nGpuLayers > 0)
    {
        if (_cpuMoe)
        {
            // CPU-MoE: GPU-trunk routed experts run on the CPU from these cached
            // mmap refs every token (the -g -1 cold-start cost).
            Add(_cpuMoeGateInp); Add(_cpuMoeGateExps); Add(_cpuMoeUpExps); Add(_cpuMoeDownExps);
        }
        else
        {
            // GPU-SLRU MoE: the routed experts aren't cached on the host, but the
            // SLRU streams each one from the mmap on first use. Fault them so the
            // first request's cache fills don't stall (mirrors the Vulkan path).
            for (int li = 0; li < _nGpuLayers; li++)
            {
                AddByName($"blk.{li}.ffn_gate_exps.weight");
                AddByName($"blk.{li}.ffn_up_exps.weight");
                AddByName($"blk.{li}.ffn_down_exps.weight");
            }
        }
    }

    // Gemma 4 PLE: the GiB-scale per-layer token-embedding table + per-layer projections.
    if (_pleTokenEmbed is { } ple) Add1(ple);
    if (_perLayerProjNorm is { } pln) Add1(pln);
    Add(_cpuInpGate); Add(_cpuPleProj); Add(_cpuPlePostNorm);

    return regions;
}

private float* LoadCpuBias(string name, int count)
{
var info = _model.FindTensor(name)
Expand Down
41 changes: 41 additions & 0 deletions src/SharpInference.Engine/CudaHybridGdnForwardPass.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1414,6 +1414,13 @@ void TraceVram(string label)
_logitsBuf2 = Array.Empty<float>();
_cpuNormBuf2 = _cpuMoeHidden2 = _lastHiddenT1 = null;
}

// Pre-fault CPU-resident mmap weight pages (issue #221). On the CPU-MoE config
// (the auto-selected winner on 12 GB) the routed experts / dense FFN weights are
// paged in lazily; without this the first request faults them all on the critical
// path, ~5× slower than warm. MmapPrefault honours SHARPI_PREFAULT and the
// RAM-fit heuristic, and no-ops when nothing is CPU-resident (full-GPU GDN).
MmapPrefault.Run("CudaHybridGdnForwardPass", BuildCpuPrefaultRegions());
}

// =================================================================
Expand Down Expand Up @@ -5231,6 +5238,40 @@ private CpuWeightRef ResolveCpuWeight(string name)
return new CpuWeightRef(name, info, info.DType, _model.GetTensorDataPtr(info));
}

/// <summary>
/// Gathers the CPU-resident mmap weight regions handed to the issue #221 pre-fault
/// sweep: the CPU-MoE routed experts (or dense FFN weights), the SHARPI_CPU_GDN debug
/// GDN weights, and the MoE-MTP head experts.
/// </summary>
/// <remarks>
/// Arrays that aren't allocated for this config are null and contribute nothing;
/// unpopulated slots (e.g. the GDN arrays when not in CPU-GDN mode) carry a null
/// <c>DataPtr</c> and are skipped. Everything dequantized via LoadF32Tensor/LoadConv1d
/// lives in separate buffers, not the mmap, and is excluded.
/// </remarks>
/// <returns>The (base pointer, byte length) pairs, in registration order.</returns>
private List<(nint Ptr, long Bytes)> BuildCpuPrefaultRegions()
{
    List<(nint Ptr, long Bytes)> collected = new();

    // Register one weight ref unless its slot was never resolved.
    void AddOne(CpuWeightRef weight)
    {
        if (weight.DataPtr == null) return;
        collected.Add(((nint)weight.DataPtr, weight.Info.ByteSize));
    }

    // Register each ref of a per-layer array; a null array means "not allocated".
    void AddEach(CpuWeightRef[]? weights)
    {
        if (weights is null) return;
        for (int i = 0; i < weights.Length; i++)
            AddOne(weights[i]);
    }

    // Trunk: CPU-MoE routed experts, or dense FFN weights (Qwen3.6-27B-MTP).
    AddEach(_cpuFfnGateInp);
    AddEach(_cpuFfnGateExps);
    AddEach(_cpuFfnUpExps);
    AddEach(_cpuFfnDownExps);
    AddEach(_cpuWFfnGate);
    AddEach(_cpuWFfnUp);
    AddEach(_cpuWFfnDown);

    // SHARPI_CPU_GDN=1 debug path (arrays always allocated, populated only then).
    AddEach(_cpuWQkv);
    AddEach(_cpuWZGate);
    AddEach(_cpuSsmOut);
    AddEach(_cpuSsmAlpha);
    AddEach(_cpuSsmBeta);

    // MoE-MTP head routed experts (one extra layer; null DataPtr when absent).
    AddOne(_cpuMtpFfnGateInp);
    AddOne(_cpuMtpFfnGateExps);
    AddOne(_cpuMtpFfnUpExps);
    AddOne(_cpuMtpFfnDownExps);

    return collected;
}

private Tensor UploadWeight(string name)
{
var info = _model.FindTensor(name)
Expand Down
52 changes: 23 additions & 29 deletions src/SharpInference.Engine/ForwardPass.cs
Original file line number Diff line number Diff line change
Expand Up @@ -507,54 +507,48 @@ public static long MbToBudgetBytes(long mb) =>
: mb * 1024 * 1024;

/// <summary>
/// Touch every 4KB page of all weight tensors to force OS page-in,
/// eliminating soft page faults during inference.
/// Pre-fault every weight page so the first request doesn't stall on demand paging
/// (issue #221). This is the fully-CPU pass — the whole model is mmap-resident, the
/// user chose to run it from RAM, so <see cref="MmapPrefault.RamGate.Always"/> skips
/// the RAM-fit heuristic (subject only to the <c>SHARPI_PREFAULT=0</c> kill switch).
/// </summary>
private void PrefaultWeights()
{
var tensors = new List<TensorRef> { _embTensor, _outputNorm, _outputWeight };
var regions = new List<(nint, long)>();
void Add(TensorRef t)
{
if (t.DataPtr != null) regions.Add(((nint)t.DataPtr, t.Info.ByteSize));
}

Add(_embTensor); Add(_outputNorm); Add(_outputWeight);
int L = _hp.NumLayers;
for (int i = 0; i < L; i++)
{
bool kvShared = _layerKvSrc is not null && _layerKvSrc[i] >= 0;
tensors.Add(_attnNorm[i]);
tensors.Add(_wq[i]); tensors.Add(_wo[i]);
// k_eq_v global layers have no attn_v (_wv[i] is default/unset).
if (!kvShared) { tensors.Add(_wk[i]); if (_wv[i].DataPtr is not null) tensors.Add(_wv[i]); }
tensors.Add(_ffnNorm[i]);
if (_postAttnNorm is not null) tensors.Add(_postAttnNorm[i]);
if (_postFfwNorm is not null) tensors.Add(_postFfwNorm[i]);
Add(_attnNorm[i]);
Add(_wq[i]); Add(_wo[i]);
// k_eq_v global layers have no attn_v (_wv[i] is default/unset; Add skips null).
if (!kvShared) { Add(_wk[i]); Add(_wv[i]); }
Add(_ffnNorm[i]);
if (_postAttnNorm is not null) Add(_postAttnNorm[i]);
if (_postFfwNorm is not null) Add(_postFfwNorm[i]);

if (_hp.IsMoE)
{
tensors.Add(_wGateInp![i]);
tensors.Add(_wGateExps![i]); tensors.Add(_wUpExps![i]); tensors.Add(_wDownExps![i]);
Add(_wGateInp![i]);
Add(_wGateExps![i]); Add(_wUpExps![i]); Add(_wDownExps![i]);
if (_hp.HasSharedExpert)
{
tensors.Add(_wGateShexp![i]); tensors.Add(_wUpShexp![i]); tensors.Add(_wDownShexp![i]);
Add(_wGateShexp![i]); Add(_wUpShexp![i]); Add(_wDownShexp![i]);
}
}
else
{
tensors.Add(_wGate[i]); tensors.Add(_wUp[i]); tensors.Add(_wDown[i]);
Add(_wGate[i]); Add(_wUp[i]); Add(_wDown[i]);
}
}

long touchSum = 0;
Parallel.ForEach(tensors, tensor =>
{
long size = tensor.Info.ByteSize;
byte* ptr = tensor.DataPtr;
long localSum = 0;
for (long off = 0; off < size; off += 4096)
localSum += ptr[off];
if (size > 0)
localSum += ptr[size - 1];
Interlocked.Add(ref touchSum, localSum);
});

// Prevent dead-code elimination
if (touchSum == long.MinValue) Console.Write(touchSum);
MmapPrefault.Run("ForwardPass", regions, MmapPrefault.RamGate.Always);
}

public PagedKvCache Cache => _kvCache;
Expand Down
104 changes: 65 additions & 39 deletions src/SharpInference.Engine/HybridForwardPass.cs
Original file line number Diff line number Diff line change
Expand Up @@ -461,45 +461,11 @@ public HybridForwardPass(GgufModel model, VulkanBackend gpu, ModelHyperparams hp

_cpuKvCache = new KvCache(_nCpuLayers, _maxSeqLen, _numKvHeads, _headDim);

// Pre-fault mmap pages for CPU layers: touch the first byte of each weight tensor
// to ensure OS pages them into RAM before the first forward pass.
if (_nCpuLayers > 0)
{
Console.Error.Write($"[HybridForwardPass] Pre-faulting CPU weight pages...");
long touchSum = 0;
IEnumerable<CpuWeightRef> weightsToTouch = _cpuWq.Concat(_cpuWk).Concat(_cpuWv).Concat(_cpuWo);
if (_isMoE)
{
weightsToTouch = weightsToTouch
.Concat(_cpuWGateInp!)
.Concat(_cpuWGateExps!)
.Concat(_cpuWUpExps!)
.Concat(_cpuWDownExps!);
if (_hasSharedExpert)
{
weightsToTouch = weightsToTouch
.Concat(_cpuWGateShexp!)
.Concat(_cpuWUpShexp!)
.Concat(_cpuWDownShexp!);
}
}
else
{
weightsToTouch = weightsToTouch
.Concat(_cpuWGate)
.Concat(_cpuWUp)
.Concat(_cpuWDown);
}

foreach (var wRef in weightsToTouch)
{
// Touch first and last cache line of each weight tensor
touchSum += wRef.DataPtr[0];
long size = wRef.Info.ByteSize;
if (size > 64) touchSum += wRef.DataPtr[size - 1];
}
Console.Error.WriteLine($" done. (touch={touchSum})");
}
// Pre-fault every CPU-resident mmap weight page so the first request doesn't
// stall on demand paging (issue #221). The CPU embedding/output tensors are
// resolved even on a pure-GPU split (CPU embed lookup / lm_head), so the sweep
// is not gated on _nCpuLayers > 0. MmapPrefault honours SHARPI_PREFAULT.
MmapPrefault.Run("HybridForwardPass", BuildCpuPrefaultRegions());

if (_tqEnabled && _nCpuLayers > 0)
{
Expand Down Expand Up @@ -1165,6 +1131,66 @@ private CpuWeightRef ResolveCpuWeight(string name)
return new CpuWeightRef(name, info, info.DType, _model.GetTensorDataPtr(info));
}

/// <summary>
/// Collects every CPU-resident mmap weight region for the issue #221 pre-fault sweep:
/// the CPU embedding/output tensors (only when they are not uploaded to VRAM), the
/// per-CPU-layer attention/FFN weights (dense or MoE + shared experts), and — for MoE
/// models — the routed-expert tensors of the GPU-trunk layers, resolved by name.
/// </summary>
/// <remarks>
/// Refs with a null <c>DataPtr</c> are skipped, and the same null-pointer guard is
/// applied to tensors resolved by name so both registration paths behave consistently.
/// Biases and QK-norms are excluded — they're dequantized into separate buffers, not
/// read from the mmap.
/// </remarks>
/// <returns>
/// The (base pointer, byte length) pairs to pre-fault, in registration order. The list
/// may be empty when nothing is CPU-resident.
/// </returns>
private List<(nint Ptr, long Bytes)> BuildCpuPrefaultRegions()
{
    var regions = new List<(nint, long)>();

    // Register a single weight ref. The DataPtr check comes first so an unset ref is
    // skipped without its Info ever being dereferenced.
    void Add1(CpuWeightRef w)
    {
        if (w.DataPtr != null) regions.Add(((nint)w.DataPtr, w.Info.ByteSize));
    }

    // Register every ref of a per-layer array; arrays not allocated for this config are null.
    void Add(CpuWeightRef[]? arr)
    {
        if (arr is null) return;
        foreach (var w in arr) Add1(w);
    }

    // Resolve a tensor by name straight from the mmap, tolerating absence — a
    // pre-fault must never make an otherwise-loadable model fail to load.
    void AddByName(string name)
    {
        if (_model.FindTensor(name) is { } info)
        {
            // Same guard as Add1: never hand a null base address to the pre-fault sweep.
            nint ptr = (nint)_model.GetTensorDataPtr(info);
            if (ptr != nint.Zero) regions.Add((ptr, info.ByteSize));
        }
    }

    // Embedding/output mmap refs are read at inference only when they're NOT
    // uploaded to VRAM (_gpu* null == cpuEmbeddingOutputOnly). Skipping the
    // GiB-scale token_embd when it lives on the GPU avoids a large pointless read.
    if (_gpuEmbedding is null) Add1(_cpuEmbedding);
    if (_gpuOutputWeight is null)
    {
        Add1(_cpuOutputNorm);
        // Tied weights alias the embedding's mmap region; don't register it twice.
        if (_cpuOutputWeight.DataPtr != _cpuEmbedding.DataPtr) Add1(_cpuOutputWeight);
    }

    // Per-CPU-layer norms and attention projections.
    Add(_cpuAttnNorm); Add(_cpuWq); Add(_cpuWk); Add(_cpuWv); Add(_cpuWo); Add(_cpuFfnNorm);

    if (_isMoE)
    {
        Add(_cpuWGateInp); Add(_cpuWGateExps); Add(_cpuWUpExps); Add(_cpuWDownExps);
        if (_hasSharedExpert) { Add(_cpuWGateShexp); Add(_cpuWUpShexp); Add(_cpuWDownShexp); }

        // GPU-trunk routed experts live in the GPU SLRU cache, but every cache miss
        // spills to GpuMoeFfnCpuFallback, which reads blk.{0..nGpu-1}.ffn_*_exps
        // straight from the mmap on the CPU. Fault those too so first-token misses
        // don't stall (mirrors the CUDA class's _cpuMoe* coverage).
        for (int li = 0; li < _nGpuLayers; li++)
        {
            AddByName($"blk.{li}.ffn_gate_exps.weight");
            AddByName($"blk.{li}.ffn_up_exps.weight");
            AddByName($"blk.{li}.ffn_down_exps.weight");
        }
    }
    else
    {
        Add(_cpuWGate); Add(_cpuWUp); Add(_cpuWDown);
    }

    return regions;
}

private float* LoadCpuBias(string name, int count)
{
var info = _model.FindTensor(name)
Expand Down
Loading