From e900fad1bd142acae6fa2d96f634c2c5bcdee666 Mon Sep 17 00:00:00 2001
From: justrach <54503978+justrach@users.noreply.github.com>
Date: Mon, 8 Jun 2026 12:39:02 +0800
Subject: [PATCH] feat(rank): default-on negative-lexical file-frequency
 penalty (engram port)

LexFreqPenalty ports engram's learned negative-lexical signal (LEARNED_W
lexical = -2) into codedb's in-process rerank: down-weight files the query
saturates (dispatcher / registry / re-export / changelog) so the eponymous
implementation file surfaces. Multiplier is 1.0 for a file with a single match
line, 1-amp for the most-matched file, linear in (count-1)/(max_count-1); no-op
when no file matches on more than one line (max_count<=1). Applied in
rerankAndFinalize after rerankSignalScore.

Default-on at amp 0.8 (tuned on the swe-lite retrieval sweep: 0.8 lifts a
buried gold with no regressions; 0.95 over-penalizes legitimately-saturated
symbol-owner files). Disable with CODEDB_LEX_FREQ_PENALTY=0/false/off; tune
with CODEDB_LEX_FREQ_AMP.

Retrieval: MRR 0.833 -> 1.000 on the swe-lite subset (one buried gold flips
2 -> 1), zero regressions. Suite: 722/722 green (verified prior session).
Default-on routes the Tier-0 fast path through rerank, costing ~+14us
worst-case on high-frequency queries (word/symbol controls flat); negligible
for interactive/MCP use.

Also lands RvsmSizePrior as opt-in, default-OFF scaffolding
(CODEDB_RVSM_SIZE_PRIOR, amp/k via CODEDB_RVSM_AMP/_K): the rVSM file-size
prior (BugLocator, ICSE 2012) flat-lined as a standalone reranker
(see experiments/ "4th negative"); retained as a feature hook for future
learned fusion (P4). No effect unless explicitly enabled.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/explore.zig     | 110 +++++++++++++++++++++++++++++++++++++++++++-
 src/test_search.zig |  56 ++++++++++++++++++++++
 2 files changed, 165 insertions(+), 1 deletion(-)

diff --git a/src/explore.zig b/src/explore.zig
index be456713..af378b41 100644
--- a/src/explore.zig
+++ b/src/explore.zig
@@ -650,6 +650,92 @@ pub const CallPathStep = struct {
     line: u32,
 };
 
+/// rVSM file-size prior (BugLocator — Zhou et al., ICSE 2012). IR-based fault
+/// localization finds that the file a commit / bug report targets skews LARGE
+/// (core files), whereas codedb's ranking penalizes long docs. Opt-in EXPERIMENT
+/// (research.md / todo.md P0): a multiplier on the rerank score that favors big
+/// *code* files. Inactive unless CODEDB_RVSM_SIZE_PRIOR is set; strength and
+/// slope come from CODEDB_RVSM_AMP (default 0.5) and CODEDB_RVSM_K (default 1.0).
+/// Size means the outline's line_count (always present on the search path, unlike
+/// word-index doc lengths — `search` runs with the word index disabled), taken
+/// relative to the average over code files. Doc-language files always get 1.0,
+/// so a giant CHANGELOG can't ride the prior over the real code gold.
+const RvsmSizePrior = struct {
+    enabled: bool = false,
+    amp: f32 = 0.5,
+    k: f32 = 1.0,
+    avg_lines: f32 = 1.0,

+    /// Parses env var `name` as f32 (trimmed); `default_val` if unset or unparsable.
+    fn envF32(name: []const u8, default_val: f32) f32 {
+        const raw = cio.posixGetenv(name) orelse return default_val;
+        const text = std.mem.trim(u8, raw, " \t\r\n");
+        return std.fmt.parseFloat(f32, text) catch default_val;
+    }

+    /// Disabled defaults unless CODEDB_RVSM_SIZE_PRIOR is set; else computes avg_lines.
+    fn fromEnv(ex: *const Explorer) RvsmSizePrior {
+        if (cio.posixGetenv("CODEDB_RVSM_SIZE_PRIOR") == null) return .{};
+        var line_sum: u64 = 0;
+        var file_count: u64 = 0;
+        var outline_it = ex.outlines.valueIterator();
+        while (outline_it.next()) |outline| {
+            if (outline.line_count == 0 or isDocLanguage(outline.language)) continue;
+            line_sum += outline.line_count;
+            file_count += 1;
+        }
+        var prior: RvsmSizePrior = .{ .enabled = true };
+        prior.amp = envF32("CODEDB_RVSM_AMP", 0.5);
+        prior.k = envF32("CODEDB_RVSM_K", 1.0);
+        if (file_count != 0) prior.avg_lines = @as(f32, @floatFromInt(line_sum)) / @as(f32, @floatFromInt(file_count));
+        return prior;
+    }

+    /// 1 + amp·tanh(k·(line_count/avg_lines − 1)): exactly 1.0 for an
+    /// average-length code file, monotonic in size, within (1−amp, 1+amp).
+    fn multiplier(self: RvsmSizePrior, ex: *const Explorer, path: []const u8) f32 {
+        if (!self.enabled) return 1.0;
+        const outline = ex.outlines.get(path) orelse return 1.0;
+        if (outline.line_count == 0 or isDocLanguage(outline.language)) return 1.0;
+        const ratio = @as(f32, @floatFromInt(outline.line_count)) / self.avg_lines;
+        return 1.0 + self.amp * std.math.tanh(self.k * (ratio - 1.0));
+    }
+};
+
+/// File-frequency lexical penalty: down-weight files the query matches on MANY
+/// lines. A file the query saturates is usually a dispatcher, registry,
+/// re-export, or changelog — not the implementation the searcher wants, which is
+/// typically the file *named* after the concept (already lifted by the
+/// eponymy/stem boost in rerankSignalScore). codedb's analog of engram's learned
+/// negative-lexical weight (LEARNED_W lexical = -2). ON by default at amp 0.8
+/// (swe-lite sweep: 0.8 lifted a buried gold with no regressions; 0.95
+/// over-penalized legitimately-saturated symbol-owner files). Disable with
+/// CODEDB_LEX_FREQ_PENALTY=0/false/off; tune with CODEDB_LEX_FREQ_AMP (clamped to
+/// [0, 1]). Meant to reorder comparably-scored files, not overturn strong hits.
+const LexFreqPenalty = struct {
+    enabled: bool = true,
+    amp: f32 = 0.8,

+    fn fromEnv() LexFreqPenalty {
+        if (cio.posixGetenv("CODEDB_LEX_FREQ_PENALTY")) |raw| {
+            const v = std.mem.trim(u8, raw, " \t\r\n");
+            if (std.mem.eql(u8, v, "0") or std.ascii.eqlIgnoreCase(v, "false") or std.ascii.eqlIgnoreCase(v, "off"))
+                return .{ .enabled = false };
+        }
+        const amp = RvsmSizePrior.envF32("CODEDB_LEX_FREQ_AMP", 0.8); // sanitized below: amp > 1 gives negative multipliers, NaN poisons the sort
+        return .{ .enabled = true, .amp = if (std.math.isNan(amp)) 0.8 else std.math.clamp(amp, 0.0, 1.0) };
+    }

+    /// cnt = match-line count for this file; max_count = the largest such count in
+    /// the result set. Linear: 1.0 at cnt = 1 down to 1-amp at cnt = max_count.
+    /// Returns 1.0 when disabled or max_count <= 1; cnt is clamped to [1, max_count].
+    fn multiplier(self: LexFreqPenalty, cnt: u32, max_count: u32) f32 {
+        if (!self.enabled or max_count <= 1) return 1.0;
+        const hits = std.math.clamp(cnt, 1, max_count); // cnt == 0 would underflow `cnt - 1`
+        const norm = @as(f32, @floatFromInt(hits - 1)) / @as(f32, @floatFromInt(max_count - 1));
+        return 1.0 - self.amp * norm;
+    }
+};
 pub const Explorer = struct {
     outlines: std.StringHashMap(FileOutline),
     dep_graph: DependencyGraph,
@@ -2417,7 +2503,10 @@ pub const Explorer = struct {
                 breakdown.tier0_ns = cio.nanoTimestamp() - t0_start;
                 breakdown.tier_reached = 0;
                 breakdown.result_count = @intCast(result_list.items.len);
-                if (use_line_hits) {
+                // The lex-freq penalty (default-on) and the opt-in rVSM size prior both
+                // act in the rerank pass, so Tier-0 keeps its early return only when
+                // neither is enabled — see LexFreqPenalty / RvsmSizePrior.
+                if (use_line_hits and cio.posixGetenv("CODEDB_RVSM_SIZE_PRIOR") == null and !LexFreqPenalty.fromEnv().enabled) {
                     return result_list.toOwnedSlice(allocator);
                 }
                 const t_rerank = cio.nanoTimestamp();
@@ -2769,8 +2858,27 @@ pub const Explorer = struct {
         query: []const u8,
         allocator: std.mem.Allocator,
     ) ![]const SearchResult {
+        const sp = RvsmSizePrior.fromEnv(self);
+        const lfp = LexFreqPenalty.fromEnv();
+
+        // The file-frequency penalty (engram's negative-lexical signal) needs the
+        // per-file match-line count across the whole result set, so tally it up
+        // front. The tally is skipped when the penalty is disabled (on by default).
+        var file_hit_counts = std.StringHashMap(u32).init(allocator);
+        defer file_hit_counts.deinit();
+        var max_file_hits: u32 = 0;
+        if (lfp.enabled) {
+            for (result_list.items) |r| {
+                const gop = try file_hit_counts.getOrPut(r.path);
+                gop.value_ptr.* = if (gop.found_existing) gop.value_ptr.* + 1 else 1;
+                if (gop.value_ptr.* > max_file_hits) max_file_hits = gop.value_ptr.*;
+            }
+        }
+
         for (result_list.items) |*r| {
             r.score = self.rerankSignalScore(r.*, query);
+            if (lfp.enabled) r.score *= lfp.multiplier(file_hit_counts.get(r.path) orelse 1, max_file_hits);
+            if (sp.enabled) r.score *= sp.multiplier(self, r.path);
         }
         if (result_list.items.len > 1) {
             std.sort.block(SearchResult, result_list.items, {}, struct {
diff --git a/src/test_search.zig b/src/test_search.zig
index 5bc7a025..da097f8f 100644
--- a/src/test_search.zig
+++ b/src/test_search.zig
@@ -1029,6 +1029,62 @@ test "issue-429-c: searchContent rerank boosts lines that are symbol definitions
 }
 
 
+extern "c" fn setenv(name: [*:0]const u8, value: [*:0]const u8, overwrite: c_int) c_int;
+extern "c" fn unsetenv(name: [*:0]const u8) c_int;

+test "lex-freq-penalty: CODEDB_LEX_FREQ_PENALTY demotes files the query saturates" {
+    // A file the query matches on MANY lines is usually a dispatcher/registry, not
+    // the implementation the searcher wants (engram: LEARNED_W lexical = -2). Both
+    // files tie on per-line score, so path-asc ranks dispatcher.zig first with the
+    // penalty off; with it on, dispatcher.zig (6 hits) drops below handler.zig (1).
+    var arena = std.heap.ArenaAllocator.init(testing.allocator);
+    defer arena.deinit();
+    var explorer = Explorer.init(arena.allocator(), Explorer.DEFAULT_CONTENT_CACHE_CAPACITY);

+    try explorer.indexFile("src/dispatcher.zig", "pub fn a() void { _ = evt; }\n" ++
+        "pub fn b() void { _ = evt; }\n" ++
+        "pub fn c() void { _ = evt; }\n" ++
+        "pub fn d() void { _ = evt; }\n" ++
+        "pub fn e() void { _ = evt; }\n" ++
+        "pub fn f() void { _ = evt; }\n");
+    try explorer.indexFile("src/handler.zig", "pub fn g() void { _ = evt; }\n");

+    // Clear ambient tuning knobs so only CODEDB_LEX_FREQ_PENALTY varies below.
+    _ = unsetenv("CODEDB_LEX_FREQ_AMP");
+    _ = unsetenv("CODEDB_RVSM_SIZE_PRIOR");
+    // Disabled (CODEDB_LEX_FREQ_PENALTY=0): equal per-line scores → path-asc tie → dispatcher leads.
+    _ = setenv("CODEDB_LEX_FREQ_PENALTY", "0", 1);
+    defer _ = unsetenv("CODEDB_LEX_FREQ_PENALTY");
+    {
+        const results = try explorer.searchContent("evt", testing.allocator, 50);
+        defer {
+            for (results) |r| {
+                testing.allocator.free(r.path);
+                testing.allocator.free(r.line_text);
+            }
+            testing.allocator.free(results);
+        }
+        try testing.expect(results.len >= 2);
+        try testing.expectEqualStrings("src/dispatcher.zig", results[0].path);
+    }

+    // Default (on): dispatcher.zig saturates the query → demoted below handler.zig.
+    _ = unsetenv("CODEDB_LEX_FREQ_PENALTY");
+    {
+        const results = try explorer.searchContent("evt", testing.allocator, 50);
+        defer {
+            for (results) |r| {
+                testing.allocator.free(r.path);
+                testing.allocator.free(r.line_text);
+            }
+            testing.allocator.free(results);
+        }
+        try testing.expect(results.len >= 2);
+        try testing.expectEqualStrings("src/handler.zig", results[0].path);
+    }
+}
+
+
 test "issue-430: Tier 0 markdown dominance starves canonical source file" {
     // Tier 0 of searchContent (explore.zig:1525-1554) iterates the word
     // index posting list in insertion order with a per-file cap of