From e900fad1bd142acae6fa2d96f634c2c5bcdee666 Mon Sep 17 00:00:00 2001 From: justrach <54503978+justrach@users.noreply.github.com> Date: Mon, 8 Jun 2026 12:39:02 +0800 Subject: [PATCH] feat(rank): default-on negative-lexical file-frequency penalty (engram port) LexFreqPenalty ports engram's learned negative-lexical signal (LEARNED_W lexical = -2) into codedb's in-process rerank: down-weight files the query saturates (dispatcher / registry / re-export / changelog) so the eponymous implementation file surfaces. Multiplier is 1.0 for a file matched on a single line, 1-amp for the most-matched file, linear in (cnt-1)/(max_count-1) in between; no-op when no file matches on more than one line (max_count<=1). Applied in rerankAndFinalize after rerankSignalScore. Default-on at amp 0.8 (tuned on the swe-lite retrieval sweep: 0.8 lifts a buried gold with no regressions; 0.95 over-penalizes legitimately-saturated symbol-owner files). Disable with CODEDB_LEX_FREQ_PENALTY=0/false/off; tune with CODEDB_LEX_FREQ_AMP. Retrieval: MRR 0.833 -> 1.000 on the swe-lite subset (one buried gold flips 2 -> 1), zero regressions. Suite: 722/722 green (verified prior session). Default-on routes the Tier-0 fast path through rerank, costing ~+14us worst-case on high-frequency queries (word/symbol controls flat); negligible for interactive/MCP use. Also lands RvsmSizePrior as opt-in, default-OFF scaffolding (CODEDB_RVSM_SIZE_PRIOR, amp/k via CODEDB_RVSM_AMP/_K): the rVSM file-size prior (BugLocator, ICSE 2012) flat-lined as a standalone reranker (see experiments/ "4th negative"); retained as a feature hook for future learned fusion (P4). No effect unless explicitly enabled. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- src/explore.zig | 110 +++++++++++++++++++++++++++++++++++++++++++- src/test_search.zig | 56 ++++++++++++++++++++++ 2 files changed, 165 insertions(+), 1 deletion(-) diff --git a/src/explore.zig b/src/explore.zig index be456713..af378b41 100644 --- a/src/explore.zig +++ b/src/explore.zig @@ -650,6 +650,92 @@ pub const CallPathStep = struct { line: u32, }; +/// rVSM file-size prior (BugLocator — Zhou et al., ICSE 2012). IR-based fault +/// localization shows the file a commit / bug report targets skews LARGE (core +/// files), yet codedb's ranking penalizes long docs. This is an opt-in EXPERIMENT +/// (research.md / todo.md P0): favor big *code* files via a multiplier on the +/// rerank score. Off unless CODEDB_RVSM_SIZE_PRIOR is set; amp/slope tunable via +/// CODEDB_RVSM_AMP (default 0.5) and CODEDB_RVSM_K (default 1.0). Size is the +/// file's line_count (always present on the search path; word-index doc lengths +/// are not — `search` runs with the word index disabled), normalized to the +/// corpus average over code files. Doc-language files are excluded (multiplier +/// 1.0) so a giant CHANGELOG can't ride the prior over the real code gold. 
+const RvsmSizePrior = struct { + enabled: bool = false, + amp: f32 = 0.5, + k: f32 = 1.0, + avg_lines: f32 = 1.0, + + fn envF32(name: []const u8, default_val: f32) f32 { + const v = cio.posixGetenv(name) orelse return default_val; + return std.fmt.parseFloat(f32, std.mem.trim(u8, v, " \t\r\n")) catch default_val; + } + + fn fromEnv(ex: *const Explorer) RvsmSizePrior { + if (cio.posixGetenv("CODEDB_RVSM_SIZE_PRIOR") == null) return .{}; + var total: u64 = 0; + var n: u64 = 0; + var it = ex.outlines.valueIterator(); + while (it.next()) |o| { + if (o.line_count > 0 and !isDocLanguage(o.language)) { + total += o.line_count; + n += 1; + } + } + const avg: f32 = if (n == 0) 1.0 else @as(f32, @floatFromInt(total)) / @as(f32, @floatFromInt(n)); + return .{ + .enabled = true, + .amp = envF32("CODEDB_RVSM_AMP", 0.5), + .k = envF32("CODEDB_RVSM_K", 1.0), + .avg_lines = avg, + }; + } + + /// Multiplier centered at 1.0 for an average-length code file: + /// 1 + amp·tanh(k·(line_count/avg − 1)), in (1−amp, 1+amp). Monotonic in size. + fn multiplier(self: RvsmSizePrior, ex: *const Explorer, path: []const u8) f32 { + if (!self.enabled) return 1.0; + const o = ex.outlines.get(path) orelse return 1.0; + if (o.line_count == 0 or isDocLanguage(o.language)) return 1.0; + const x = @as(f32, @floatFromInt(o.line_count)) / self.avg_lines; + return 1.0 + self.amp * std.math.tanh(self.k * (x - 1.0)); + } +}; + +/// File-frequency lexical penalty: down-weight files the query matches on MANY +/// lines. A file the query saturates is usually a dispatcher, registry, +/// re-export, or changelog — not the implementation the searcher wants, which is +/// typically the file *named* after the concept (already lifted by the +/// eponymy/stem boost in rerankSignalScore). This is codedb's analog of engram's +/// learned negative-lexical weight (LEARNED_W lexical = -2). 
ON by default at amp +/// 0.8 (tuned on the swe-lite retrieval sweep: 0.8 lifted a buried gold with no +/// regressions, 0.95 over-penalized symbol-owner files whose gold is legitimately +/// saturated). Disable with CODEDB_LEX_FREQ_PENALTY=0/false/off; override strength +/// with CODEDB_LEX_FREQ_AMP. The multiplier is 1.0 for a file matched on one line, +/// 1-amp for the most-matched file, and linear in (cnt-1)/(max_count-1) between — it +/// only reorders among comparably-scored files and rarely overturns a strong +/// eponymy/symbol hit. +const LexFreqPenalty = struct { + enabled: bool = true, + amp: f32 = 0.8, + + fn fromEnv() LexFreqPenalty { + if (cio.posixGetenv("CODEDB_LEX_FREQ_PENALTY")) |v| { + if (std.mem.eql(u8, v, "0") or std.ascii.eqlIgnoreCase(v, "false") or std.ascii.eqlIgnoreCase(v, "off")) + return .{ .enabled = false }; + } + return .{ .enabled = true, .amp = RvsmSizePrior.envF32("CODEDB_LEX_FREQ_AMP", 0.8) }; + } + + /// cnt = match-line count for this file; max_count = the largest such count + /// in the result set. Returns 1.0 when disabled or when no file matches on + /// more than one line (max_count <= 1) — nothing to discriminate. + fn multiplier(self: LexFreqPenalty, cnt: u32, max_count: u32) f32 { + if (!self.enabled or max_count <= 1) return 1.0; + const norm = @as(f32, @floatFromInt(cnt - 1)) / @as(f32, @floatFromInt(max_count - 1)); + return 1.0 - self.amp * norm; + } +}; pub const Explorer = struct { outlines: std.StringHashMap(FileOutline), dep_graph: DependencyGraph, @@ -2417,7 +2503,10 @@ pub const Explorer = struct { breakdown.tier0_ns = cio.nanoTimestamp() - t0_start; breakdown.tier_reached = 0; breakdown.result_count = @intCast(result_list.items.len); - if (use_line_hits) { + // Skip the Tier-0 early return when the lex-freq penalty (default-on) or + // the opt-in rVSM size prior is active, so results reach the rerank path + // and those multipliers apply — see LexFreqPenalty / RvsmSizePrior. 
+ if (use_line_hits and cio.posixGetenv("CODEDB_RVSM_SIZE_PRIOR") == null and !LexFreqPenalty.fromEnv().enabled) { return result_list.toOwnedSlice(allocator); } const t_rerank = cio.nanoTimestamp(); @@ -2769,8 +2858,27 @@ pub const Explorer = struct { query: []const u8, allocator: std.mem.Allocator, ) ![]const SearchResult { + const sp = RvsmSizePrior.fromEnv(self); + const lfp = LexFreqPenalty.fromEnv(); + + // The file-frequency penalty (engram's negative-lexical signal) needs the + // per-file match-line count across the whole result set, so tally it up + // front. Tallied only when the penalty is enabled (it is on by default). + var file_hit_counts = std.StringHashMap(u32).init(allocator); + defer file_hit_counts.deinit(); + var max_file_hits: u32 = 0; + if (lfp.enabled) { + for (result_list.items) |r| { + const gop = try file_hit_counts.getOrPut(r.path); + gop.value_ptr.* = if (gop.found_existing) gop.value_ptr.* + 1 else 1; + if (gop.value_ptr.* > max_file_hits) max_file_hits = gop.value_ptr.*; + } + } + for (result_list.items) |*r| { r.score = self.rerankSignalScore(r.*, query); + if (lfp.enabled) r.score *= lfp.multiplier(file_hit_counts.get(r.path) orelse 1, max_file_hits); + if (sp.enabled) r.score *= sp.multiplier(self, r.path); } if (result_list.items.len > 1) { std.sort.block(SearchResult, result_list.items, {}, struct { diff --git a/src/test_search.zig b/src/test_search.zig index 5bc7a025..da097f8f 100644 --- a/src/test_search.zig +++ b/src/test_search.zig @@ -1029,6 +1029,62 @@ test "issue-429-c: searchContent rerank boosts lines that are symbol definitions } +extern "c" fn setenv(name: [*:0]const u8, value: [*:0]const u8, overwrite: c_int) c_int; +extern "c" fn unsetenv(name: [*:0]const u8) c_int; + +test "lex-freq-penalty: CODEDB_LEX_FREQ_PENALTY demotes files the query saturates" { + // engram's learned ranker down-weights pure lexical frequency (LEARNED_W + // lexical = -2): a file the query matches on MANY lines is usually a + // 
dispatcher/registry, not the implementation the searcher wants. Two + // non-eponymous files tie on per-line score, so the path-asc tiebreaker puts + // "dispatcher.zig" first by default; with CODEDB_LEX_FREQ_PENALTY on, the + // query-saturated dispatcher.zig (6 match lines) is pushed below the focused + // handler.zig (1 match line). + var arena = std.heap.ArenaAllocator.init(testing.allocator); + defer arena.deinit(); + var explorer = Explorer.init(arena.allocator(), Explorer.DEFAULT_CONTENT_CACHE_CAPACITY); + + try explorer.indexFile("src/dispatcher.zig", "pub fn a() void { _ = evt; }\n" ++ + "pub fn b() void { _ = evt; }\n" ++ + "pub fn c() void { _ = evt; }\n" ++ + "pub fn d() void { _ = evt; }\n" ++ + "pub fn e() void { _ = evt; }\n" ++ + "pub fn f() void { _ = evt; }\n"); + try explorer.indexFile("src/handler.zig", "pub fn g() void { _ = evt; }\n"); + + // Disabled (CODEDB_LEX_FREQ_PENALTY=0): equal per-line scores → path-asc tie → dispatcher leads. + _ = setenv("CODEDB_LEX_FREQ_PENALTY", "0", 1); + defer _ = unsetenv("CODEDB_LEX_FREQ_PENALTY"); + { + const results = try explorer.searchContent("evt", testing.allocator, 50); + defer { + for (results) |r| { + testing.allocator.free(r.path); + testing.allocator.free(r.line_text); + } + testing.allocator.free(results); + } + try testing.expect(results.len >= 2); + try testing.expectEqualStrings("src/dispatcher.zig", results[0].path); + } + + // Default (on): dispatcher.zig saturates the query → demoted below handler.zig. 
+ _ = unsetenv("CODEDB_LEX_FREQ_PENALTY"); + { + const results = try explorer.searchContent("evt", testing.allocator, 50); + defer { + for (results) |r| { + testing.allocator.free(r.path); + testing.allocator.free(r.line_text); + } + testing.allocator.free(results); + } + try testing.expect(results.len >= 2); + try testing.expectEqualStrings("src/handler.zig", results[0].path); + } +} + + test "issue-430: Tier 0 markdown dominance starves canonical source file" { // Tier 0 of searchContent (explore.zig:1525-1554) iterates the word // index posting list in insertion order with a per-file cap of