diff --git a/deploy/copilotkit-docs.yaml b/deploy/copilotkit-docs.yaml index 4162ec3..628d3d9 100644 --- a/deploy/copilotkit-docs.yaml +++ b/deploy/copilotkit-docs.yaml @@ -14,10 +14,10 @@ sources: - name: docs type: markdown repo: https://github.com/CopilotKit/CopilotKit.git - path: docs/content/docs/ + path: showcase/shell-docs/src/content/docs/ base_url: https://docs.copilotkit.ai/ url_derivation: - strip_prefix: "docs/content/docs/" + strip_prefix: "showcase/shell-docs/src/content/docs/" strip_suffix: ".mdx" strip_route_groups: true strip_index: true @@ -45,6 +45,12 @@ sources: - "**/__tests__/**" - "**/*.test.*" - "**/*.spec.*" + # Demo/example apps and the docs site Next.js build are not library code + - "examples/**" + - "showcase/**" + - "**/.next/**" + # Generated type declarations duplicate the .ts sources they describe + - "**/*.d.ts" skip_dirs: - node_modules - dist @@ -66,8 +72,10 @@ sources: strip_route_groups: true strip_index: true file_patterns: + # .mdx only — the docs/ tree's two .md files (README.md, ag_ui.md) are a + # folder README and a raw spec, not rendered product pages, and keep their + # extension under a .mdx-only strip_suffix, deriving 404 URLs. - "**/*.mdx" - - "**/*.md" chunk: target_tokens: 600 overlap_tokens: 50 @@ -75,18 +83,40 @@ sources: - name: ag-ui-code type: code repo: https://github.com/ag-ui-protocol/ag-ui.git - path: sdks/typescript/packages/ + # Walk from the repo root so the TS SDK, the Python adapters under + # integrations/, the Python SDK, and the JVM/multi-language community SDK + # are all reachable. file_patterns (matched against repo-root-relative + # paths) scope which subtrees actually get indexed. + path: "." 
file_patterns: - - "**/*.ts" - - "**/*.tsx" - - "**/*.js" - - "**/*.jsx" + # TypeScript SDK packages + - "sdks/typescript/packages/**/*.ts" + - "sdks/typescript/packages/**/*.tsx" + - "sdks/typescript/packages/**/*.js" + - "sdks/typescript/packages/**/*.jsx" + # Framework integrations (Python adapters + TS bindings) + - "integrations/**/*.py" + - "integrations/**/*.ts" + # Python SDK + - "sdks/python/**/*.py" + # Community SDK — JVM (Kotlin/Java) plus the other native ports that + # live under sdks/community/ (go, rust, dart, ruby) + - "sdks/community/**/*.kt" + - "sdks/community/**/*.java" + - "sdks/community/**/*.go" + - "sdks/community/**/*.rs" + - "sdks/community/**/*.dart" + - "sdks/community/**/*.rb" exclude_patterns: - "**/test/**" - "**/tests/**" - "**/__tests__/**" - "**/*.test.*" - "**/*.spec.*" + # Codegen output — protobuf codec + generated stubs dominate and add no + # hand-written signal + - "**/generated/**" + - "**/*.pb.*" skip_dirs: - node_modules - dist @@ -100,35 +130,43 @@ sources: tools: - name: search-docs type: search - description: "Search the server's documentation to retrieve relevant information. This is a semantic search, so prefer performing multiple queries with different phrases instead of a single long query, until you find all the context you need." + description: "Search the CopilotKit product documentation (https://docs.copilotkit.ai) — guides, concepts, quickstarts, API reference, and how-tos for building with CopilotKit. Use this for CopilotKit usage and configuration questions. NOT for AG-UI protocol docs (use search-ag-ui-docs) and NOT for source code (use search-code). This is a semantic search, so prefer performing multiple queries with different phrases instead of a single long query, until you find all the context you need." 
source: docs default_limit: 5 max_limit: 20 result_format: docs + search_mode: hybrid + min_score: 0.3 - name: search-code type: search - description: "Search the server's indexed codebase to find relevant code snippets and implementations. Use this tool when you need to understand how something is implemented, find code examples, or locate specific functionality in the codebase. This is a semantic search, so prefer performing multiple queries with different phrases instead of a single long query, until you find all the context you need." + description: "Search the CopilotKit/CopilotKit source code (React UI, hooks, runtime, CSS, framework adapters) to understand how something is implemented or find code examples. Covers library/package source only — NOT example or showcase apps, NOT generated .d.ts type declarations, and NOT the AG-UI protocol SDK (use search-ag-ui-code for that). This is a semantic search, so prefer performing multiple queries with different phrases instead of a single long query, until you find all the context you need." source: code default_limit: 10 max_limit: 20 result_format: code + search_mode: hybrid + min_score: 0.3 - name: search-ag-ui-docs type: search - description: "Search the AG-UI protocol documentation to retrieve relevant information about the Agent-User Interaction protocol, events, types, and integration guides. This is a semantic search, so prefer performing multiple queries with different phrases instead of a single long query, until you find all the context you need." + description: "Search the AG-UI protocol documentation (https://docs.ag-ui.com) — the Agent-User Interaction protocol spec, event types, message schemas, and framework integration guides. Use this for protocol-level questions about AG-UI. NOT for CopilotKit product docs (use search-docs) and NOT for source code (use search-ag-ui-code). 
This is a semantic search, so prefer performing multiple queries with different phrases instead of a single long query, until you find all the context you need." source: ag-ui-docs default_limit: 5 max_limit: 20 result_format: docs + search_mode: hybrid + min_score: 0.3 - name: search-ag-ui-code type: search - description: "Search the AG-UI TypeScript SDK codebase to find relevant code snippets and implementations. Use this tool to understand AG-UI protocol types, events, encoders, and client implementations. This is a semantic search, so prefer performing multiple queries with different phrases instead of a single long query, until you find all the context you need." + description: "Search the AG-UI protocol SDK source code across languages — TypeScript SDK packages, the Python SDK and framework integration adapters (LangGraph, CrewAI, etc.), and the JVM/multi-language community SDK. Use this to understand AG-UI protocol types, events, encoders, and client/adapter implementations. This is AG-UI SDK code only — NOT CopilotKit React/CSS/hooks (use search-code) and NOT prose docs (use search-ag-ui-docs). This is a semantic search, so prefer performing multiple queries with different phrases instead of a single long query, until you find all the context you need." source: ag-ui-code default_limit: 10 max_limit: 20 result_format: code + search_mode: hybrid + min_score: 0.3 - name: explore-docs type: bash @@ -194,9 +232,12 @@ webhook: - ag-ui-code path_triggers: docs: - - "docs/" + - "showcase/shell-docs/src/content/" code: [] ag-ui-docs: - "docs/" ag-ui-code: - "sdks/typescript/packages/" + - "integrations/" + - "sdks/python/" + - "sdks/community/" diff --git a/docs/analytics.html b/docs/analytics.html index eb10e2c..ce460dc 100644 --- a/docs/analytics.html +++ b/docs/analytics.html @@ -741,8 +741,11 @@

Analytics Token

})(); // --- Constants / helpers --- - // Upper bound used by the "All time" preset. Server-side MAX_DAYS must - // stay >= this value. + // Sentinel "days" value the "All time" preset sends. MUST match + // ALL_TIME_DAYS in src/db/analytics.ts (the canonical source) — the DB + // layer treats a window >= this value as "no lower time bound" so the + // totals span every row. Server-side MAX_DAYS stays strictly above this + // value so the preset is admitted rather than 400'd. var ALL_TIME_DAYS = 99999; // Dedicated error so load()'s catch block can distinguish auth failures diff --git a/scripts/seed-analytics.ts b/scripts/seed-analytics.ts index bb6ea0b..72f7c50 100644 --- a/scripts/seed-analytics.ts +++ b/scripts/seed-analytics.ts @@ -97,9 +97,20 @@ interface SeedRow { latency_ms: number; source_name: string; session_id: string | null; + request_source: string | null; created_at: Date; } +// Request-origin mix: mostly real users, a slice of synthetic/analysis traffic +// plus some untagged (null) rows to mimic historical data predating the column. +function pickRequestSource(): string | null { + const r = Math.random(); + if (r < 0.7) return "user"; + if (r < 0.82) return "synthetic"; + if (r < 0.9) return "analysis"; + return null; // untagged historical row +} + function generateRow(): SeedRow { const isEmptyResult = Math.random() < 0.15; // ~15% empty const resultCount = isEmptyResult ? 0 : randomInt(1, 20); @@ -115,6 +126,7 @@ function generateRow(): SeedRow { latency_ms: randomInt(50, 500), source_name: pick(SOURCE_NAMES), session_id: Math.random() < 0.6 ? 
`sess_${randomInt(1000, 9999)}` : null, + request_source: pickRequestSource(), created_at: randomTimestamp(), }; } @@ -139,8 +151,8 @@ async function main() { for (const row of rows) { await pool.query( - `INSERT INTO query_log (tool_name, query_text, result_count, top_score, latency_ms, source_name, session_id, created_at) - VALUES ($1, $2, $3, $4, $5, $6, $7, $8)`, + `INSERT INTO query_log (tool_name, query_text, result_count, top_score, latency_ms, source_name, session_id, request_source, created_at) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)`, [ row.tool_name, row.query_text, @@ -149,6 +161,7 @@ async function main() { row.latency_ms, row.source_name, row.session_id, + row.request_source, row.created_at, ], ); diff --git a/scripts/test-path-filter.ts b/scripts/test-path-filter.ts index d33832c..46da9ab 100644 --- a/scripts/test-path-filter.ts +++ b/scripts/test-path-filter.ts @@ -37,7 +37,11 @@ function makeFileSourceConfig( }; } -// Code source config — matches the real pathfinder.yaml code source patterns +// Code source config — mirrors the real deploy/copilotkit-docs.yaml `code` +// source patterns (file_patterns + exclude_patterns). Keep in sync with that +// file: the excludes drop the v1 packages, every flavor of test file, the +// demo/example and showcase apps, the Next.js build output, and generated +// .d.ts declarations. const codeConfig = makeFileSourceConfig( ['**/*.ts', '**/*.tsx', '**/*.js', '**/*.jsx', '**/*.py'], [ @@ -48,10 +52,16 @@ const codeConfig = makeFileSourceConfig( '**/__tests__/**', '**/*.test.*', '**/*.spec.*', + 'examples/**', + 'showcase/**', + '**/.next/**', + '**/*.d.ts', ], ); -// Docs source config — only *.mdx, no excludes +// Docs source config — mirrors the real `docs` source: only *.mdx, no excludes. +// (The source path is showcase/shell-docs/src/content/docs/, so derived paths +// live under that tree.) 
const docsConfig = makeFileSourceConfig(['**/*.mdx']); console.log('=== Path Filter Tests ===\n'); @@ -79,17 +89,27 @@ assert(!matchesPatterns('packages/v2/runtime/src/runtime.spec.ts', codeConfig), assert(!matchesPatterns('packages/v2/runtime/src/runtime.test.tsx', codeConfig), '*.test.tsx excluded'); assert(!matchesPatterns('examples/with-agno/tests/test_agent.py', codeConfig), 'Python test dir excluded'); +// --- demo/example, showcase, build output, and generated decls excluded --- +console.log('\n--- examples/showcase/.next/.d.ts exclusion ---'); +assert(!matchesPatterns('examples/with-agno/src/agent.py', codeConfig), 'examples/ src excluded'); +assert(!matchesPatterns('examples/coagents-research-canvas/ui/src/page.tsx', codeConfig), 'examples/ deep path excluded'); +assert(!matchesPatterns('showcase/shell-docs/src/lib/util.ts', codeConfig), 'showcase/ excluded'); +assert(!matchesPatterns('packages/v2/react/.next/server/app/page.js', codeConfig), '**/.next/** build output excluded'); +assert(!matchesPatterns('packages/v2/runtime/dist/index.d.ts', codeConfig), '**/*.d.ts generated decls excluded'); +assert(!matchesPatterns('packages/v2/runtime/src/types.d.ts', codeConfig), '.d.ts excluded even alongside sources'); + // --- non-test files included --- console.log('\n--- non-test files included ---'); assert(matchesPatterns('packages/v2/runtime/src/runtime.ts', codeConfig), 'normal src file included'); -assert(matchesPatterns('examples/with-agno/src/agent.py', codeConfig), 'example src file included'); +assert(matchesPatterns('packages/shared/src/utils.ts', codeConfig), 'library package src included'); assert(matchesPatterns('src/index.ts', codeConfig), 'root src file included'); // --- docs (*.mdx only) --- +// Paths reflect the current docs source tree: showcase/shell-docs/src/content/docs/ console.log('\n--- docs (*.mdx only) ---'); -assert(matchesPatterns('docs/content/docs/(root)/quickstart.mdx', docsConfig), 'docs included'); 
-assert(matchesPatterns('docs/content/docs/reference/v1/hooks.mdx', docsConfig), 'v1 docs included (no code excludes on docs)'); -assert(!matchesPatterns('docs/content/docs/reference/v1/hooks.ts', docsConfig), 'ts file not matched by docs config'); +assert(matchesPatterns('showcase/shell-docs/src/content/docs/(root)/quickstart.mdx', docsConfig), 'docs included'); +assert(matchesPatterns('showcase/shell-docs/src/content/docs/reference/hooks.mdx', docsConfig), 'nested docs included (no code excludes on docs)'); +assert(!matchesPatterns('showcase/shell-docs/src/content/docs/reference/hooks.ts', docsConfig), 'ts file not matched by docs config'); // --- edge cases --- console.log('\n--- edge cases ---'); diff --git a/src/__tests__/analytics-endpoints.test.ts b/src/__tests__/analytics-endpoints.test.ts index e84775c..9f4e793 100644 --- a/src/__tests__/analytics-endpoints.test.ts +++ b/src/__tests__/analytics-endpoints.test.ts @@ -9,12 +9,22 @@ const mockGetTopQueries = vi.fn(); const mockGetEmptyQueries = vi.fn(); const mockGetToolCounts = vi.fn(); -vi.mock("../db/analytics.js", () => ({ - getAnalyticsSummary: (...args: unknown[]) => mockGetAnalyticsSummary(...args), - getTopQueries: (...args: unknown[]) => mockGetTopQueries(...args), - getEmptyQueries: (...args: unknown[]) => mockGetEmptyQueries(...args), - getToolCounts: (...args: unknown[]) => mockGetToolCounts(...args), -})); +// Pure request-source helpers are imported by server.ts (parseAnalyticsFilter +// + requestSourceFromHeaders). They carry no DB dependency, so the mock +// re-exports the real implementations via importOriginal rather than stubbing +// them — otherwise server.ts blows up on `REQUEST_SOURCE_VALUES` being +// undefined. The reader functions (getAnalyticsSummary etc.) stay mocked. 
+vi.mock("../db/analytics.js", async (importOriginal) => { + const actual = await importOriginal<typeof import("../db/analytics.js")>(); + return { + ...actual, + getAnalyticsSummary: (...args: unknown[]) => + mockGetAnalyticsSummary(...args), + getTopQueries: (...args: unknown[]) => mockGetTopQueries(...args), + getEmptyQueries: (...args: unknown[]) => mockGetEmptyQueries(...args), + getToolCounts: (...args: unknown[]) => mockGetToolCounts(...args), + }; +}); // Mock config — analyticsAuth now uses getAnalyticsConfig vi.mock("../config.js", () => ({ @@ -50,6 +60,7 @@ import { getAnalyticsConfig, getConfig } from "../config.js"; import { analyticsAuth, parseAnalyticsFilter, + requestSourceFromHeaders, __resetAnalyticsTokenForTesting, MAX_DAYS, } from "../server.js"; @@ -520,6 +531,49 @@ describe("analyticsAuth middleware", () => { }); }); +// --------------------------------------------------------------------------- +// requestSourceFromHeaders — X-Pathfinder-Source capture at MCP init +// --------------------------------------------------------------------------- + +describe("requestSourceFromHeaders", () => { + function mkReq(headers: Record<string, string | string[]>): Request { + return { headers } as unknown as Request; + } + + it("reads the X-Pathfinder-Source header (Express lower-cases it)", () => { + expect( + requestSourceFromHeaders(mkReq({ "x-pathfinder-source": "synthetic" })), + ).toBe("synthetic"); + expect( + requestSourceFromHeaders(mkReq({ "x-pathfinder-source": "analysis" })), + ).toBe("analysis"); + }); + + it("normalizes case/whitespace", () => { + expect( + requestSourceFromHeaders(mkReq({ "x-pathfinder-source": " User " })), + ).toBe("user"); + }); + + it("defaults to 'user' when the header is absent", () => { + expect(requestSourceFromHeaders(mkReq({}))).toBe("user"); + }); + + it("defaults to 'user' for an unrecognized value", () => { + expect( + requestSourceFromHeaders(mkReq({ "x-pathfinder-source": "crawler" })), + ).toBe("user"); + }); + + it("handles an array-shaped header value defensively (takes 
first)", () => { + expect( + requestSourceFromHeaders( + mkReq({ "x-pathfinder-source": ["analysis", "user"] }), + ), + ).toBe("analysis"); + }); +}); + // --------------------------------------------------------------------------- // parseAnalyticsFilter — from/to validation // --------------------------------------------------------------------------- @@ -742,6 +796,7 @@ describe("parseAnalyticsFilter from/to validation", () => { ["limit", { limit: ["10", "20"] }], ["tool_type", { tool_type: ["search", "collect"] }], ["source", { source: ["docs", "api"] }], + ["request_source", { request_source: ["user", "synthetic"] }], ])( "rejects array query param `%s` (Express multi-value) with 400", (_name, query) => { @@ -754,6 +809,53 @@ describe("parseAnalyticsFilter from/to validation", () => { }, ); + // --------------------------------------------------------------------------- + // request_source audience parameter + // --------------------------------------------------------------------------- + + it("omits request_source from the filter when the param is absent (default = real users downstream)", () => { + const result = parseAnalyticsFilter(mkReq({}) as Request); + expect(result.ok).toBe(true); + if (result.ok) { + expect(result.filter.request_source).toBeUndefined(); + } + }); + + it.each(["user", "synthetic", "analysis", "all"])( + "accepts request_source=%s", + (value) => { + const result = parseAnalyticsFilter( + mkReq({ request_source: value }) as Request, + ); + expect(result.ok).toBe(true); + if (result.ok) { + expect(result.filter.request_source).toBe(value); + } + }, + ); + + it("rejects an unknown request_source with 400 rather than silently defaulting", () => { + const result = parseAnalyticsFilter( + mkReq({ request_source: "robot" }) as Request, + ); + expect(result.ok).toBe(false); + if (!result.ok) { + expect(result.status).toBe(400); + expect(result.body.error).toBe("invalid_request"); + 
expect(result.body.error_description).toMatch(/request_source/); + } + }); + + it("rejects empty request_source with 400", () => { + const result = parseAnalyticsFilter( + mkReq({ request_source: "" }) as Request, + ); + expect(result.ok).toBe(false); + if (!result.ok) { + expect(result.status).toBe(400); + } + }); + // --------------------------------------------------------------------------- // Range-width cap. The server caps from/to span at MAX_DAYS so a client // can't request `from=1970-01-01&to=9999-12-31` and force a scan across diff --git a/src/__tests__/analytics-observability.test.ts b/src/__tests__/analytics-observability.test.ts new file mode 100644 index 0000000..fccf130 --- /dev/null +++ b/src/__tests__/analytics-observability.test.ts @@ -0,0 +1,442 @@ +import { describe, it, expect, beforeAll, afterAll, beforeEach } from "vitest"; +import { PGlite } from "@electric-sql/pglite"; +import { __setPoolForTesting, __resetPoolForTesting } from "../db/client.js"; +import { + getAnalyticsSummary, + getToolCounts, + getEmptyQueries, + logQuery, + ALL_TIME_DAYS, + ROLLING_WINDOW_CAP_DAYS, + BROWSE_QUERY_TEXT, +} from "../db/analytics.js"; +import { generatePostSchemaMigration } from "../db/schema.js"; + +// ----------------------------------------------------------------------------- +// Integration tests for the observability primitives (request_source + +// low-confidence) against a real in-process PGlite instance. +// +// Mock-pool unit tests pin the generated SQL strings + bound params, but they +// can't catch an off-by-one in the `$N` placeholder numbering — only running +// the SQL end-to-end does. These tests seed query_log with a known mix of +// origins and scores, then assert getAnalyticsSummary / getToolCounts return +// the right COUNTS, exercising every windowed subquery (which is where the +// request-source clause is spliced between the date-window and redacted +// params). 
+// +// Schema source of truth: generatePostSchemaMigration() — we slice out just +// the query_log section so we don't need the pgvector extension. +// ----------------------------------------------------------------------------- + +const QUERY_LOG_DDL_MARKER = + "-- Analytics: query_log table for tracking tool usage"; + +function extractQueryLogDdl(): string { + const full = generatePostSchemaMigration(); + const idx = full.indexOf(QUERY_LOG_DDL_MARKER); + if (idx < 0) { + throw new Error( + `Could not locate "${QUERY_LOG_DDL_MARKER}" in generatePostSchemaMigration(); ` + + `schema.ts may have been refactored — update the marker.`, + ); + } + return full.slice(idx); +} + +function poolFromPglite(db: PGlite) { + return { + query: (text: string, params?: unknown[]) => db.query(text, params), + connect: async () => ({ + query: (text: string, params?: unknown[]) => db.query(text, params), + release: () => {}, + }), + end: async () => db.close(), + }; +} + +/** A timestamp inside today's UTC day (noon, avoids TZ edge ambiguity). */ +function nowNoonUtc(): Date { + const day = new Date().toISOString().slice(0, 10); + return new Date(`${day}T12:00:00.000Z`); +} + +interface SeedOpts { + request_source?: string | null; + top_score?: number | null; + result_count?: number; + query_text?: string; +} + +async function seed(db: PGlite, count: number, opts: SeedOpts = {}) { + const createdAt = nowNoonUtc(); + for (let i = 0; i < count; i++) { + await db.query( + `INSERT INTO query_log + (tool_name, query_text, result_count, top_score, latency_ms, + source_name, session_id, request_source, created_at) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)`, + [ + "search-docs", + opts.query_text ?? "q", + opts.result_count ?? 5, + opts.top_score === undefined ? 0.9 : opts.top_score, + 42, + "docs", + "sess-1", + opts.request_source === undefined ? 
"user" : opts.request_source, + createdAt, + ], + ); + } +} + +describe("observability: request_source + low-confidence (PGlite integration)", () => { + let db: PGlite; + + beforeAll(async () => { + db = new PGlite(); + await db.waitReady; + await db.exec(extractQueryLogDdl()); + __setPoolForTesting(poolFromPglite(db)); + }); + + afterAll(async () => { + __resetPoolForTesting(); + await db.close(); + }); + + beforeEach(async () => { + await db.query("DELETE FROM query_log"); + }); + + // --------------------------------------------------------------------------- + // request_source default = real users (user + NULL), synthetic/analysis out + // --------------------------------------------------------------------------- + + it("default summary counts real users (user) AND untagged (NULL), excludes synthetic/analysis", async () => { + await seed(db, 10, { request_source: "user" }); + await seed(db, 4, { request_source: null }); // historical, untagged + await seed(db, 7, { request_source: "synthetic" }); + await seed(db, 3, { request_source: "analysis" }); + + const result = await getAnalyticsSummary({}, 7); + + // user (10) + null (4) = 14; synthetic/analysis excluded. + expect(result.total_queries_window).toBe(14); + // total_queries (all-time, all origins) counts everything. 
+ expect(result.total_queries).toBe(24); + }); + + it("request_source: 'all' counts every origin in the window", async () => { + await seed(db, 10, { request_source: "user" }); + await seed(db, 4, { request_source: null }); + await seed(db, 7, { request_source: "synthetic" }); + await seed(db, 3, { request_source: "analysis" }); + + const result = await getAnalyticsSummary({ request_source: "all" }, 7); + + expect(result.total_queries_window).toBe(24); + }); + + it("request_source: 'synthetic' counts only synthetic rows (NULL excluded)", async () => { + await seed(db, 10, { request_source: "user" }); + await seed(db, 4, { request_source: null }); + await seed(db, 7, { request_source: "synthetic" }); + + const result = await getAnalyticsSummary( + { request_source: "synthetic" }, + 7, + ); + + expect(result.total_queries_window).toBe(7); + }); + + it("request_source: 'user' explicitly still folds in NULL rows (real users)", async () => { + await seed(db, 10, { request_source: "user" }); + await seed(db, 4, { request_source: null }); + await seed(db, 7, { request_source: "synthetic" }); + + const result = await getAnalyticsSummary({ request_source: "user" }, 7); + + expect(result.total_queries_window).toBe(14); + }); + + it("getToolCounts honors the real-users default and the all-sources view", async () => { + await seed(db, 6, { request_source: "user" }); + await seed(db, 2, { request_source: null }); + await seed(db, 5, { request_source: "synthetic" }); + + const defaultCounts = await getToolCounts(7); + const totalDefault = defaultCounts.reduce((a, c) => a + c.count, 0); + expect(totalDefault).toBe(8); // user + null + + const allCounts = await getToolCounts(7, { request_source: "all" }); + const totalAll = allCounts.reduce((a, c) => a + c.count, 0); + expect(totalAll).toBe(13); + }); + + // --------------------------------------------------------------------------- + // low-confidence: result_count > 0 AND top_score < 0.5 (NULL excluded) + // 
--------------------------------------------------------------------------- + + it("counts low-confidence rows (result_count>0 AND top_score<0.5), excludes NULL-score and empty", async () => { + // 5 strong hits (0.9), 3 low-confidence (0.3), 2 empty (0 results, null + // score), 1 borderline (exactly 0.5 — NOT low because predicate is < 0.5). + await seed(db, 5, { top_score: 0.9, result_count: 5 }); + await seed(db, 3, { top_score: 0.3, result_count: 4 }); + await seed(db, 2, { top_score: null, result_count: 0 }); + await seed(db, 1, { top_score: 0.5, result_count: 4 }); + + const result = await getAnalyticsSummary({}, 7); + + // Only the 3 rows scoring 0.3 are low-confidence. + expect(result.low_confidence_count_window).toBe(3); + // 11 rows total in the window, 3 low-confidence. + expect(result.total_queries_window).toBe(11); + expect(result.low_confidence_rate_window).toBeCloseTo(3 / 11); + }); + + it("low-confidence respects the request-source default (synthetic low-conf excluded)", async () => { + await seed(db, 2, { + top_score: 0.3, + result_count: 4, + request_source: "user", + }); + await seed(db, 5, { + top_score: 0.3, + result_count: 4, + request_source: "synthetic", + }); + + const result = await getAnalyticsSummary({}, 7); + + // Only the 2 real-user low-confidence rows count by default. 
+ expect(result.low_confidence_count_window).toBe(2); + }); + + it("a NULL top_score with results is NOT low-confidence (absence of score != low score)", async () => { + await seed(db, 4, { top_score: null, result_count: 5 }); + + const result = await getAnalyticsSummary({}, 7); + + expect(result.low_confidence_count_window).toBe(0); + }); + + // --------------------------------------------------------------------------- + // logQuery → readers round-trip (writer persists request_source end-to-end) + // --------------------------------------------------------------------------- + + it("logQuery persists request_source so the readers can filter on it", async () => { + await logQuery({ + tool_name: "search-docs", + query_text: "q", + result_count: 5, + top_score: 0.9, + latency_ms: 10, + source_name: "docs", + session_id: "live-1", + request_source: "analysis", + }); + await logQuery({ + tool_name: "search-docs", + query_text: "q", + result_count: 5, + top_score: 0.9, + latency_ms: 10, + source_name: "docs", + session_id: "live-2", + request_source: "user", + }); + + // Default view sees only the real user row. + const def = await getAnalyticsSummary({}, 7); + expect(def.total_queries_window).toBe(1); + + // Analysis view sees only the analysis row. + const analysis = await getAnalyticsSummary( + { request_source: "analysis" }, + 7, + ); + expect(analysis.total_queries_window).toBe(1); + + // Confirm the writer actually stored session_id (no longer hardcoded null) + // and the request_source column. 
+ const { rows } = await db.query<{ + session_id: string | null; + request_source: string | null; + }>("SELECT session_id, request_source FROM query_log ORDER BY session_id"); + expect(rows.map((r) => r.session_id)).toEqual(["live-1", "live-2"]); + expect(rows.map((r) => r.request_source)).toEqual(["analysis", "user"]); + }); +}); + +// --------------------------------------------------------------------------- +// All-time window + range-mode per-day cap + browse-sentinel exclusion + +// summary numeric coercion (PGlite integration). +// +// These exercise the dashboard-consistency fixes that need real SQL against a +// dataset spanning > ROLLING_WINDOW_CAP_DAYS — a mock pool can't validate that +// the lower-bound clause is genuinely omitted for the all-time sentinel. +// --------------------------------------------------------------------------- + +/** ISO "YYYY-MM-DD" for a UTC day offset from today (negative = past). */ +function utcDayStringOffset(offsetDays: number): string { + const d = new Date(); + d.setUTCDate(d.getUTCDate() + offsetDays); + return d.toISOString().slice(0, 10); +} + +/** Noon UTC of the given day offset — avoids any TZ edge ambiguity. */ +function utcNoonOfOffset(offsetDays: number): Date { + return new Date(`${utcDayStringOffset(offsetDays)}T12:00:00.000Z`); +} + +async function seedAt( + db: PGlite, + createdAt: Date, + opts: SeedOpts = {}, +): Promise<void> { + await db.query( + `INSERT INTO query_log + (tool_name, query_text, result_count, top_score, latency_ms, + source_name, session_id, request_source, created_at) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)`, + [ + "search-docs", + opts.query_text ?? "q", + opts.result_count ?? 5, + opts.top_score === undefined ? 0.9 : opts.top_score, + 42, + "docs", + "sess-1", + opts.request_source === undefined ? 
"user" : opts.request_source, + createdAt, + ], + ); +} + +describe("all-time window + dashboard consistency (PGlite integration)", () => { + let db: PGlite; + + beforeAll(async () => { + db = new PGlite(); + await db.waitReady; + await db.exec(extractQueryLogDdl()); + __setPoolForTesting(poolFromPglite(db)); + }); + + afterAll(async () => { + __resetPoolForTesting(); + await db.close(); + }); + + beforeEach(async () => { + await db.query("DELETE FROM query_log"); + }); + + // -- Fix #1: "All time" truly spans all data (no 366-day clamp) ------------ + + it("all-time (days=ALL_TIME_DAYS) totals cover ALL rows, not just the 366-day cap", async () => { + // Three rows: today, ~200 days back (inside the cap), ~700 days back + // (WELL outside the 366-day rolling cap). Pre-fix, the LEAST(N,366) + // clamp dropped the 700-day row from every windowed card even though + // the user explicitly asked for "all time". + await seedAt(db, utcNoonOfOffset(0)); + await seedAt(db, utcNoonOfOffset(-200)); + await seedAt(db, utcNoonOfOffset(-700)); + + const result = await getAnalyticsSummary({}, ALL_TIME_DAYS); + + // All three rows are inside an unbounded "all time" window. + expect(result.total_queries_window).toBe(3); + // Sanity: the all-time total equals the windowed total here (single source). + expect(result.total_queries).toBe(3); + }); + + it("all-time empty-result rate is computed over ALL rows (>366 days)", async () => { + // 1 hit + 1 empty inside the cap, 1 empty far outside it. Pre-fix the + // far-outside empty was invisible, skewing the rate denominator. 
+ await seedAt(db, utcNoonOfOffset(0), { result_count: 5 }); + await seedAt(db, utcNoonOfOffset(-100), { result_count: 0 }); + await seedAt(db, utcNoonOfOffset(-700), { result_count: 0 }); + + const result = await getAnalyticsSummary({}, ALL_TIME_DAYS); + + expect(result.total_queries_window).toBe(3); + expect(result.empty_result_count_window).toBe(2); + expect(result.empty_result_rate_window).toBeCloseTo(2 / 3); + }); + + it("a sub-all-time window (days=1000) still clamps to the 366-day cap", async () => { + // Regression guard: only the all-time sentinel uncaps. A large-but-finite + // window must still clamp so payload/score behavior is unchanged. + await seedAt(db, utcNoonOfOffset(-200)); // inside cap + await seedAt(db, utcNoonOfOffset(-700)); // outside cap + + const result = await getAnalyticsSummary({}, 1000); + + expect(result.total_queries_window).toBe(1); + }); + + // -- Fix #2: range-mode per-day series is capped -------------------------- + + it("range-mode per-day series is capped at ROLLING_WINDOW_CAP_DAYS (no payload bloat)", async () => { + // A range far wider than the cap must not emit one bar per day for the + // whole span. Pre-fix this produced (to-from) daily rows uncapped. 
+ const from = utcNoonOfOffset(-3000); + const to = utcNoonOfOffset(0); + await seedAt(db, utcNoonOfOffset(-1)); + + const result = await getAnalyticsSummary({ from, to }, 7); + + expect(result.queries_per_day_window.length).toBeLessThanOrEqual( + ROLLING_WINDOW_CAP_DAYS, + ); + }); + + it("range-mode narrow span still emits one bar per day (cap doesn't shrink small ranges)", async () => { + const from = utcNoonOfOffset(-4); + const to = utcNoonOfOffset(0); // 5-day inclusive span + await seedAt(db, utcNoonOfOffset(-2)); + + const result = await getAnalyticsSummary({ from, to }, 7); + + expect(result.queries_per_day_window).toHaveLength(5); + }); + + // -- Fix #3: sentinel excluded from empty-queries ---------------- + + it("getEmptyQueries excludes the sentinel", async () => { + // A browse call that returned zero FAQ entries logs query_text="" + // with result_count=0. It must NOT surface as a literal "" row in + // the empty-result dashboard. + await seedAt(db, utcNoonOfOffset(0), { + query_text: BROWSE_QUERY_TEXT, + result_count: 0, + }); + await seedAt(db, utcNoonOfOffset(0), { + query_text: "real empty query", + result_count: 0, + }); + + const rows = await getEmptyQueries(7, 50); + const texts = rows.map((r) => r.query_text); + + expect(texts).not.toContain(BROWSE_QUERY_TEXT); + expect(texts).toContain("real empty query"); + }); + + // -- Fix #4: summary numeric fields are coerced (no NaN) ------------------ + + it("summary numeric rates are finite numbers (driver-typing coercion)", async () => { + await seedAt(db, utcNoonOfOffset(0), { result_count: 0 }); + await seedAt(db, utcNoonOfOffset(0), { top_score: 0.3, result_count: 4 }); + + const result = await getAnalyticsSummary({}, 7); + + expect(Number.isFinite(result.empty_result_rate_window)).toBe(true); + expect(Number.isFinite(result.low_confidence_rate_window)).toBe(true); + expect(Number.isFinite(result.avg_latency_ms_window)).toBe(true); + 
expect(Number.isFinite(result.total_queries_window)).toBe(true); + }); +}); diff --git a/src/__tests__/analytics.test.ts b/src/__tests__/analytics.test.ts index f1ba6cb..7e3e34b 100644 --- a/src/__tests__/analytics.test.ts +++ b/src/__tests__/analytics.test.ts @@ -15,6 +15,13 @@ import { cleanupOldQueryLogs, REDACTED_QUERY_TEXT, P95_LATENCY_ROW_CAP, + LOW_CONFIDENCE_SCORE_THRESHOLD, + normalizeRequestSource, + DEFAULT_REQUEST_SOURCE, + REQUEST_SOURCE_VALUES, + ALL_TIME_DAYS, + ROLLING_WINDOW_CAP_DAYS, + BROWSE_QUERY_TEXT, } from "../db/analytics.js"; import type { QueryLogEntry } from "../db/analytics.js"; @@ -39,6 +46,7 @@ describe("logQuery", () => { latency_ms: 42, source_name: "docs", session_id: "sess-123", + request_source: "user", }; it("inserts a row with all fields", async () => { @@ -48,6 +56,7 @@ describe("logQuery", () => { expect(mockQuery).toHaveBeenCalledTimes(1); const [sql, params] = mockQuery.mock.calls[0]; expect(sql).toContain("INSERT INTO query_log"); + expect(sql).toContain("request_source"); expect(params).toEqual([ "search-docs", "how to install", @@ -56,6 +65,7 @@ describe("logQuery", () => { 42, "docs", "sess-123", + "user", ]); }); @@ -76,6 +86,7 @@ describe("logQuery", () => { baseEntry.latency_ms, baseEntry.source_name, baseEntry.session_id, + baseEntry.request_source, ]); // And pin the literal so the constant can never silently drift to a // different sentinel that downstream reads wouldn't recognize. @@ -96,6 +107,45 @@ describe("logQuery", () => { expect(params[5]).toBeNull(); // source_name expect(params[6]).toBeNull(); // session_id }); + + it("persists the session_id passed on the entry (no longer hardcoded null)", async () => { + // Regression for the observability gap: session_id used to be dropped + // (always null in query_log). The writer must persist whatever the tool + // handler threads through from the MCP session context. 
+ mockQuery.mockResolvedValueOnce({ rows: [] }); + await logQuery({ ...baseEntry, session_id: "live-session-42" }); + + const [, params] = mockQuery.mock.calls[0]; + expect(params[6]).toBe("live-session-42"); + }); + + it("coerces an unknown request_source to the default ('user')", async () => { + // The column should only ever hold a known origin going forward. A bogus + // header value must not land verbatim — it normalizes to 'user'. + mockQuery.mockResolvedValueOnce({ rows: [] }); + await logQuery({ ...baseEntry, request_source: "bogus-origin" }); + + const [, params] = mockQuery.mock.calls[0]; + expect(params[7]).toBe("user"); + }); + + it("coerces an absent request_source to the default ('user')", async () => { + mockQuery.mockResolvedValueOnce({ rows: [] }); + const { request_source: _omit, ...noSource } = baseEntry; + void _omit; + await logQuery(noSource); + + const [, params] = mockQuery.mock.calls[0]; + expect(params[7]).toBe("user"); + }); + + it("persists a synthetic request_source verbatim", async () => { + mockQuery.mockResolvedValueOnce({ rows: [] }); + await logQuery({ ...baseEntry, request_source: "synthetic" }); + + const [, params] = mockQuery.mock.calls[0]; + expect(params[7]).toBe("synthetic"); + }); }); // --------------------------------------------------------------------------- @@ -305,6 +355,53 @@ describe("getAnalyticsSummary earliest_query_day", () => { }); }); +// --------------------------------------------------------------------------- +// getAnalyticsSummary numeric coercion (Finding #4) +// +// node-postgres deserializes integer/numeric columns as JS strings, while +// PGlite returns numbers. The summary path must coerce the count/avg fields +// to finite numbers so a string driver value can't leak into +// total_queries_window or produce a "NaN" rate. The `::int` casts make this +// defensive, but getTopQueries already guards its numerics the same way. 
+// --------------------------------------------------------------------------- + +describe("getAnalyticsSummary numeric coercion", () => { + it("coerces string-typed driver numerics (pg) to finite numbers in the response", async () => { + // Simulate node-postgres returning the windowed summary columns as + // STRINGS. Pre-coercion, total_queries_window would be the string "200" + // and the rate fields would be computed from string operands. + mockQuery + .mockResolvedValueOnce({ rows: [{ count: "1000" }] }) // total (string) + .mockResolvedValueOnce({ + rows: [ + { total: "200", empty: "10", low_confidence: "4", avg_latency: "45" }, + ], + }) // windowed summary (all strings) + .mockResolvedValueOnce({ rows: [] }) // latency rows + .mockResolvedValueOnce({ rows: [] }) // by source + .mockResolvedValueOnce({ rows: [] }) // per day + .mockResolvedValueOnce({ rows: [{ earliest_day: null }] }); // earliest day + + const result = await getAnalyticsSummary(); + + // Strict number equality (===) — a string "1000" would fail toBe(1000). + expect(result.total_queries).toBe(1000); + expect(result.total_queries_window).toBe(200); + expect(result.empty_result_count_window).toBe(10); + expect(result.low_confidence_count_window).toBe(4); + expect(result.avg_latency_ms_window).toBe(45); + // Rates are real numbers, never NaN, computed from coerced operands. + expect(result.empty_result_rate_window).toBeCloseTo(10 / 200); + expect(result.low_confidence_rate_window).toBeCloseTo(4 / 200); + expect(Number.isFinite(result.empty_result_rate_window)).toBe(true); + expect(Number.isFinite(result.low_confidence_rate_window)).toBe(true); + // typeof guards lock the coercion: the response must expose numbers, not + // the driver's string passthrough. 
+ expect(typeof result.total_queries_window).toBe("number"); + expect(typeof result.avg_latency_ms_window).toBe("number"); + }); +}); + // --------------------------------------------------------------------------- // p95 computation edge cases (tested indirectly via getAnalyticsSummary) // --------------------------------------------------------------------------- @@ -359,6 +456,61 @@ describe("p95 computation edge cases", () => { // floor(2 * 0.95) = 1, sorted[1] = 100 expect(result.p95_latency_ms_window).toBe(100); }); + + it("coerces string-typed latency rows (node-postgres) into a numeric p95", async () => { + // H2 regression: node-postgres deserializes numeric/bigint columns as + // STRINGS, so the latency rows can arrive as `{ latency_ms: "100" }`. The + // pre-fix mapping (`r.latency_ms as number`) is a no-op at runtime and left + // the strings in place, so computeP95 returned a STRING (e.g. "2000") and, + // worse, a future non-numeric-subtraction sort path could mis-order. Coerce + // each latency through toFiniteNumber/::int so p95 is always a real number. + mockQuery + .mockResolvedValueOnce({ rows: [{ count: 4 }] }) + .mockResolvedValueOnce({ + rows: [{ total: 4, empty: 0, avg_latency: 50 }], + }) + .mockResolvedValueOnce({ + rows: [ + { latency_ms: "50" }, + { latency_ms: "100" }, + { latency_ms: "1000" }, + { latency_ms: "2000" }, + ], + }) + .mockResolvedValueOnce({ rows: [] }) + .mockResolvedValueOnce({ rows: [] }) + .mockResolvedValueOnce({ rows: [{ earliest_day: null }] }); + + const result = await getAnalyticsSummary(); + // floor(4 * 0.95) = 3 → sorted[3] = 2000. + expect(result.p95_latency_ms_window).toBe(2000); + // And it must be a real number, not the raw "2000" string. 
+ expect(typeof result.p95_latency_ms_window).toBe("number"); + }); + + it("documents the n=20 boundary: floor(n*0.95)=19 selects the MAX element", async () => { + // Finding #5 (comment-accuracy lock): computeP95 uses `floor(n*0.95)`, + // which for n=20 yields index 19 — the largest sample — not the standard + // nearest-rank `ceil(0.95*n)-1 = 18`. We keep the existing behavior (the + // dashboard is calibrated to it) and only require the source comment to + // describe it accurately. This test pins the boundary so the comment and + // behavior can never drift apart: latencies 1..20 → p95 == 20. + mockQuery + .mockResolvedValueOnce({ rows: [{ count: 20 }] }) + .mockResolvedValueOnce({ + rows: [{ total: 20, empty: 0, avg_latency: 10 }], + }) + .mockResolvedValueOnce({ + rows: Array.from({ length: 20 }, (_, i) => ({ latency_ms: i + 1 })), + }) + .mockResolvedValueOnce({ rows: [] }) + .mockResolvedValueOnce({ rows: [] }) + .mockResolvedValueOnce({ rows: [{ earliest_day: null }] }); + + const result = await getAnalyticsSummary(); + // floor(20 * 0.95) = 19 → sorted[19] = 20 (the max of 1..20). + expect(result.p95_latency_ms_window).toBe(20); + }); }); // --------------------------------------------------------------------------- @@ -562,10 +714,12 @@ describe("cleanupOldQueryLogs", () => { }); it("uses <= boundary so retention-edge rows aren't leaked", async () => { - // The rolling-window reads use `created_at > NOW() - INTERVAL`. If - // cleanup used a strict `<`, rows sitting exactly at the retention - // edge would be visible to reads forever but never get cleaned up. - // `<=` closes the partition so retention-edge rows are removed. + // The rolling-window reads use a UTC-calendar-day `created_at >=` + // lower bound (see buildDateWindow), while cleanup is a wall-clock + // NOW()-anchored purge. If cleanup used a strict `<`, rows sitting + // exactly at the retention edge would be visible to reads forever but + // never get cleaned up. 
`<=` closes the partition so retention-edge + // rows are removed. mockQuery.mockResolvedValueOnce({ rowCount: 0 }); await cleanupOldQueryLogs(90); const [sql] = mockQuery.mock.calls[0]; @@ -655,7 +809,8 @@ describe("getToolCounts", () => { const [sql, params] = mockQuery.mock.calls[0]; expect(sql).toContain("split_part(tool_name"); - expect(params).toEqual([7]); + // Default request-source filter appends "user" (real-users-by-default). + expect(params).toEqual([7, "user"]); }); it("returns empty array when no queries exist", async () => { @@ -1086,6 +1241,69 @@ describe("getAnalyticsSummary honors days window", () => { }); }); +describe("getAnalyticsSummary all-time window omits the lower bound", () => { + function mockSummaryQueries() { + mockQuery + .mockResolvedValueOnce({ rows: [{ count: 500 }] }) // total + .mockResolvedValueOnce({ + rows: [{ total: 100, empty: 5, avg_latency: 50 }], + }) // windowed summary + .mockResolvedValueOnce({ rows: [] }) // latency rows + .mockResolvedValueOnce({ rows: [] }) // by source + .mockResolvedValueOnce({ rows: [] }) // per day + .mockResolvedValueOnce({ rows: [{ earliest_day: null }] }); // earliest day + } + + it("ALL_TIME_DAYS is the all-time sentinel and is < the parser cap", () => { + // The sentinel must be a concrete value the dashboard + server agree on. + // 99999 matches docs/analytics.html ALL_TIME_DAYS and stays under + // server.ts MAX_DAYS=100000 so the "All time" preset never 400s. + expect(ALL_TIME_DAYS).toBe(99999); + }); + + it("omits `created_at >=` on the summary/latency/by-source subqueries at all-time", async () => { + // Finding #1: at the all-time sentinel the windowed aggregates must NOT + // clamp to the 366-day cap — the lower bound is dropped entirely so the + // totals truly span every row. Pre-fix the LEAST($N,366) clamp was always + // present, silently undercounting installs with > 366 days of history. 
+ mockSummaryQueries(); + await getAnalyticsSummary({}, ALL_TIME_DAYS); + + for (let i = 1; i < 4; i++) { + const [sql, params] = mockQuery.mock.calls[i]; + expect(sql).not.toContain("created_at >="); + // No clamp expression and no `days` param bound on the window. + expect(sql).not.toContain("LEAST"); + expect(params).not.toContain(ALL_TIME_DAYS); + } + }); + + it("still clamps a large-but-finite window (days < ALL_TIME_DAYS)", async () => { + // Only the sentinel uncaps. A finite 1000-day window must keep the + // UTC-calendar-day clamp so behavior for normal presets is unchanged. + mockSummaryQueries(); + await getAnalyticsSummary({}, 1000); + + const [sql, params] = mockQuery.mock.calls[1]; + expect(sql).toContain("created_at >="); + expect(sql).toContain("LEAST"); + expect(params).toContain(1000); + }); + + it("per-day series stays bounded at all-time (capped, not one bar per day of history)", async () => { + // The summary window uncaps, but the per-day chart must stay sensible: + // the series is still capped at ROLLING_WINDOW_CAP_DAYS so an install + // with years of history doesn't emit thousands of daily bars. + mockSummaryQueries(); + await getAnalyticsSummary({}, ALL_TIME_DAYS); + + const [perDaySql] = mockQuery.mock.calls[4]; + expect(perDaySql).toContain("generate_series"); + // Series lower bound is the capped UTC-calendar expression. + expect(perDaySql).toContain(String(ROLLING_WINDOW_CAP_DAYS)); + }); +}); + describe("getAnalyticsSummary with from/to range", () => { function mockSummaryQueries() { mockQuery @@ -1099,6 +1317,27 @@ describe("getAnalyticsSummary with from/to range", () => { .mockResolvedValueOnce({ rows: [{ earliest_day: null }] }); // earliest day } + it("caps the range-mode per-day series width at ROLLING_WINDOW_CAP_DAYS", async () => { + // Finding #2: range mode previously emitted an uncapped + // generate_series(from,to,'1 day'), so a multi-thousand-day range bloated + // the JSON payload with one row per day. 
The series upper bound must now + // be clamped so its width never exceeds ROLLING_WINDOW_CAP_DAYS. The + // summary/aggregate WHERE still honors the full user-chosen range. + mockSummaryQueries(); + const from = new Date("2017-01-01T00:00:00.000Z"); + const to = new Date("2026-04-20T23:59:59.999Z"); // ~9 years + await getAnalyticsSummary({ from, to }); + + const [perDaySql, perDayParams] = mockQuery.mock.calls[4]; + expect(perDaySql).toContain("generate_series"); + // The series is bounded by a LEAST(...) cap referencing the cap constant. + expect(perDaySql).toContain("LEAST"); + expect(perDaySql).toContain(String(ROLLING_WINDOW_CAP_DAYS)); + // The inner WHERE still binds the full range (summary spans everything). + expect(perDayParams).toContain(from); + expect(perDayParams).toContain(to); + }); + it("generates created_at >= / <= range clause and passes Date params", async () => { mockSummaryQueries(); const from = new Date("2026-04-01T00:00:00.000Z"); @@ -1211,7 +1450,8 @@ describe("getToolCounts with from/to range", () => { expect(sql).toContain("created_at >="); expect(sql).toContain("created_at <="); expect(sql).not.toContain("NOW() - INTERVAL"); - expect(params).toEqual([from, to]); + // Default request-source filter appends "user" after the range params. + expect(params).toEqual([from, to, "user"]); }); it("falls back to UTC-calendar-day rolling window when no range filter provided", async () => { @@ -1226,7 +1466,8 @@ describe("getToolCounts with from/to range", () => { expect(sql).toContain("(NOW() AT TIME ZONE 'UTC')::date"); expect(sql).toContain("LEAST"); expect(sql).not.toContain("NOW() - INTERVAL"); - expect(params).toEqual([14]); + // Default request-source filter appends "user" after the days param. 
+ expect(params).toEqual([14, "user"]); }); }); @@ -1328,3 +1569,204 @@ describe("getTopQueries null avg_result_count", () => { expect(result[0].avg_top_score).toBe(0); }); }); + +// --------------------------------------------------------------------------- +// normalizeRequestSource +// --------------------------------------------------------------------------- + +describe("normalizeRequestSource", () => { + it("passes through the canonical values", () => { + for (const v of REQUEST_SOURCE_VALUES) { + expect(normalizeRequestSource(v)).toBe(v); + } + }); + + it("lower-cases and trims before matching", () => { + expect(normalizeRequestSource(" Synthetic ")).toBe("synthetic"); + expect(normalizeRequestSource("ANALYSIS")).toBe("analysis"); + }); + + it("falls back to the default for unknown/absent values", () => { + expect(normalizeRequestSource("robot")).toBe(DEFAULT_REQUEST_SOURCE); + expect(normalizeRequestSource("")).toBe(DEFAULT_REQUEST_SOURCE); + expect(normalizeRequestSource(undefined)).toBe(DEFAULT_REQUEST_SOURCE); + expect(normalizeRequestSource(null)).toBe(DEFAULT_REQUEST_SOURCE); + }); + + it("default is 'user'", () => { + expect(DEFAULT_REQUEST_SOURCE).toBe("user"); + }); +}); + +// --------------------------------------------------------------------------- +// Low-confidence metric: result_count > 0 AND top_score < threshold +// --------------------------------------------------------------------------- + +describe("getAnalyticsSummary low-confidence metric", () => { + function mockSummaryQueries( + summaryRow: Record = { + total: 100, + empty: 5, + low_confidence: 12, + avg_latency: 50, + }, + ) { + mockQuery + .mockResolvedValueOnce({ rows: [{ count: 500 }] }) // total + .mockResolvedValueOnce({ rows: [summaryRow] }) // windowed summary + .mockResolvedValueOnce({ rows: [] }) // latency rows + .mockResolvedValueOnce({ rows: [] }) // by source + .mockResolvedValueOnce({ rows: [] }) // per day + .mockResolvedValueOnce({ rows: [{ earliest_day: null }] }); 
// earliest day + } + + it("surfaces low_confidence_count_window and rate from the summary subquery", async () => { + mockSummaryQueries(); + const result = await getAnalyticsSummary({}); + + expect(result.low_confidence_count_window).toBe(12); + // 12 / 100 + expect(result.low_confidence_rate_window).toBeCloseTo(0.12); + }); + + it("low_confidence_rate_window is 0 when the window has no rows", async () => { + mockSummaryQueries({ + total: 0, + empty: 0, + low_confidence: 0, + avg_latency: 0, + }); + const result = await getAnalyticsSummary({}); + + expect(result.low_confidence_count_window).toBe(0); + expect(result.low_confidence_rate_window).toBe(0); + }); + + it("counts low confidence as result_count > 0 AND top_score < threshold (NULL excluded)", async () => { + // The summary subquery must encode the exact predicate the brief calls + // for, and bind the threshold constant rather than inlining 0.5 so the + // module constant is the single source of truth. top_score IS NOT NULL is + // part of the FILTER so browse/keyword rows (no score) don't count. + mockSummaryQueries(); + await getAnalyticsSummary({}); + + // Index 1 is the summary subquery. 
+ const [sql, params] = mockQuery.mock.calls[1]; + expect(sql).toMatch(/result_count > 0/); + expect(sql).toMatch(/top_score IS NOT NULL/); + expect(sql).toMatch(/top_score < \$\d+/); + expect(params).toContain(LOW_CONFIDENCE_SCORE_THRESHOLD); + }); + + it("threshold constant is 0.5 (matches the brief)", () => { + expect(LOW_CONFIDENCE_SCORE_THRESHOLD).toBe(0.5); + }); +}); + +// --------------------------------------------------------------------------- +// Request-source audience filtering (default = real users) +// --------------------------------------------------------------------------- + +describe("getAnalyticsSummary request-source audience", () => { + function mockSummaryQueries() { + mockQuery + .mockResolvedValueOnce({ rows: [{ count: 500 }] }) // total + .mockResolvedValueOnce({ + rows: [{ total: 100, empty: 5, low_confidence: 0, avg_latency: 50 }], + }) // windowed summary + .mockResolvedValueOnce({ rows: [] }) // latency rows + .mockResolvedValueOnce({ rows: [] }) // by source + .mockResolvedValueOnce({ rows: [] }) // per day + .mockResolvedValueOnce({ rows: [{ earliest_day: null }] }); // earliest day + } + + it("defaults to real users: every windowed subquery includes (request_source = 'user' OR IS NULL)", async () => { + // No request_source on the filter → KPIs count real users only, but still + // include untagged historical rows (request_source IS NULL) so the + // back-compat guarantee holds. + mockSummaryQueries(); + await getAnalyticsSummary({}); + + // Indexes 1..4 are the windowed subqueries (summary, latency, by-source, + // per-day). Each must carry the default request-source clause + bind "user". 
+ for (let i = 1; i <= 4; i++) { + const [sql, params] = mockQuery.mock.calls[i]; + expect(sql).toContain("request_source = $"); + expect(sql).toContain("request_source IS NULL"); + expect(params).toContain("user"); + } + }); + + it("total_queries (index 0) is all-time and unaffected by the request-source default", async () => { + // The all-time total card counts every row regardless of origin — only the + // windowed cards default to real users. + mockSummaryQueries(); + await getAnalyticsSummary({}); + + const [sql] = mockQuery.mock.calls[0]; + expect(sql).not.toContain("request_source"); + }); + + it("request_source: 'all' applies NO request-source clause (every origin)", async () => { + mockSummaryQueries(); + await getAnalyticsSummary({ request_source: "all" }); + + for (let i = 1; i <= 4; i++) { + const [sql] = mockQuery.mock.calls[i]; + expect(sql).not.toContain("request_source"); + } + }); + + it("request_source: 'synthetic' uses an exact-match clause (NULL excluded)", async () => { + mockSummaryQueries(); + await getAnalyticsSummary({ request_source: "synthetic" }); + + for (let i = 1; i <= 4; i++) { + const [sql, params] = mockQuery.mock.calls[i]; + expect(sql).toContain("request_source = $"); + // Exact match only — NULL rows are real users, not synthetic. 
+ expect(sql).not.toContain("request_source IS NULL"); + expect(params).toContain("synthetic"); + } + }); +}); + +describe("request-source clause on top/empty/tool-count readers", () => { + it("getTopQueries defaults to real users (request_source = 'user' OR IS NULL)", async () => { + mockQuery.mockResolvedValueOnce({ rows: [] }); + await getTopQueries(7, 50); + + const [sql, params] = mockQuery.mock.calls[0]; + expect(sql).toContain("request_source = $"); + expect(sql).toContain("request_source IS NULL"); + expect(params).toContain("user"); + }); + + it("getEmptyQueries defaults to real users", async () => { + mockQuery.mockResolvedValueOnce({ rows: [] }); + await getEmptyQueries(7, 50); + + const [sql, params] = mockQuery.mock.calls[0]; + expect(sql).toContain("request_source = $"); + expect(sql).toContain("request_source IS NULL"); + expect(params).toContain("user"); + }); + + it("getToolCounts honors request_source: 'analysis' as exact match", async () => { + mockQuery.mockResolvedValueOnce({ rows: [] }); + await getToolCounts(7, { request_source: "analysis" }); + + const [sql, params] = mockQuery.mock.calls[0]; + expect(sql).toContain("request_source = $"); + expect(sql).not.toContain("request_source IS NULL"); + expect(params).toContain("analysis"); + }); + + it("getTopQueries with request_source: 'all' applies no request-source clause", async () => { + mockQuery.mockResolvedValueOnce({ rows: [] }); + await getTopQueries(7, 50, { request_source: "all" }); + + const [sql] = mockQuery.mock.calls[0]; + expect(sql).not.toContain("request_source"); + }); +}); diff --git a/src/__tests__/faq-browse-ordering.test.ts b/src/__tests__/faq-browse-ordering.test.ts new file mode 100644 index 0000000..fbecef1 --- /dev/null +++ b/src/__tests__/faq-browse-ordering.test.ts @@ -0,0 +1,145 @@ +import { describe, it, expect, beforeAll, afterAll, beforeEach } from "vitest"; +import { PGlite } from "@electric-sql/pglite"; +import { __setPoolForTesting, __resetPoolForTesting } from 
"../db/client.js"; +import { getFaqChunks } from "../db/queries.js"; + +// Behavioral (real in-process PGlite) test for the getFaqChunks browse LIMIT. +// +// The browse listing documents "most-recent N across all queried sources". The +// fix replaced a `source_name`-leading ORDER BY with `ORDER BY indexed_at DESC, +// id DESC` so a global LIMIT is not consumed entirely by the alphabetically- +// first source — starving more-recent rows from later sources to zero. A mock +// pool can only echo a hand-picked row and so cannot exercise ORDER BY / LIMIT +// semantics; only a real engine proves the ordering. We construct the adversarial +// case: the alphabetically-LATER source ("slack-support") holds the most-recent +// rows, the alphabetically-FIRST source ("discord-faq") holds older rows, and a +// small LIMIT must return the globally-most-recent rows regardless of source. + +// Minimal chunks DDL — only the columns getFaqChunks selects. indexed_at drives +// the recency ordering under test. 
+const CHUNKS_DDL = ` + CREATE TABLE chunks ( + id SERIAL PRIMARY KEY, + source_name TEXT NOT NULL, + source_url TEXT, + title TEXT, + content TEXT NOT NULL, + repo_url TEXT, + file_path TEXT NOT NULL, + start_line INT, + end_line INT, + language TEXT, + metadata JSONB NOT NULL DEFAULT '{}', + indexed_at TIMESTAMPTZ NOT NULL DEFAULT NOW() + ); +`; + +function poolFromPglite(db: PGlite) { + return { + query: (text: string, params?: unknown[]) => db.query(text, params), + connect: async () => ({ + query: (text: string, params?: unknown[]) => db.query(text, params), + release: () => {}, + }), + end: async () => db.close(), + }; +} + +async function insertChunk( + db: PGlite, + sourceName: string, + filePath: string, + indexedAtIso: string, +): Promise { + const { rows } = await db.query<{ id: number }>( + `INSERT INTO chunks (source_name, content, file_path, metadata, indexed_at) + VALUES ($1, $2, $3, $4, $5) RETURNING id`, + [ + sourceName, + "Q: x\n\nA: y", + filePath, + JSON.stringify({ confidence: 0.9 }), + indexedAtIso, + ], + ); + return rows[0].id; +} + +describe("getFaqChunks browse ordering (PGlite integration)", () => { + let db: PGlite; + + beforeAll(async () => { + db = new PGlite(); + await db.waitReady; + await db.exec(CHUNKS_DDL); + __setPoolForTesting(poolFromPglite(db)); + }); + + afterAll(async () => { + __resetPoolForTesting(); + await db.close(); + }); + + beforeEach(async () => { + await db.query("DELETE FROM chunks"); + }); + + it("returns the globally-most-recent rows across sources under a small LIMIT (not starved by the alphabetically-first source)", async () => { + // "discord-faq" sorts alphabetically BEFORE "slack-support". Make its rows + // the OLDEST so a source_name-leading ORDER BY would (wrongly) return them + // first and starve the more-recent slack-support rows under a small LIMIT. 
+ await insertChunk(db, "discord-faq", "d-old-1.md", "2024-01-01T00:00:00Z"); + await insertChunk(db, "discord-faq", "d-old-2.md", "2024-01-02T00:00:00Z"); + // "slack-support" holds the two globally-most-recent rows. + await insertChunk( + db, + "slack-support", + "s-new-1.md", + "2024-06-01T00:00:00Z", + ); + await insertChunk( + db, + "slack-support", + "s-new-2.md", + "2024-06-02T00:00:00Z", + ); + + const rows = await getFaqChunks(["discord-faq", "slack-support"], 0.5, 2); + + // With global-recency ordering, the two returned rows are the most-recent + // overall — both from the alphabetically-LATER source. A source_name-leading + // ORDER BY would instead return the two discord-faq rows. + expect(rows).toHaveLength(2); + const paths = rows.map((r) => r.file_path); + expect(paths).toEqual(["s-new-2.md", "s-new-1.md"]); + // Sanity: the older alphabetically-first source is NOT in the limited result. + expect(paths).not.toContain("d-old-1.md"); + expect(paths).not.toContain("d-old-2.md"); + }); + + it("orders the full multi-source result by global recency, then id DESC", async () => { + // Interleave recency across sources to prove the ordering is global, not + // grouped by source. id DESC is the deterministic tie-breaker for equal + // indexed_at. + const tie = "2024-03-03T00:00:00Z"; + await insertChunk(db, "slack-support", "newest.md", "2024-09-09T00:00:00Z"); + await insertChunk(db, "discord-faq", "mid.md", "2024-05-05T00:00:00Z"); + const tieEarlyId = await insertChunk(db, "discord-faq", "tie-a.md", tie); + const tieLateId = await insertChunk(db, "slack-support", "tie-b.md", tie); + await insertChunk(db, "slack-support", "oldest.md", "2024-01-01T00:00:00Z"); + + const rows = await getFaqChunks(["discord-faq", "slack-support"], 0.5); + const paths = rows.map((r) => r.file_path); + + // Newest first, oldest last; the two equal-indexed_at rows are ordered by + // id DESC (tie-b inserted after tie-a → higher id → comes first). 
+ expect(tieLateId).toBeGreaterThan(tieEarlyId); + expect(paths).toEqual([ + "newest.md", + "mid.md", + "tie-b.md", + "tie-a.md", + "oldest.md", + ]); + }); +}); diff --git a/src/__tests__/faq-confidence-cast.test.ts b/src/__tests__/faq-confidence-cast.test.ts new file mode 100644 index 0000000..5722f5f --- /dev/null +++ b/src/__tests__/faq-confidence-cast.test.ts @@ -0,0 +1,157 @@ +import { describe, it, expect, beforeAll, afterAll, beforeEach } from "vitest"; +import { PGlite } from "@electric-sql/pglite"; +import { __setPoolForTesting, __resetPoolForTesting } from "../db/client.js"; +import { getFaqChunks, getFaqChunksByIds } from "../db/queries.js"; + +// Integration test (real in-process PGlite) for the getFaqChunksByIds +// confidence-cast guard. A single FAQ row whose metadata.confidence is +// non-numeric text (e.g. "high") must NOT fail the whole id lookup with +// `invalid input syntax for type double precision` — the malformed row should +// degrade to 0.0 confidence while every other row in the id set still returns. +// +// Mock-pool unit tests can only assert the SQL string; the actual cast crash +// only reproduces against a real Postgres engine, so we run the query +// end-to-end here. + +// Minimal chunks DDL — only the columns getFaqChunksByIds selects (no pgvector +// extension / embedding column needed, since the by-id lookup never touches the +// vector). 
+const CHUNKS_DDL = ` + CREATE TABLE chunks ( + id SERIAL PRIMARY KEY, + source_name TEXT NOT NULL, + source_url TEXT, + title TEXT, + content TEXT NOT NULL, + repo_url TEXT, + file_path TEXT NOT NULL, + start_line INT, + end_line INT, + language TEXT, + metadata JSONB NOT NULL DEFAULT '{}', + indexed_at TIMESTAMPTZ NOT NULL DEFAULT NOW() + ); +`; + +function poolFromPglite(db: PGlite) { + return { + query: (text: string, params?: unknown[]) => db.query(text, params), + connect: async () => ({ + query: (text: string, params?: unknown[]) => db.query(text, params), + release: () => {}, + }), + end: async () => db.close(), + }; +} + +async function insertChunk( + db: PGlite, + filePath: string, + metadata: Record, +): Promise { + const { rows } = await db.query<{ id: number }>( + `INSERT INTO chunks (source_name, content, file_path, metadata) + VALUES ($1, $2, $3, $4) RETURNING id`, + ["slack-faq", "Q: x\n\nA: y", filePath, JSON.stringify(metadata)], + ); + return rows[0].id; +} + +describe("getFaqChunksByIds confidence-cast guard (PGlite integration)", () => { + let db: PGlite; + + beforeAll(async () => { + db = new PGlite(); + await db.waitReady; + await db.exec(CHUNKS_DDL); + __setPoolForTesting(poolFromPglite(db)); + }); + + afterAll(async () => { + __resetPoolForTesting(); + await db.close(); + }); + + beforeEach(async () => { + await db.query("DELETE FROM chunks"); + }); + + it("does not crash when one row in the id set has a non-numeric confidence", async () => { + const goodId = await insertChunk(db, "good.md", { confidence: 0.85 }); + const badId = await insertChunk(db, "bad.md", { confidence: "high" }); + + // The whole lookup must succeed (not reject with "invalid input syntax for + // type double precision"). + const rows = await getFaqChunksByIds([goodId, badId]); + + expect(rows).toHaveLength(2); + const byId = new Map(rows.map((r) => [r.id, r])); + // The well-formed numeric row keeps its value. 
+ expect(byId.get(goodId)!.confidence).toBe(0.85); + // The malformed row degrades to 0.0 rather than crashing the query. + expect(byId.get(badId)!.confidence).toBe(0.0); + }); + + it("treats a missing confidence key as 0.0", async () => { + const id = await insertChunk(db, "nometa.md", { channel: "C1" }); + const rows = await getFaqChunksByIds([id]); + expect(rows).toHaveLength(1); + expect(rows[0].confidence).toBe(0.0); + }); +}); + +// getFaqChunks (the browse listing) has the same UNGUARDED confidence cast in +// BOTH its projection and its WHERE clause. Unlike getFaqChunksByIds, it filters +// `metadata ? 'confidence'` — but a row whose `confidence` KEY EXISTS yet holds +// non-numeric text (e.g. "high") still passes that key check and then crashes +// the `(metadata->>'confidence')::float` cast with "invalid input syntax for +// type double precision", taking down the WHOLE browse listing. Both casts must +// be jsonb_typeof-guarded like getFaqChunksByIds so one bad row degrades to 0.0 +// instead of rejecting every entry. +describe("getFaqChunks confidence-cast guard (PGlite integration)", () => { + let db: PGlite; + + beforeAll(async () => { + db = new PGlite(); + await db.waitReady; + await db.exec(CHUNKS_DDL); + __setPoolForTesting(poolFromPglite(db)); + }); + + afterAll(async () => { + __resetPoolForTesting(); + await db.close(); + }); + + beforeEach(async () => { + await db.query("DELETE FROM chunks"); + }); + + it("does not crash the browse listing when a row has a non-numeric confidence", async () => { + // Both rows have the `confidence` key (so both pass `metadata ? 'confidence'`), + // but one is the string "high". The WHERE cast would crash the whole query. + await insertChunk(db, "good.md", { confidence: 0.85 }); + await insertChunk(db, "bad.md", { confidence: "high" }); + + // The listing must succeed (not reject with "invalid input syntax for type + // double precision"). 
The malformed row degrades to 0.0 confidence and is + // filtered out by the minConfidence threshold; the good row survives. + const rows = await getFaqChunks(["slack-faq"], 0.5); + + const byPath = new Map(rows.map((r) => [r.file_path, r])); + expect(byPath.has("good.md")).toBe(true); + expect(byPath.get("good.md")!.confidence).toBe(0.85); + // The "high" row degrades to 0.0 < 0.5, so it's excluded — but crucially the + // query did not crash. + expect(byPath.has("bad.md")).toBe(false); + }); + + it("includes a degraded-to-0.0 row when the confidence threshold is 0", async () => { + await insertChunk(db, "bad.md", { confidence: "high" }); + // threshold 0 admits the degraded 0.0 row; the point is the query runs. + const rows = await getFaqChunks(["slack-faq"], 0); + const bad = rows.find((r) => r.file_path === "bad.md"); + expect(bad).toBeDefined(); + expect(bad!.confidence).toBe(0.0); + }); +}); diff --git a/src/__tests__/faq-queries.test.ts b/src/__tests__/faq-queries.test.ts index 5fa8fe9..996df0e 100644 --- a/src/__tests__/faq-queries.test.ts +++ b/src/__tests__/faq-queries.test.ts @@ -27,7 +27,11 @@ describe("getFaqChunks", () => { expect(mockQuery).toHaveBeenCalledOnce(); const [sql, params] = mockQuery.mock.calls[0]; expect(sql).toContain("source_name IN ($1, $2)"); - expect(sql).toContain("(metadata->>'confidence')::float >= $3"); + // The confidence comparison is jsonb_typeof-guarded so a non-numeric + // confidence (e.g. "high") degrades to 0.0 instead of crashing the cast. 
+ expect(sql).toContain("jsonb_typeof(metadata->'confidence') = 'number'"); + expect(sql).toContain("(metadata->>'confidence')::float"); + expect(sql).toContain(">= $3"); expect(params).toEqual(["slack-support", "slack-general", 0.8]); }); @@ -70,11 +74,23 @@ describe("getFaqChunks", () => { expect(results[0].source_name).toBe("slack-support"); }); - it("orders by source_name then indexed_at DESC", async () => { + it("orders by indexed_at DESC then id DESC (global recency, NOT source-grouped)", async () => { mockQuery.mockResolvedValueOnce({ rows: [] }); await getFaqChunks(["slack-support"], 0.5); const [sql] = mockQuery.mock.calls[0]; - expect(sql).toContain("ORDER BY source_name, indexed_at DESC"); + // Global-recency ordering: source_name must NOT be the leading ORDER BY key. + // With source_name leading, a global LIMIT across multiple browse sources is + // consumed entirely by the alphabetically-first source, starving later + // sources to zero regardless of recency. The browse path documents + // "most-recent N across all sources", which requires indexed_at to lead. + expect(sql).toContain("ORDER BY indexed_at DESC, id DESC"); + expect(sql).not.toMatch(/ORDER BY\s+source_name/); }); + + // NOTE: the behavioral proof that a small LIMIT returns the globally-most- + // recent rows across sources (rather than being starved by the alphabetically + // -first source) lives in faq-browse-ordering.test.ts, which runs the query + // end-to-end against a real PGlite engine. A mock pool can only echo a hand- + // picked row, so it cannot actually exercise ORDER BY / LIMIT semantics. 
}); diff --git a/src/__tests__/file-provider.test.ts b/src/__tests__/file-provider.test.ts index 2a4b622..842981e 100644 --- a/src/__tests__/file-provider.test.ts +++ b/src/__tests__/file-provider.test.ts @@ -2,7 +2,10 @@ import { describe, it, expect, vi, beforeEach, afterEach } from "vitest"; import fs from "node:fs"; import path from "node:path"; import os from "node:os"; -import { FileDataProvider } from "../indexing/providers/file.js"; +import { + FileDataProvider, + localFileHashInput, +} from "../indexing/providers/file.js"; import type { FileSourceConfig } from "../types.js"; // --------------------------------------------------------------------------- @@ -594,6 +597,78 @@ describe("FileDataProvider", () => { expect(result.removedIds).toEqual(["docs/deleted.md"]); }); + it("throws (rather than silently dropping deletions) when the deletion-detection diff fails", async () => { + // H1 regression: the changed-files diff (--name-only) succeeds, but the + // deletion-detection diff (--name-status) fails transiently. Previously + // its catch set removedFiles=[] and continued, so a transient git error + // masqueraded as "no deletions" while the caller still advanced the state + // token — leaving stale/deleted docs in the index forever. The failure + // must now be surfaced (thrown) so the orchestrator marks the run errored + // and holds the token for retry instead of advancing over undetected + // deletions. 
+ const cloneDir = path.join(tmpDir, "clones"); + const repoDir = path.join(cloneDir, "repo"); + const docsDir = path.join(repoDir, "docs"); + await fs.promises.mkdir(path.join(repoDir, ".git"), { recursive: true }); + await fs.promises.mkdir(docsDir, { recursive: true }); + await fs.promises.writeFile( + path.join(docsDir, "updated.md"), + "# Updated", + ); + + mockGitInstance.revparse.mockResolvedValue("def456"); + mockGitInstance.diff + .mockResolvedValueOnce("docs/updated.md\n") // --name-only succeeds + .mockRejectedValueOnce(new Error("name-status diff failed")); // --name-status fails + + const provider = new FileDataProvider(makeGitConfig(), { cloneDir }); + await expect(provider.incrementalAcquire("abc123")).rejects.toThrow( + /name-status diff failed|deletion detection/i, + ); + }); + + it("does NOT remove a changed file that exists on disk but fails extractContent", async () => { + // A transient read/extraction failure (EACCES/EIO/parse error) on a file + // that still exists on disk must NOT be folded into removedIds (which the + // orchestrator DELETES, then advances the token over — permanent silent + // chunk loss, since a later incremental diff won't re-list an unchanged + // file). Mirror the fullAcquire 'does NOT remove files that exist on disk + // but fail extractContent' guarantee: surface the failure (throw) so the + // orchestrator holds the prior token for retry instead of deleting. + const errorSpy = vi.spyOn(console, "error").mockImplementation(() => {}); + + const cloneDir = path.join(tmpDir, "clones"); + const repoDir = path.join(cloneDir, "repo"); + const docsDir = path.join(repoDir, "docs"); + await fs.promises.mkdir(path.join(repoDir, ".git"), { recursive: true }); + await fs.promises.mkdir(docsDir, { recursive: true }); + // The changed file EXISTS on disk so the fs.existsSync guard passes and we + // proceed to extractContent (which throws). 
+ await fs.promises.writeFile( + path.join(docsDir, "updated.md"), + "# Updated", + ); + + mockGitInstance.revparse.mockResolvedValue("def456"); + mockGitInstance.diff + .mockResolvedValueOnce("docs/updated.md\n") // --name-only + .mockResolvedValueOnce("M\tdocs/updated.md\n"); // --name-status (modified, not deleted) + mockExtractContent.mockRejectedValueOnce( + new Error("Transient I/O error"), + ); + + const provider = new FileDataProvider(makeGitConfig(), { cloneDir }); + + // The acquire must surface the failure (hold the token), not silently + // advance over a file whose chunks would be deleted. + await expect(provider.incrementalAcquire("abc123")).rejects.toThrow( + /read\/extraction failed|holding state token/i, + ); + + mockExtractContent.mockReset(); + errorSpy.mockRestore(); + }); + it("returns empty when no matching changes detected", async () => { const cloneDir = path.join(tmpDir, "clones"); const repoDir = path.join(cloneDir, "repo"); @@ -753,6 +828,36 @@ describe("FileDataProvider", () => { expect(ids).toContain("docs/new-name.md"); }); + it("detects a removal even when no matching changes are listed (rename of a matched file to a non-matched extension)", async () => { + // Data-consistency hole: a commit whose ONLY matching-relevant change is a + // rename of a MATCHED file (docs/a.md, matched by **/*.md) to a NON-matched + // extension (docs/b.txt). `git diff --name-only` lists only the new + // non-matching path (docs/b.txt), so matchingChanged is empty. If the + // no-matching-changes short-circuit fires BEFORE deletion detection, the + // run returns removedIds: [] and advances the state token, stranding + // docs/a.md's chunks in the index forever. Deletion/rename detection must + // run first so docs/a.md's removal is reported. 
+ const cloneDir = path.join(tmpDir, "clones"); + const repoDir = path.join(cloneDir, "repo"); + await fs.promises.mkdir(path.join(repoDir, ".git"), { recursive: true }); + + mockGitInstance.revparse.mockResolvedValue("def456"); + mockGitInstance.diff + // --name-only lists only the new, non-matching path → matchingChanged empty + .mockResolvedValueOnce("docs/b.txt\n") + // --name-status records the rename of the matched .md to the .txt + .mockResolvedValueOnce("R100\tdocs/a.md\tdocs/b.txt\n"); + + const provider = new FileDataProvider(makeGitConfig(), { cloneDir }); + const result = await provider.incrementalAcquire("abc123"); + + // The removal of the matched old path MUST be reported (not an empty + // early-return), and the token still advances since the removal is handled. + expect(result.removedIds).toContain("docs/a.md"); + expect(result.items).toEqual([]); + expect(result.stateToken).toBe("def456"); + }); + it("skips files that no longer exist on disk", async () => { const cloneDir = path.join(tmpDir, "clones"); const repoDir = path.join(cloneDir, "repo"); @@ -769,6 +874,61 @@ describe("FileDataProvider", () => { expect(result.items).toEqual([]); }); + + // ── path: "." (repo-root) scoping regression ─────────────────────────── + // A source configured with `path: "."` walks the repo root. git-diff paths + // are repo-root-relative with NO leading "./", so a prefix of "./" (derived + // naively from the truthy "." path) filters out EVERY changed file — + // matchingChanged becomes empty, the incremental run indexes nothing, and + // the state token advances over real changes. "." (and "") must be treated + // as "no prefix" so repo-root changes are indexed. The deploy config's + // `code` and `ag-ui-code` sources both use `path: "."`, so this is the + // production trigger. Mirrors the guard already in reindex-audit.ts. + + it("indexes a repo-root changed file when path is '.' 
(no leading ./)", async () => { + const cloneDir = path.join(tmpDir, "clones"); + const repoDir = path.join(cloneDir, "repo"); + await fs.promises.mkdir(path.join(repoDir, ".git"), { recursive: true }); + // A changed file at the repo ROOT — no docs/ prefix. + await fs.promises.writeFile( + path.join(repoDir, "root-file.md"), + "# Root content", + ); + + mockGitInstance.revparse.mockResolvedValue("def456"); + mockGitInstance.diff + .mockResolvedValueOnce("root-file.md\n") // --name-only (repo-root relative) + .mockResolvedValueOnce("M\troot-file.md\n"); // --name-status + + const provider = new FileDataProvider(makeGitConfig({ path: "." }), { + cloneDir, + }); + const result = await provider.incrementalAcquire("abc123"); + + expect(result.stateToken).toBe("def456"); + expect(result.items.length).toBe(1); + expect(result.items[0].id).toBe("root-file.md"); + }); + + it("detects a repo-root deletion when path is '.' (no leading ./)", async () => { + const cloneDir = path.join(tmpDir, "clones"); + const repoDir = path.join(cloneDir, "repo"); + await fs.promises.mkdir(path.join(repoDir, ".git"), { recursive: true }); + + mockGitInstance.revparse.mockResolvedValue("def456"); + mockGitInstance.diff + .mockResolvedValueOnce("deleted-root.md\n") // --name-only + .mockResolvedValueOnce("D\tdeleted-root.md\n"); // --name-status + + const provider = new FileDataProvider(makeGitConfig({ path: "." 
}), { + cloneDir, + }); + const result = await provider.incrementalAcquire("abc123"); + + expect(result.stateToken).toBe("def456"); + expect(result.items).toEqual([]); + expect(result.removedIds).toEqual(["deleted-root.md"]); + }); }); // ----------------------------------------------------------------------- @@ -1011,3 +1171,49 @@ describe("FileDataProvider", () => { }); }); }); + +// --------------------------------------------------------------------------- +// localFileHashInput — pure hash-input construction for local change detection +// +// The local state token must detect content changes that preserve mtime +// (cp -p, some git checkout/restore, rsync --times). The bug: an mtime-only +// hash input leaves such edits undetected, so they are never re-indexed. +// Including file size in the hash input fixes the common mtime-preserving +// case (which almost always changes size). These tests pin path + mtime and +// vary only size, isolating the size dimension the previous implementation +// ignored. +// --------------------------------------------------------------------------- + +describe("localFileHashInput", () => { + it("produces DIFFERENT input for same path + mtime but different size", () => { + // The mtime-preserving-edit scenario: identical path and mtime, the only + // difference is file size. An mtime-only hash input would collide here. 
+ const a = localFileHashInput("docs/readme.md", 1_700_000_000_000, 100); + const b = localFileHashInput("docs/readme.md", 1_700_000_000_000, 200); + expect(a).not.toBe(b); + }); + + it("produces IDENTICAL input for same path + mtime + size", () => { + const a = localFileHashInput("docs/readme.md", 1_700_000_000_000, 100); + const b = localFileHashInput("docs/readme.md", 1_700_000_000_000, 100); + expect(a).toBe(b); + }); + + it("still produces DIFFERENT input when mtime changes (size equal)", () => { + const a = localFileHashInput("docs/readme.md", 1_700_000_000_000, 100); + const b = localFileHashInput("docs/readme.md", 1_700_000_000_001, 100); + expect(a).not.toBe(b); + }); + + it("produces DIFFERENT input for different paths", () => { + const a = localFileHashInput("docs/a.md", 1_700_000_000_000, 100); + const b = localFileHashInput("docs/b.md", 1_700_000_000_000, 100); + expect(a).not.toBe(b); + }); + + it("includes size in the serialized form (path:mtime:size)", () => { + expect(localFileHashInput("docs/readme.md", 42, 100)).toBe( + "docs/readme.md:42:100\n", + ); + }); +}); diff --git a/src/__tests__/knowledge-mcp.test.ts b/src/__tests__/knowledge-mcp.test.ts index 72ea9f3..8d2a7d5 100644 --- a/src/__tests__/knowledge-mcp.test.ts +++ b/src/__tests__/knowledge-mcp.test.ts @@ -18,6 +18,7 @@ import type { vi.mock("../db/queries.js", () => ({ getFaqChunks: vi.fn(), + getFaqChunksByIds: vi.fn(), searchChunks: vi.fn(), })); vi.mock("../db/analytics.js", () => ({ @@ -29,9 +30,14 @@ vi.mock("../config.js", () => ({ })); import { registerKnowledgeTool } from "../mcp/tools/knowledge.js"; -import { getFaqChunks, searchChunks } from "../db/queries.js"; +import { + getFaqChunks, + getFaqChunksByIds, + searchChunks, +} from "../db/queries.js"; const mockGetFaqChunks = vi.mocked(getFaqChunks); +const mockGetFaqChunksByIds = vi.mocked(getFaqChunksByIds); const mockSearchChunks = vi.mocked(searchChunks); const mockEmbed = vi.fn(); @@ -251,8 +257,8 @@ describe("knowledge tool 
search mode (with query)", () => { .mockResolvedValueOnce([makeChunkResult({ id: 10, similarity: 0.95 })]) .mockResolvedValueOnce([makeChunkResult({ id: 20, similarity: 0.85 })]); - // getFaqChunks returns FAQ metadata for cross-reference - mockGetFaqChunks.mockResolvedValueOnce([ + // FAQ metadata is fetched by the EXACT vector-result ids for cross-reference + mockGetFaqChunksByIds.mockResolvedValueOnce([ makeFaqResult({ id: 10, confidence: 0.9, title: "Matched FAQ" }), makeFaqResult({ id: 20, confidence: 0.8, title: "Another FAQ" }), ]); @@ -270,20 +276,20 @@ describe("knowledge tool search mode (with query)", () => { expect(text).toContain("Q&A 2"); expect(mockEmbed).toHaveBeenCalledWith("how to auth"); - // searchChunks called once for each source + // searchChunks called once for each source. The per-source fetch OVER-fetches + // candidates (effectiveLimit * 2 = 40) so the confidence filter has a + // backfill pool — filtering before the final slice is what stops the tool + // returning fewer than `limit` results. expect(mockSearchChunks).toHaveBeenCalledTimes(2); expect(mockSearchChunks).toHaveBeenCalledWith( embedding, - 20, + 40, "slack-support", ); - expect(mockSearchChunks).toHaveBeenCalledWith(embedding, 20, "discord-faq"); - // getFaqChunks called with confidence=0, limit=100 (effectiveLimit*5) - expect(mockGetFaqChunks).toHaveBeenCalledWith( - ["slack-support", "discord-faq"], - 0, - 100, - ); + expect(mockSearchChunks).toHaveBeenCalledWith(embedding, 40, "discord-faq"); + // FAQ metadata fetched by the exact candidate ids, NOT a recency window. 
+ expect(mockGetFaqChunksByIds).toHaveBeenCalledWith([10, 20]); + expect(mockGetFaqChunks).not.toHaveBeenCalled(); }); it("filters out search results whose FAQ confidence is below threshold", async () => { @@ -295,7 +301,7 @@ describe("knowledge tool search mode (with query)", () => { ]) .mockResolvedValueOnce([]); - mockGetFaqChunks.mockResolvedValueOnce([ + mockGetFaqChunksByIds.mockResolvedValueOnce([ makeFaqResult({ id: 10, confidence: 0.9, title: "High Confidence" }), makeFaqResult({ id: 11, confidence: 0.3, title: "Low Confidence" }), ]); @@ -318,7 +324,7 @@ describe("knowledge tool search mode (with query)", () => { .mockResolvedValueOnce([]); // FAQ data does not include id=99 - mockGetFaqChunks.mockResolvedValueOnce([ + mockGetFaqChunksByIds.mockResolvedValueOnce([ makeFaqResult({ id: 1, confidence: 0.9 }), ]); @@ -339,7 +345,7 @@ describe("knowledge tool search mode (with query)", () => { .mockResolvedValueOnce([makeChunkResult({ id: 10, similarity: 0.7 })]) .mockResolvedValueOnce([makeChunkResult({ id: 20, similarity: 0.95 })]); - mockGetFaqChunks.mockResolvedValueOnce([ + mockGetFaqChunksByIds.mockResolvedValueOnce([ makeFaqResult({ id: 10, confidence: 0.9, title: "Lower Sim" }), makeFaqResult({ id: 20, confidence: 0.9, title: "Higher Sim" }), ]); @@ -359,26 +365,33 @@ describe("knowledge tool search mode (with query)", () => { it("respects custom limit in search mode", async () => { mockEmbed.mockResolvedValueOnce([0.1]); - mockSearchChunks.mockResolvedValueOnce([]).mockResolvedValueOnce([]); - mockGetFaqChunks.mockResolvedValueOnce([]); + mockSearchChunks + .mockResolvedValueOnce([ + makeChunkResult({ id: 30, similarity: 0.95 }), + makeChunkResult({ id: 31, similarity: 0.9 }), + makeChunkResult({ id: 32, similarity: 0.85 }), + makeChunkResult({ id: 33, similarity: 0.8 }), + ]) + .mockResolvedValueOnce([]); + mockGetFaqChunksByIds.mockResolvedValueOnce([]); await client.callTool({ name: "get-faq", arguments: { query: "test", limit: 3 }, }); - // 
searchChunks should use limit=3 + // searchChunks OVER-fetches candidates: effectiveLimit (3) * 2 = 6, so the + // confidence filter has a backfill pool before the final slice to 3. expect(mockSearchChunks).toHaveBeenCalledWith( expect.anything(), - 3, + 6, "slack-support", ); - // getFaqChunks should use limit=15 (3*5) - expect(mockGetFaqChunks).toHaveBeenCalledWith( - ["slack-support", "discord-faq"], - 0, - 15, - ); + // FAQ metadata is fetched for ALL candidate ids (the lookup happens BEFORE + // the confidence filter + slice, which is what enables backfill), by exact + // id — no relevance-blind recency window. + expect(mockGetFaqChunksByIds).toHaveBeenCalledWith([30, 31, 32, 33]); + expect(mockGetFaqChunks).not.toHaveBeenCalled(); }); it("respects custom min_confidence in search mode", async () => { @@ -387,7 +400,7 @@ describe("knowledge tool search mode (with query)", () => { .mockResolvedValueOnce([makeChunkResult({ id: 10, similarity: 0.9 })]) .mockResolvedValueOnce([]); - mockGetFaqChunks.mockResolvedValueOnce([ + mockGetFaqChunksByIds.mockResolvedValueOnce([ makeFaqResult({ id: 10, confidence: 0.85, title: "Borderline" }), ]); diff --git a/src/__tests__/knowledge.test.ts b/src/__tests__/knowledge.test.ts new file mode 100644 index 0000000..4bf9c6a --- /dev/null +++ b/src/__tests__/knowledge.test.ts @@ -0,0 +1,378 @@ +import { + describe, + it, + expect, + vi, + beforeAll, + beforeEach, + afterAll, +} from "vitest"; +import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; +import { Client } from "@modelcontextprotocol/sdk/client/index.js"; +import { InMemoryTransport } from "@modelcontextprotocol/sdk/inMemory.js"; +import type { + KnowledgeToolConfig, + FaqChunkResult, + ChunkResult, +} from "../types.js"; + +// Mock the query layer. The search-mode path must look up FAQ metadata by the +// EXACT vector-result ids (getFaqChunksByIds), not by a relevance-blind +// indexed_at-ordered window. 
Search over-fetches candidates (effectiveLimit * 2) +// then resolves FAQ confidence/metadata by exact id, so a relevant hit can no +// longer be dropped just because its id falls outside a recency window. +vi.mock("../db/queries.js", () => ({ + getFaqChunks: vi.fn(), + getFaqChunksByIds: vi.fn(), + searchChunks: vi.fn(), +})); +vi.mock("../db/analytics.js", () => ({ + logQuery: vi.fn().mockResolvedValue(undefined), +})); +vi.mock("../config.js", () => ({ + getServerConfig: vi.fn().mockReturnValue({}), + getAnalyticsConfig: vi.fn().mockReturnValue(undefined), +})); + +import { registerKnowledgeTool } from "../mcp/tools/knowledge.js"; +import { + getFaqChunks, + getFaqChunksByIds, + searchChunks, +} from "../db/queries.js"; + +const mockGetFaqChunks = vi.mocked(getFaqChunks); +const mockGetFaqChunksByIds = vi.mocked(getFaqChunksByIds); +const mockSearchChunks = vi.mocked(searchChunks); +const mockEmbed = vi.fn(); + +function makeFaqResult( + overrides: Partial<FaqChunkResult> = {}, +): FaqChunkResult { + return { + id: 1, + source_name: "slack-support", + source_url: "https://slack.com/archives/C123/p456", + title: "How to configure headers?", + content: + "Q: How to configure headers?\n\nA: Use the headers property in the constructor.", + repo_url: null, + file_path: "C123:456:0", + start_line: null, + end_line: null, + language: null, + similarity: 0.0, + metadata: { channel: "C123", confidence: 0.85 }, + confidence: 0.85, + ...overrides, + }; +} + +function makeChunkResult(overrides: Partial<ChunkResult> = {}): ChunkResult { + return { + id: 1, + source_name: "slack-support", + source_url: "https://slack.com/archives/C123/p456", + title: "How to configure headers?", + content: "Q: How to configure headers?\n\nA: Use the headers property.", + repo_url: null, + file_path: "C123:456:0", + start_line: null, + end_line: null, + language: null, + similarity: 0.92, + ...overrides, + }; +} + +const toolConfig: KnowledgeToolConfig = { + name: "get-faq", + type: "knowledge", + description: "Get FAQ 
knowledge base entries.", + sources: ["slack-support", "discord-faq"], + min_confidence: 0.7, + default_limit: 2, + max_limit: 100, +}; + +// ── Fix 3: relevance-blind FAQ window drop ───────────────────────────────── + +describe("knowledge tool search mode — FAQ metadata fetched by result id", () => { + let client: Client; + let server: McpServer; + + beforeAll(async () => { + server = new McpServer({ name: "test-knowledge-byid", version: "1.0.0" }); + const embeddingClient = { embed: mockEmbed }; + registerKnowledgeTool( + server as never, + embeddingClient as never, + toolConfig, + ); + + const [clientTransport, serverTransport] = + InMemoryTransport.createLinkedPair(); + await server.connect(serverTransport); + client = new Client({ name: "test-client", version: "1.0.0" }); + await client.connect(clientTransport); + }); + + beforeEach(() => { + vi.clearAllMocks(); + }); + + afterAll(async () => { + await client.close(); + await server.close(); + }); + + it("returns a high-similarity hit even when it is NOT among the most-recently-indexed FAQ rows", async () => { + // limit defaults to 2. Top vector hit is id=9999 — a relevant chunk that a + // relevance-blind indexed_at-DESC window would drop. The current path + // resolves FAQ metadata by the EXACT result id instead, so the hit survives + // regardless of how recently it was indexed. + mockEmbed.mockResolvedValueOnce([0.1, 0.2, 0.3]); + mockSearchChunks + .mockResolvedValueOnce([makeChunkResult({ id: 9999, similarity: 0.97 })]) + .mockResolvedValueOnce([]); + + // The fix fetches metadata for EXACTLY the result ids. 
+ mockGetFaqChunksByIds.mockResolvedValueOnce([ + makeFaqResult({ id: 9999, confidence: 0.9, title: "Relevant Old FAQ" }), + ]); + + const result = await client.callTool({ + name: "get-faq", + arguments: { query: "how to auth" }, + }); + + const text = (result.content as Array<{ type: string; text: string }>)[0] + .text; + expect(text).toContain("Relevant Old FAQ"); + + // It must look up by the exact result id, NOT use the windowed getFaqChunks. + expect(mockGetFaqChunksByIds).toHaveBeenCalledTimes(1); + const idsArg = mockGetFaqChunksByIds.mock.calls[0][0]; + expect(idsArg).toEqual([9999]); + // The relevance-blind windowed getFaqChunks must NOT be used in search mode. + expect(mockGetFaqChunks).not.toHaveBeenCalled(); + }); + + it("preserves the confidence filter when merging by id", async () => { + mockEmbed.mockResolvedValueOnce([0.1]); + mockSearchChunks + .mockResolvedValueOnce([ + makeChunkResult({ id: 10, similarity: 0.95 }), + makeChunkResult({ id: 11, similarity: 0.9 }), + ]) + .mockResolvedValueOnce([]); + + mockGetFaqChunksByIds.mockResolvedValueOnce([ + makeFaqResult({ id: 10, confidence: 0.9, title: "High Confidence" }), + makeFaqResult({ id: 11, confidence: 0.3, title: "Low Confidence" }), + ]); + + const result = await client.callTool({ + name: "get-faq", + arguments: { query: "test" }, + }); + + const text = (result.content as Array<{ type: string; text: string }>)[0] + .text; + expect(text).toContain("High Confidence"); + expect(text).not.toContain("Low Confidence"); + }); + + it("returns no results when none of the result ids have FAQ metadata", async () => { + mockEmbed.mockResolvedValueOnce([0.1]); + mockSearchChunks + .mockResolvedValueOnce([makeChunkResult({ id: 99, similarity: 0.95 })]) + .mockResolvedValueOnce([]); + mockGetFaqChunksByIds.mockResolvedValueOnce([]); + + const result = await client.callTool({ + name: "get-faq", + arguments: { query: "unmatched" }, + }); + + const text = (result.content as Array<{ type: string; text: string 
}>)[0] + .text; + expect(text).toBe("No FAQ results found."); + }); + + it("does not call getFaqChunksByIds when there are zero vector hits", async () => { + mockEmbed.mockResolvedValueOnce([0.1]); + mockSearchChunks.mockResolvedValueOnce([]).mockResolvedValueOnce([]); + + const result = await client.callTool({ + name: "get-faq", + arguments: { query: "nothing" }, + }); + + const text = (result.content as Array<{ type: string; text: string }>)[0] + .text; + expect(text).toBe("No FAQ results found."); + // No ids to look up — skip the round-trip entirely. + expect(mockGetFaqChunksByIds).not.toHaveBeenCalled(); + }); + + // ── Fix: search-mode under-fill (slice-before-filter dropped below-confidence + // top-N hits with no backfill, returning fewer than `limit`) ──────────── + it("fills up to `limit` results by over-fetching when top-N hits are below confidence", async () => { + // limit defaults to 2. The two HIGHEST-similarity hits (ids 1, 2) are + // below the 0.7 confidence threshold; two MORE-confident hits (ids 3, 4) + // exist just past the top-2 window. The old code sliced to the top-2 + // (ids 1, 2) BEFORE applying the confidence filter, dropped both, and + // returned ZERO results. Over-fetching candidates first, filtering by + // confidence, THEN slicing to `limit` must return 2 results. 
+ mockEmbed.mockResolvedValueOnce([0.1, 0.2, 0.3]); + mockSearchChunks + .mockResolvedValueOnce([ + makeChunkResult({ id: 1, similarity: 0.99 }), + makeChunkResult({ id: 2, similarity: 0.98 }), + makeChunkResult({ id: 3, similarity: 0.97 }), + makeChunkResult({ id: 4, similarity: 0.96 }), + ]) + .mockResolvedValueOnce([]); + + mockGetFaqChunksByIds.mockResolvedValueOnce([ + makeFaqResult({ id: 1, confidence: 0.2, title: "Low One" }), + makeFaqResult({ id: 2, confidence: 0.3, title: "Low Two" }), + makeFaqResult({ id: 3, confidence: 0.9, title: "Confident Three" }), + makeFaqResult({ id: 4, confidence: 0.85, title: "Confident Four" }), + ]); + + const result = await client.callTool({ + name: "get-faq", + arguments: { query: "fill me up" }, + }); + + const text = (result.content as Array<{ type: string; text: string }>)[0] + .text; + // Both confident-but-deeper hits must surface (2 == limit), not be dropped. + expect(text).toContain("Confident Three"); + expect(text).toContain("Confident Four"); + expect(text).not.toContain("Low One"); + expect(text).not.toContain("Low Two"); + // Exactly `limit` Q&A blocks rendered. + const qaBlocks = (text.match(/^Q&A \d+$/gm) ?? []).length; + expect(qaBlocks).toBe(2); + }); + + it("still caps the final result count at `limit` when many confident hits exist", async () => { + // All four hits are confident; the result must still be capped at limit=2. 
+ mockEmbed.mockResolvedValueOnce([0.1]); + mockSearchChunks + .mockResolvedValueOnce([ + makeChunkResult({ id: 1, similarity: 0.99 }), + makeChunkResult({ id: 2, similarity: 0.98 }), + makeChunkResult({ id: 3, similarity: 0.97 }), + makeChunkResult({ id: 4, similarity: 0.96 }), + ]) + .mockResolvedValueOnce([]); + + mockGetFaqChunksByIds.mockResolvedValueOnce([ + makeFaqResult({ id: 1, confidence: 0.9, title: "Top One" }), + makeFaqResult({ id: 2, confidence: 0.9, title: "Top Two" }), + makeFaqResult({ id: 3, confidence: 0.9, title: "Top Three" }), + makeFaqResult({ id: 4, confidence: 0.9, title: "Top Four" }), + ]); + + const result = await client.callTool({ + name: "get-faq", + arguments: { query: "cap me" }, + }); + + const text = (result.content as Array<{ type: string; text: string }>)[0] + .text; + const qaBlocks = (text.match(/^Q&A \d+$/gm) ?? []).length; + expect(qaBlocks).toBe(2); + // The two highest-similarity confident hits win the cap. + expect(text).toContain("Top One"); + expect(text).toContain("Top Two"); + }); +}); + +// ── Fix 4: extractAnswer fallback ────────────────────────────────────────── + +describe("knowledge tool — extractAnswer fallback handling", () => { + let client: Client; + let server: McpServer; + + beforeAll(async () => { + server = new McpServer({ name: "test-knowledge-answer", version: "1.0.0" }); + const embeddingClient = { embed: mockEmbed }; + registerKnowledgeTool( + server as never, + embeddingClient as never, + toolConfig, + ); + + const [clientTransport, serverTransport] = + InMemoryTransport.createLinkedPair(); + await server.connect(serverTransport); + client = new Client({ name: "test-client", version: "1.0.0" }); + await client.connect(clientTransport); + }); + + beforeEach(() => { + vi.clearAllMocks(); + }); + + afterAll(async () => { + await client.close(); + await server.close(); + }); + + it("handles a leading 'A:' answer with no preceding newline", async () => { + mockGetFaqChunks.mockResolvedValueOnce([ + 
makeFaqResult({ + // Content starts with the answer delimiter and no newline before it. + content: "A: The answer with no preceding Q line.", + title: "Direct answer", + confidence: 0.9, + }), + ]); + + const result = await client.callTool({ + name: "get-faq", + arguments: {}, + }); + + const text = (result.content as Array<{ type: string; text: string }>)[0] + .text; + expect(text).toContain("ANSWER: The answer with no preceding Q line."); + // The raw "A:" delimiter must not leak into the rendered answer. + expect(text).not.toContain("ANSWER: A:"); + }); + + it("logs a console.warn (with a chunk identifier) when the answer delimiter is absent (fallback)", async () => { + const debugSpy = vi.spyOn(console, "debug").mockImplementation(() => {}); + const warnSpy = vi.spyOn(console, "warn").mockImplementation(() => {}); + + mockGetFaqChunks.mockResolvedValueOnce([ + makeFaqResult({ + content: "Q: Only a question, no answer delimiter at all?", + title: "No delimiter", + // No source_url, so the identifier falls back to file_path. + source_url: null, + file_path: "C123:456:0", + confidence: 0.9, + }), + ]); + + await client.callTool({ name: "get-faq", arguments: {} }); + + // The fallback path must emit a console.warn (raised from console.debug) so + // a malformed-content leak is visible at the default log level. + expect(warnSpy).toHaveBeenCalledWith( + expect.stringContaining('[knowledge] extractAnswer: no "A:" delimiter'), + ); + // The warning includes a chunk identifier so the offending row is locatable. 
+ const warned = warnSpy.mock.calls.map((c) => String(c[0])).join("\n"); + expect(warned).toContain("C123:456:0"); + + debugSpy.mockRestore(); + warnSpy.mockRestore(); + }); +}); diff --git a/src/__tests__/markdown-chunker.test.ts b/src/__tests__/markdown-chunker.test.ts index 78bb95b..e04d503 100644 --- a/src/__tests__/markdown-chunker.test.ts +++ b/src/__tests__/markdown-chunker.test.ts @@ -1,4 +1,7 @@ -import { describe, it, expect } from "vitest"; +import { describe, it, expect, afterEach } from "vitest"; +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; import { chunkMarkdown } from "../indexing/chunking/markdown.js"; import type { SourceConfig } from "../types.js"; @@ -66,12 +69,120 @@ describe("chunkMarkdown", () => { expect(chunks[0].title).toBe("My Heading"); }); + it('empty double-quoted frontmatter title (title: "") falls back to the heading', () => { + // The frontmatter regex's two independent `["']?` plus a `(.+?)` that needs + // ≥1 char turns `title: ""` into the stray quote `"`, which is truthy and + // defeats the heading/filename fallback. An empty title must be treated as + // ABSENT so the first heading wins. 
+ const content = '---\ntitle: ""\n---\n\n# Real Heading\n\nBody text.'; + const chunks = chunkMarkdown(content, "test.md", mkConfig()); + expect(chunks[0].title).toBe("Real Heading"); + expect(chunks[0].title).not.toBe('"'); + }); + + it("empty single-quoted frontmatter title (title: '') falls back to the heading", () => { + const content = "---\ntitle: ''\n---\n\n# Real Heading\n\nBody text."; + const chunks = chunkMarkdown(content, "test.md", mkConfig()); + expect(chunks[0].title).toBe("Real Heading"); + expect(chunks[0].title).not.toBe("'"); + }); + + it("bare empty frontmatter title (title:) falls back to the heading", () => { + const content = "---\ntitle:\n---\n\n# Real Heading\n\nBody text."; + const chunks = chunkMarkdown(content, "test.md", mkConfig()); + expect(chunks[0].title).toBe("Real Heading"); + }); + + it("empty frontmatter title falls back to filename when no heading", () => { + const content = '---\ntitle: ""\n---\n\nJust prose, no heading at all.'; + const chunks = chunkMarkdown(content, "docs/guide.md", mkConfig()); + expect(chunks[0].title).toBe("guide.md"); + }); + + it("preserves internal quotes in a frontmatter title (no balanced wrap)", () => { + // `title: 5'6" tall` has internal quotes but is not wrapped in balanced + // surrounding quotes, so the value is kept literal (no edge-quote stripping). 
+ const content = "---\ntitle: 5'6\" tall\n---\n\nBody text."; + const chunks = chunkMarkdown(content, "test.md", mkConfig()); + expect(chunks[0].title).toBe("5'6\" tall"); + }); + it("falls back to filename when no title or heading", () => { const content = "Just some plain text without any heading."; const chunks = chunkMarkdown(content, "docs/guide.md", mkConfig()); expect(chunks[0].title).toBe("guide.md"); }); + it("does not adopt a TAB-indented `#` line as the title (CommonMark: tab = code)", () => { + // Per CommonMark a leading tab counts as 4 columns, so `\t# Heading` is an + // indented code line, NOT an ATX heading — it must never be adopted as the + // title (the title is embedded into the retrieval vector). The other heading + // detectors allow only 0-3 *spaces*; the title extractor must agree. The + // tab-indented line is placed AFTER a leading prose paragraph (not at the + // very start) so the document-leading-whitespace trim does not strip the tab + // before extraction — the regex itself must reject the tab indent. With no + // real heading anywhere, the title falls back to the filename. + const content = [ + "Intro prose with no heading at all here.", + "", + "\t# Tabbed line is code, not a heading", + "", + "More body text.", + ].join("\n"); + const chunks = chunkMarkdown(content, "docs/guide.md", mkConfig()); + expect(chunks[0].title).toBe("guide.md"); + expect(chunks[0].title).not.toBe("Tabbed line is code, not a heading"); + }); + + it("still adopts a 0-3-SPACE-indented `#` line as the title", () => { + // The complement of the tab case: 1-3 leading *spaces* before the hashes is + // a valid CommonMark ATX heading and must still be adopted as the title. + // Also prose-first so the space indent survives to the extractor and the + // 0-3-space path is genuinely exercised (not trimmed to column 0). 
+ const content = [ + "Intro prose with no heading at all here.", + "", + " # Three Space Indented Heading", + "", + "Body under the heading.", + ].join("\n"); + const chunks = chunkMarkdown(content, "docs/guide.md", mkConfig()); + expect(chunks[0].title).toBe("Three Space Indented Heading"); + }); + + it("does not promote a TAB-indented `#` line at the DOCUMENT START to a heading", () => { + // Regression: stripMdx's final `.trim()` stripped the leading whitespace of + // the document's first content line BEFORE the spaces-only heading detector + // ran, so a doc that OPENS with `\t# X` (CommonMark indented code, NOT a + // heading) had its tab removed and was wrongly promoted to title/headingPath. + // The trim must strip only blank lines, preserving the first line's indent. + const content = "\t# NotAHeading\n\nbody text here."; + const chunks = chunkMarkdown(content, "docs/guide.md", mkConfig()); + expect(chunks[0].title).toBe("guide.md"); + expect(chunks[0].title).not.toBe("NotAHeading"); + expect(chunks[0].headingPath).toEqual([]); + }); + + it("does not promote a 4-space-indented `#` line at the DOCUMENT START to a heading", () => { + // 4 leading spaces = CommonMark indented code, not a heading — even when it + // is the document's first line (where the leading trim used to strip it). + const content = " # FourSpaces\n\nbody text here."; + const chunks = chunkMarkdown(content, "docs/guide.md", mkConfig()); + expect(chunks[0].title).toBe("guide.md"); + expect(chunks[0].title).not.toBe("FourSpaces"); + expect(chunks[0].headingPath).toEqual([]); + }); + + it("still adopts a 2-space-indented `#` line at the DOCUMENT START as a heading", () => { + // The valid complement: 0–3 leading spaces is a real ATX heading, including + // when it is the document's very first line. Preserving the first line's + // indent must not break this. 
+ const content = " # TwoSpaces\n\nbody text here."; + const chunks = chunkMarkdown(content, "docs/guide.md", mkConfig()); + expect(chunks[0].title).toBe("TwoSpaces"); + expect(chunks[0].headingPath).toContain("TwoSpaces"); + }); + it("falls back to full path when filename extraction fails", () => { const content = "Plain text."; const chunks = chunkMarkdown(content, "noext", mkConfig()); @@ -120,6 +231,479 @@ describe("chunkMarkdown", () => { expect(chunks[0].content).not.toContain(" { + const content = "\nTabbed content here\n"; + const chunks = chunkMarkdown(content, "test.mdx", mkConfig()); + expect(chunks[0].content).toContain("Tabbed content here"); + expect(chunks[0].content).not.toContain(" { + const content = 'Before\n\n\n\nAfter the motion.'; + const chunks = chunkMarkdown(content, "test.mdx", mkConfig()); + expect(chunks[0].content).not.toContain(" { + const content = "Before\n\n\n\nAfter the foobar."; + const chunks = chunkMarkdown(content, "test.mdx", mkConfig()); + expect(chunks[0].content).not.toContain(" { + const content = "\nDollar inner content\n"; + const chunks = chunkMarkdown(content, "test.mdx", mkConfig()); + expect(chunks[0].content).toContain("Dollar inner content"); + expect(chunks[0].content).not.toContain(" { + const content = [ + "## How to import", + "", + "```ts", + "import { Client } from '@my/sdk';", + "const c = new Client();", + "```", + "", + "Prose after.", + ].join("\n"); + const chunks = chunkMarkdown(content, "test.mdx", mkConfig()); + const joined = chunks.map((c) => c.content).join("\n"); + // The fenced import must survive verbatim, NOT be stripped as an MDX import. 
+ expect(joined).toContain("import { Client } from '@my/sdk';"); + expect(joined).toContain("const c = new Client();"); + }); + + it("preserves JSX (with `>` in an attribute value) inside a fenced block", () => { + const content = [ + "## Render", + "", + "```tsx", + 'const el = hello;', + "```", + "", + "End.", + ].join("\n"); + const chunks = chunkMarkdown(content, "test.mdx", mkConfig()); + const joined = chunks.map((c) => c.content).join("\n"); + expect(joined).toContain('const el = hello;'); + }); + + it("preserves JSX-looking content inside an inline code span", () => { + const content = + "## Inline\n\nUse `
x
` and `` in your text."; + const chunks = chunkMarkdown(content, "test.mdx", mkConfig()); + const joined = chunks.map((c) => c.content).join("\n"); + expect(joined).toContain("`
x
`"); + expect(joined).toContain("``"); + }); + + it("does not treat a col-0 inline triple-backtick span as a code fence (drops later headings)", () => { + // CommonMark §4.5: a backtick code-fence info string may NOT contain a + // backtick. A col-0 line like ```js``` text is therefore an INLINE code + // span, not a fence opener. The buggy matchFenceOpen opened a phantom fence + // with no closing line, running it to EOF and masking every following + // heading as "code" — so `## Beta` was dropped from the heading path. Each + // section is padded past the target so the splitter cuts on the headings and + // `## Beta` opens its own chunk (its heading then enters that chunk's path). + const big = "Word ".repeat(120).trim(); + const content = [ + "## Alpha", + "", + big, + "", + "```js``` quick inline example, prose after.", + "", + "## Beta", + "", + big, + ].join("\n"); + const chunks = chunkMarkdown( + content, + "test.md", + mkConfig({ target_tokens: 100, overlap_tokens: 0 }), + ); + const allHeadings = new Set(chunks.flatMap((c) => c.headingPath ?? [])); + // The later heading must survive (not be swallowed by a phantom fence). + expect(allHeadings.has("Beta")).toBe(true); + // The inline span line is preserved verbatim (not masked away as a fence). + const joined = chunks.map((c) => c.content).join("\n"); + expect(joined).toContain("```js``` quick inline example, prose after."); + // And no chunk may carry a half-open fence. Counting applies the CommonMark + // info-string rule (a backtick "fence" whose info string contains a backtick + // is an INLINE span, not a fence opener), so the inline ```js``` line is not + // miscounted as an unbalanced delimiter. 
+ const realFenceCount = (text: string): number => { + let count = 0; + for (const line of text.split("\n")) { + const m = line.match(/^ {0,3}(`{3,}|~{3,})([^\n]*)$/); + if (m && !m[2].includes(m[1][0])) count++; + } + return count; + }; + for (const chunk of chunks) { + expect(realFenceCount(chunk.content) % 2).toBe(0); + } + }); + + it("does not mispair backtick delimiters around a stray backtick (destroys inline code)", () => { + // maskInlineCode must pair an opening run of N backticks with the next run + // of EXACTLY N backticks (CommonMark). A stray/odd backtick between two + // genuine single-backtick spans must not cause the second span's content to + // be left unmasked and then gutted by the JSX strip pass. Both `` and + // `` must survive verbatim. + const content = [ + "## H", + "", + "Use `` here.", + "", + "Then a stray ` backtick in prose.", + "", + "And `` there.", + ].join("\n"); + const chunks = chunkMarkdown(content, "test.mdx", mkConfig()); + const joined = chunks.map((c) => c.content).join("\n"); + expect(joined).toContain(""); + expect(joined).toContain(""); + }); + + it("does not pair inline-code backticks across an ATX heading block boundary", () => { + // Per CommonMark, block parsing precedes inline parsing: an ATX heading is + // its own block and an inline code span cannot cross it. maskInlineCode runs + // AFTER maskHeadingLines (heading lines are already opaque sentinel tokens), + // so the closer-search must treat a heading-line boundary as a hard stop — + // mirroring the existing blank-line (paragraph break) guard. Otherwise the + // lone backtick on the `alpha` line wrongly pairs with the lone backtick on + // the `beta` line, forming a spurious code span ACROSS `## Middle Heading`; + // that span masks `` so it survives the JSX strip. Correctly, the + // `alpha`-line backtick has no valid same-block closer (the heading ends the + // block), so it is literal text and the prose JSX `` must be + // stripped. 
`` (after the heading) is on its own block and is + // always stripped. + const content = [ + "## H", + "", + "alpha ` one and tag", + "## Middle Heading", + "beta ` two and tag", + ].join("\n"); + const chunks = chunkMarkdown(content, "test.mdx", mkConfig()); + const joined = chunks.map((c) => c.content).join("\n"); + // The intervening heading still binds as a heading line (masked verbatim, + // not swallowed into a spurious cross-heading code span). + expect(joined).toContain("## Middle Heading"); + // Prose JSX on the lines adjacent to the heading is stripped (the spurious + // cross-heading code span must not protect it). + expect(joined).not.toContain(""); + expect(joined).not.toContain(""); + }); + + it("preserves inline-code JSX across a soft line break with no intervening heading", () => { + // Control for the cross-heading hard-stop fix: the SAME shape WITHOUT an + // intervening heading legitimately forms ONE code span — a code span may + // contain a soft line break, so the two lone backticks pair across the + // newline and the span content (including its JSX) is correctly preserved. + // Only a HEADING-line boundary is a hard stop; a soft line break is not. + const content = [ + "## H", + "", + "alpha ` one and tag", + "beta ` two and tag", + ].join("\n"); + const chunks = chunkMarkdown(content, "test.mdx", mkConfig()); + const joined = chunks.map((c) => c.content).join("\n"); + // The two lone backticks pair into a span; its inner JSX is preserved. + expect(joined).toContain(""); + // The trailing single backtick has no closer, so prose after it is stripped. + expect(joined).not.toContain(""); + }); + + it("does not let a side-effect import delete content up to the next import", () => { + // `import "./x.css";` (no `from`) must be removed on its own; the lazy + // `[\s\S]*?from` must NOT jump to the next import's `from`, deleting the + // heading + prose between the two imports. 
+ const content = [ + 'import "./reset.css";', + "", + "## Setup", + "", + "Important prose between two imports that must survive.", + "", + "import { X } from '@/x';", + "", + "Trailing prose.", + ].join("\n"); + const chunks = chunkMarkdown(content, "test.mdx", mkConfig()); + const joined = chunks.map((c) => c.content).join("\n"); + expect(joined).toContain("Setup"); + expect(joined).toContain( + "Important prose between two imports that must survive.", + ); + expect(joined).toContain("Trailing prose."); + // Both import statements are gone. + expect(joined).not.toContain('import "./reset.css";'); + expect(joined).not.toContain("import { X } from '@/x';"); + }); + + it("does not delete a prose line that merely looks import-like", () => { + // An English sentence starting with "import" and containing " from " is + // prose, not an MDX import statement. + const content = + 'You can import a value from "the library" without ceremony, easily.'; + const chunks = chunkMarkdown(content, "test.mdx", mkConfig()); + expect(chunks[0].content).toContain( + 'You can import a value from "the library" without ceremony, easily.', + ); + }); + + it("does not let a from-import strip span a blank line", () => { + // The from-import regex uses `\s+` around `from`, and `\s` matches newlines, + // so `import Config\n\nfrom "y";` is wrongly treated as ONE import statement + // and the whole `import Config\n\nfrom "y";` span (across the blank line) is + // deleted. The blank line means it is NOT a single import statement; the + // inter-token whitespace must forbid a newline so the dangling lines stay as + // literal text. 
+ const content = [ + "First real paragraph that must survive.", + "", + "import Config", + "", + 'from "the-module";', + "", + "Second real paragraph that must survive.", + ].join("\n"); + const chunks = chunkMarkdown(content, "test.mdx", mkConfig()); + const joined = chunks.map((c) => c.content).join("\n"); + expect(joined).toContain("First real paragraph that must survive."); + expect(joined).toContain("Second real paragraph that must survive."); + // The dangling `import Config` / `from "the-module";` lines span a blank + // line, so they are NOT a single import statement and must NOT be stripped — + // the over-broad `\s+from` (with `\s` matching `\n`) deleted the whole span. + expect(joined).toContain("import Config"); + expect(joined).toContain('from "the-module";'); + }); + + it("still strips a normal single-line from-import", () => { + // Regression guard for the blank-line fix: a real single-line import must + // still be stripped. + const content = 'import X from "./y";\n\nProse after the import line.'; + const chunks = chunkMarkdown(content, "test.mdx", mkConfig()); + const joined = chunks.map((c) => c.content).join("\n"); + expect(joined).not.toContain('import X from "./y";'); + expect(joined).toContain("Prose after the import line."); + }); + + it("does not let an unclosed from-import brace swallow following headings/prose", () => { + // BUG B1: the named-import alternative `\{[^}]*\}` uses `[^}]`, which also + // matches a newline. An `import {` whose closing `}` is lines away greedily + // consumes the intervening ATX heading(s) and prose, deleting them silently + // (no warning). The brace content must be bounded to a single line so a + // dangling `{` cannot devour subsequent markdown lines. 
+ const content = [ + "import {", + "## Eaten Heading", + "Eaten prose.", + '} from "@x";', + "", + "## Survivor", + "", + "Surviving prose.", + ].join("\n"); + const chunks = chunkMarkdown(content, "test.mdx", mkConfig()); + const joined = chunks.map((c) => c.content).join("\n"); + // The heading + prose that fell between the dangling `{` and its far-away + // `}` must SURVIVE in served content (they are markdown, not an import). + expect(joined).toContain("Eaten Heading"); + expect(joined).toContain("Eaten prose."); + // The "Survivor" heading must NOT become the document title: at the buggy + // HEAD the brace swallows everything up to `} from "@x";`, so the first + // surviving heading ("Survivor") is promoted to the title. + expect(chunks[0].title).not.toBe("Survivor"); + // The real survivor section is still present too. + expect(joined).toContain("Survivor"); + expect(joined).toContain("Surviving prose."); + }); + + it("strips a TypeScript `import type` declaration", () => { + // BUG B2: the from-import clause grammar lacks the TS `type` modifier, so + // `import type { Config } from "x"` (and `import type Foo` / `import type + // * as NS`) are not stripped and leak into served content. Add an optional + // `type` modifier after `import`. + const content = [ + 'import type { Config } from "@/types";', + 'import type Foo from "m";', + 'import type * as NS from "m";', + "", + "## Section", + "", + "Body prose that must survive.", + ].join("\n"); + const chunks = chunkMarkdown(content, "test.mdx", mkConfig()); + const joined = chunks.map((c) => c.content).join("\n"); + expect(joined).not.toContain("import type { Config }"); + expect(joined).not.toContain("import type Foo"); + expect(joined).not.toContain("import type * as NS"); + // The real content is unaffected. 
+ expect(joined).toContain("Section"); + expect(joined).toContain("Body prose that must survive."); + }); + + it("strips a self-closing JSX tag whose attribute value contains `>`", () => { + // OUTSIDE a fence, a self-closing tag must still be stripped even when an + // attribute value contains `>` (the old `[^>]*` truncated at the inner `>`). + const content = 'Before\n\n\n\nAfter the callout.'; + const chunks = chunkMarkdown(content, "test.mdx", mkConfig()); + expect(chunks[0].content).not.toContain("b"); + expect(chunks[0].content).toContain("Before"); + expect(chunks[0].content).toContain("After the callout"); + }); + + // ── MDX stripping: heading-awareness (component tags in headings) ──── + // + // The MDX JSX/import strip passes are GLOBAL regexes that historically ran over + // ATX heading lines too, deleting a ``/`..` that NAMES a + // component inside a heading. Since title, headingPath, and served content all + // derive from the post-strip body, the heading was corrupted everywhere. The + // strip must treat heading lines as protected regions (like fenced/inline code). + + it("preserves a self-closing component tag in the MIDDLE of an ATX heading", () => { + const content = "## The Provider\n\nBody text under it."; + const chunks = chunkMarkdown(content, "test.mdx", mkConfig()); + // Title must retain the tag verbatim (no "The Provider" with the tag gone). + expect(chunks[0].title).toBe("The Provider"); + // headingPath entry likewise retains the tag. + const allHeadings = new Set(chunks.flatMap((c) => c.headingPath ?? [])); + expect(allHeadings.has("The Provider")).toBe(true); + // Served content keeps the heading line intact. 
+ expect(chunks[0].content).toContain("## The Provider"); + }); + + it("preserves a tag-only ATX heading (heading does not vanish)", () => { + const content = "## \n\nBody under the badge heading."; + const chunks = chunkMarkdown(content, "test.mdx", mkConfig()); + // The whole heading is a component tag — it must NOT be deleted (which would + // make the title fall back to the filename and drop the heading entirely). + expect(chunks[0].title).toBe(""); + const allHeadings = new Set(chunks.flatMap((c) => c.headingPath ?? [])); + expect(allHeadings.has("")).toBe(true); + expect(chunks[0].content).toContain("## "); + }); + + it("preserves a component tag at the END of an ATX heading", () => { + const content = "## Install \n\nBody under the heading."; + const chunks = chunkMarkdown(content, "test.mdx", mkConfig()); + expect(chunks[0].title).toBe("Install "); + const allHeadings = new Set(chunks.flatMap((c) => c.headingPath ?? [])); + expect(allHeadings.has("Install ")).toBe(true); + expect(chunks[0].content).toContain("## Install "); + }); + + it("preserves a paired component tag in an ATX heading verbatim", () => { + const content = "## Use x here\n\nBody under the heading."; + const chunks = chunkMarkdown(content, "test.mdx", mkConfig()); + // A paired tag in a heading must not be reduced to its inner content. + expect(chunks[0].title).toBe("Use x here"); + expect(chunks[0].content).toContain("## Use x here"); + }); + + it("still strips a JSX tag in a PROSE line (heading-awareness is line-scoped)", () => { + // Regression guard: making the strip heading-aware must NOT stop it from + // stripping JSX on ordinary prose lines. + const content = "## Heading\n\nUse the in your prose here."; + const chunks = chunkMarkdown(content, "test.mdx", mkConfig()); + expect(chunks[0].content).not.toContain(" { + // Regression guard: heading-awareness must not regress fence/inline-span + // masking (the prior fixes for those must keep holding). 
+ const content = [ + "## The Provider", + "", + "Inline `
x
` survives.", + "", + "```tsx", + "const el = ;", + "```", + "", + "Trailing prose with a tag.", + ].join("\n"); + const chunks = chunkMarkdown(content, "test.mdx", mkConfig()); + const joined = chunks.map((c) => c.content).join("\n"); + expect(joined).toContain("## The Provider"); + expect(joined).toContain("`
x
`"); + expect(joined).toContain("const el = ;"); + expect(joined).not.toContain(" { + const content = "## Heading ##\n\nBody text under the heading."; + const chunks = chunkMarkdown(content, "test.md", mkConfig()); + expect(chunks[0].title).toBe("Heading"); + }); + + it("strips a single trailing `#` from the title", () => { + const content = "# Title #\n\nBody text under the heading."; + const chunks = chunkMarkdown(content, "test.md", mkConfig()); + expect(chunks[0].title).toBe("Title"); + }); + + it("strips a docs `{#anchor}` from the title", () => { + const content = "## Config {#configuration}\n\nBody under the heading."; + const chunks = chunkMarkdown(content, "test.md", mkConfig()); + expect(chunks[0].title).toBe("Config"); + }); + + it("strips closing-`#`/anchor from headingPath entries too", () => { + // Force a split so a later chunk opens with each tricky heading and its + // headingPath is inspectable. + const big = "Word ".repeat(300).trim(); + const content = [ + "## Intro", + "", + big, + "", + "## Setup ##", + "", + big, + "", + "## Options {#opts}", + "", + big, + ].join("\n"); + const chunks = chunkMarkdown( + content, + "test.md", + mkConfig({ target_tokens: 100, overlap_tokens: 0 }), + ); + const allHeadings = new Set(chunks.flatMap((c) => c.headingPath ?? [])); + expect(allHeadings.has("Setup")).toBe(true); + expect(allHeadings.has("Options")).toBe(true); + expect(allHeadings.has("Setup ##")).toBe(false); + expect(allHeadings.has("Options {#opts}")).toBe(false); + }); + // ── Basic chunking ────────────────────────────────────────────────── it("returns a single chunk for small content", () => { @@ -172,25 +756,267 @@ describe("chunkMarkdown", () => { expect(chunks.length).toBeGreaterThanOrEqual(2); }); + it("splits on a 1-3-space-indented ATX heading that is the ONLY boundary", () => { + // CommonMark allows 0-3 leading spaces before the hashes. 
The heading + // *detectors* (extractFirstHeading / getHeadingPathAtPosition) already honor + // that, but the heading *splitter* (splitOnHeading) historically anchored at + // column 0 only, so a 1-3-space-indented heading was NOT used as a section + // boundary even though it fed the headingPath — the detectors disagreed. + // + // This doc is constructed so the indented `## Section Two` heading is the + // SOLE available split boundary: there are no blank lines (so paragraph + // splitting yields a single part and cannot pre-empt the heading split), and + // each body is several short lines. With a column-0-only splitter the whole + // doc falls through to the line-split fallback, which never re-attaches a + // heading marker, so `## Section Two` ends up fused mid-chunk instead of + // opening its own chunk. An indent-aware splitter cuts cleanly at the + // heading. We assert via the chunk *boundary* (a chunk opens with the + // indented heading) — decoupled from the implementation regex. + const body1 = Array.from( + { length: 8 }, + (_, i) => `Alpha line ${i} content here.`, + ).join("\n"); + const body2 = Array.from( + { length: 8 }, + (_, i) => `Beta line ${i} content here.`, + ).join("\n"); + const content = [" ## Section One", body1, " ## Section Two", body2].join( + "\n", + ); + + const chunks = chunkMarkdown( + content, + "test.md", + mkConfig({ target_tokens: 100, overlap_tokens: 0 }), + ); + + // The indented heading must create a clean section boundary: some chunk + // opens with it (leading indent is trimmed off the stored content). + const opensWithSectionTwo = chunks.some((c) => + c.content.startsWith("## Section Two"), + ); + expect(opensWithSectionTwo).toBe(true); + + // And both indented headings must still feed the heading path (detector + // consistency: split boundary AND headingPath agree). + const allHeadings = new Set(chunks.flatMap((c) => c.headingPath ?? 
[])); + expect(allHeadings.has("Section One")).toBe(true); + expect(allHeadings.has("Section Two")).toBe(true); + }); + // ── Heading path tracking ─────────────────────────────────────────── it("tracks heading path for chunks under h2", () => { - const content = "## Getting Started\n\nContent under getting started."; - const chunks = chunkMarkdown(content, "test.md", mkConfig()); - expect(chunks[0].headingPath).toBeDefined(); - // The heading path should include "Getting Started" - if (chunks[0].headingPath && chunks[0].headingPath.length > 0) { - expect(chunks[0].headingPath).toContain("Getting Started"); - } + // A large intro section precedes "## Getting Started", so the splitter emits + // the Getting Started section as its own chunk that OPENS with the heading + // (i.e. not at offset 0 of the document). That chunk must carry its own + // leading heading in its path — it is not enough for only the body chunk + // after the heading to be tagged. + const big = "Word ".repeat(300).trim(); + const content = `## Intro\n\n${big}\n\n## Getting Started\n\n${big}`; + const chunks = chunkMarkdown( + content, + "test.md", + mkConfig({ target_tokens: 100, overlap_tokens: 0 }), + ); + const startedChunk = chunks.find((c) => + c.content.startsWith("## Getting Started"), + ); + expect(startedChunk).toBeDefined(); + expect(startedChunk!.headingPath).toContain("Getting Started"); }); it("tracks nested heading hierarchy", () => { - const body = "Content here. ".repeat(5); - const content = `## Parent\n\n### Child\n\n${body}`; + // Put a preceding section before the nested Parent/Child headings so the + // Child section lands in a chunk that does not begin at offset 0. The chunk + // that OPENS with "### Child" must carry the full hierarchy (both Parent and + // its own Child heading) — exercising inclusion of a chunk's own leading + // heading in addition to its ancestors. 
+ const big = "Word ".repeat(300).trim(); + const content = `## Preamble\n\n${big}\n\n## Parent\n\n### Child\n\n${big}`; + const chunks = chunkMarkdown( + content, + "test.md", + mkConfig({ target_tokens: 100, overlap_tokens: 0 }), + ); + const childChunk = chunks.find((c) => c.content.startsWith("### Child")); + expect(childChunk).toBeDefined(); + expect(childChunk!.headingPath).toContain("Parent"); + expect(childChunk!.headingPath).toContain("Child"); + }); + + it("does not treat a heading-like line inside a fenced code block as a real heading", () => { + // A `## Example` line lives INSIDE a fenced code block. Heading-path + // computation must skip fenced regions, so this fake heading must never + // appear in any chunk's headingPath (the headingPath is embedded into the + // retrieval vector, so a fake heading pollutes search). + const content = [ + "## Real Heading", + "", + "Some prose before the code block.", + "", + "```md", + "## Example", + "This line is documentation shown inside a code fence.", + "```", + "", + "Prose after the code block lives under Real Heading only.", + ].join("\n"); + const chunks = chunkMarkdown(content, "test.md", mkConfig()); - // At least one chunk should have heading path with Parent and Child - const lastChunk = chunks[chunks.length - 1]; - expect(lastChunk.headingPath).toBeDefined(); + for (const chunk of chunks) { + expect(chunk.headingPath).not.toContain("Example"); + } + // Sanity: the genuine heading is still tracked. + const anyHasReal = chunks.some((c) => + (c.headingPath ?? []).includes("Real Heading"), + ); + expect(anyHasReal).toBe(true); + }); + + it("does not treat a heading-like line inside a TILDE fenced code block as a real heading", () => { + // CommonMark/MDX allow `~~~` tilde fences in addition to backtick fences. 
+ // A `## Example` line inside a `~~~` block is documentation, not a heading, + // so it must never enter any chunk's headingPath, and the tilde code block + // must not be split across chunks (it is a single atomic segment). + const filler = "Word ".repeat(120).trim(); + const content = [ + "## Real Heading", + "", + filler, + "", + "~~~md", + "## Example", + "This line is documentation shown inside a tilde code fence.", + "~~~", + "", + filler, + ].join("\n"); + + const chunks = chunkMarkdown( + content, + "test.md", + mkConfig({ target_tokens: 80, overlap_tokens: 0 }), + ); + + for (const chunk of chunks) { + expect(chunk.headingPath ?? []).not.toContain("Example"); + } + // The tilde code block stays intact in a single chunk (fence open + the + // documentation heading line + fence close together, never severed). + const intact = chunks.some( + (c) => + c.content.includes("~~~md") && + c.content.includes("## Example") && + c.content.includes("tilde code fence"), + ); + expect(intact).toBe(true); + // Sanity: the genuine heading is still tracked somewhere. + const anyHasReal = chunks.some((c) => + (c.headingPath ?? []).includes("Real Heading"), + ); + expect(anyHasReal).toBe(true); + }); + + it("does not inject a fake heading when a chunk boundary lands mid-fenced-block", () => { + // Build content so a chunk boundary (`position`) lands INSIDE a fenced code + // block that contains a `#`-prefixed line before that boundary. If heading + // detection re-segments a truncated slice that severs the fence, the + // unclosed fence is misread as text and the in-fence `#`-line is injected as + // a fake heading. Detection must segment the FULL content and filter by + // absolute offset so the fence stays closed and the fake heading is ignored. 
+ const filler = "Word ".repeat(200).trim(); + const longCodeLine = "x = ".repeat(400).trim(); + const content = [ + "## Genuine", + "", + filler, + "", + "```python", + "# not-a-heading inside the fence", + longCodeLine, + "more code line one", + "more code line two", + "```", + "", + filler, + ].join("\n"); + + const chunks = chunkMarkdown( + content, + "test.md", + mkConfig({ target_tokens: 60, overlap_tokens: 0 }), + ); + + for (const chunk of chunks) { + expect(chunk.headingPath ?? []).not.toContain( + "not-a-heading inside the fence", + ); + } + }); + + it("includes a chunk's own leading heading in its heading path", () => { + // A chunk that begins with its own heading must include that heading in its + // headingPath. Build sections large enough to split so a later chunk opens + // with its own "## Beta" heading. + const para = "Word ".repeat(300).trim(); + const content = `## Alpha\n\n${para}\n\n## Beta\n\n${para}`; + const chunks = chunkMarkdown( + content, + "test.md", + mkConfig({ target_tokens: 100, overlap_tokens: 0 }), + ); + // Find the chunk whose content opens with the Beta heading. + const betaChunk = chunks.find((c) => c.content.startsWith("## Beta")); + expect(betaChunk).toBeDefined(); + expect(betaChunk!.headingPath).toContain("Beta"); + }); + + it("binds the correct heading path when identical text repeats under different headings", () => { + // Two sections contain a byte-identical paragraph, but the second + // occurrence lives under an additional (deeper) heading. Heading-path + // assignment locates each chunk by searching for its text in the source; + // if the search cursor is not advanced past a matched chunk, the second + // occurrence re-finds the FIRST position and inherits the WRONG heading + // path. Since the heading path is embedded into the retrieval vector, a + // mis-bind degrades search — so this must resolve to the deeper section. + const repeated = "Install the package then configure the client object. 
" + .repeat(2) + .trim(); + const filler = "Another shared block of text that appears multiple times. " + .repeat(3) + .trim(); + const content = [ + "## Common", + "", + filler, + "", + "## Delta", + "", + repeated, // first occurrence: directly under Delta + "", + "#### Setup", + "", + filler, + "", + repeated, // second occurrence: under Delta -> Setup + ].join("\n"); + + const chunks = chunkMarkdown( + content, + "test.md", + mkConfig({ target_tokens: 100, overlap_tokens: 0 }), + ); + + // The last chunk whose body is the repeated paragraph is the one that + // physically lives under "#### Setup", so its heading path must include + // BOTH "Delta" and "Setup" — not just "Delta" (the first occurrence's + // shallower path). + const repeatedChunks = chunks.filter((c) => c.content.trim() === repeated); + expect(repeatedChunks.length).toBeGreaterThanOrEqual(1); + const setupChunk = repeatedChunks[repeatedChunks.length - 1]; + expect(setupChunk.headingPath).toContain("Delta"); + expect(setupChunk.headingPath).toContain("Setup"); }); // ── Code block preservation ───────────────────────────────────────── @@ -218,23 +1044,44 @@ describe("chunkMarkdown", () => { // ── Overlap ───────────────────────────────────────────────────────── - it("applies overlap between chunks", () => { + it("applies overlap without corrupting line boundaries of chunk content", () => { + // Each section ends in a long single line of words (no newline in the last + // `overlapChars`). With a naive overlap that prepends the raw partial tail + // with no separator, the previous tail jams directly onto the next chunk's + // leading content — e.g. "## Section 0" + "Word..." becomes + // "## Section 0Word..." and a body tail fuses onto the next heading as + // "...Word## Section 1". Both push a heading off its own line and fuse + // words. The overlap must drop the partial leading line and join with a + // separator so every heading stays at line-start. 
const sections = Array.from( { length: 5 }, - (_, i) => `## Section ${i}\n\n${"Word ".repeat(200)}`, + (_, i) => `## Section ${i}\n\n${"Word ".repeat(200).trim()}`, ).join("\n\n"); const chunks = chunkMarkdown( sections, "test.md", mkConfig({ target_tokens: 100, overlap_tokens: 20 }), ); - if (chunks.length >= 2) { - // Second chunk should contain some text from the end of the first chunk - // (overlap means shared content) - const firstEnd = chunks[0].content.slice(-50); - // At least some portion should appear in chunk 1 - // This is a loose check since overlap is character-based and may break at word boundaries - expect(chunks[1].content.length).toBeGreaterThan(0); + expect(chunks.length).toBeGreaterThanOrEqual(2); + + // Across ALL chunks, every "## Section " heading must occupy its own + // physical line: nothing fused before "##" and only the heading text after + // it. This catches both the head-fusion ("## Section 0Word") and the + // tail-fusion ("Word## Section 1") signatures of the overlap bug. + for (const chunk of chunks) { + // No non-whitespace, non-`#` character may immediately precede an ATX + // heading marker (`##`..`######` + space). The `[^\n#]` guard is what + // keeps a legitimate deeper heading (e.g. an h3 `### Foo`, where the char + // before the final `##…` IS a `#`) from being misread as tail-fusion — + // the old `/\S#{2}\s/` matched ANY two `#`, so `### Foo` would falsely + // trip it. This scans for the real tail-fusion signature ("Word## …"). + expect(chunk.content).not.toMatch(/^[^\n#]+#{2,6}\s/m); + // A heading line must be exactly "## Section " with nothing fused after + // the number (the next char is end-of-line or end-of-string). + const headingLines = chunk.content.match(/^#{2,}.*$/gm) ?? 
[]; + for (const line of headingLines) { + expect(line).toMatch(/^#{2,6} Section \d+$/); + } } }); @@ -251,6 +1098,199 @@ describe("chunkMarkdown", () => { expect(chunks.length).toBeGreaterThan(1); }); + it("ACTUALLY applies overlap for single-line prose paragraphs", () => { + // The dominant markdown content shape is a paragraph that is ONE physical + // line (no embedded newline). The buggy applyOverlap took the last `\n` in + // the overlap window and, finding none in a single-line chunk, dropped the + // overlap entirely — making overlap a NO-OP for the most common content. A + // word-boundary tail of the previous chunk must actually be prepended. + const paragraphs = Array.from( + { length: 12 }, + (_, i) => + `Paragraph ${i} ${`distinctword${i}word `.repeat(40).trim()} endmarker${i}`, + ).join("\n\n"); + const chunks = chunkMarkdown( + paragraphs, + "test.md", + mkConfig({ target_tokens: 60, overlap_tokens: 20 }), + ); + expect(chunks.length).toBeGreaterThan(2); + // At least one chunk[i] (i>=1) must contain a trailing fragment of the + // previous chunk's content — the `endmarker` token from the chunk that + // ended before it. This is the coverage that was missing and hid the bug. + let foundCarriedTail = false; + for (let i = 1; i < chunks.length; i++) { + const prev = chunks[i - 1].content; + const prevMarker = prev.match(/endmarker\d+/g)?.pop(); + if (prevMarker && chunks[i].content.includes(prevMarker)) { + // The marker from the previous chunk's tail leaked into this chunk — + // overlap was actually applied. 
+ foundCarriedTail = true; + break; + } + } + expect(foundCarriedTail).toBe(true); + }); + + it("overlap size roughly scales with overlap_tokens for single-line prose", () => { + const paragraphs = Array.from( + { length: 12 }, + (_, i) => `Paragraph ${i} ${"wordy ".repeat(60).trim()}`, + ).join("\n\n"); + const small = chunkMarkdown( + paragraphs, + "test.md", + mkConfig({ target_tokens: 60, overlap_tokens: 5 }), + ); + const large = chunkMarkdown( + paragraphs, + "test.md", + mkConfig({ target_tokens: 60, overlap_tokens: 40 }), + ); + // Larger overlap_tokens ⇒ more total bytes across chunks (the overlapped + // tails are bigger). With zero overlap applied (the bug), both would be + // byte-identical and this sum comparison would be equal. + const totalSmall = small.reduce((n, c) => n + c.content.length, 0); + const totalLarge = large.reduce((n, c) => n + c.content.length, 0); + expect(totalLarge).toBeGreaterThan(totalSmall); + }); + + it("overlap on single-line prose never exceeds the requested window", () => { + // The prepended tail must roughly honor overlapChars (overlap_tokens * 4) — + // it must not dump the entire previous chunk. We allow generous slack for + // the word-boundary snap and the "\n\n" separator. + const overlapTokens = 10; + const paragraphs = Array.from( + { length: 10 }, + (_, i) => `Para ${i} ${"token ".repeat(80).trim()}`, + ).join("\n\n"); + const chunks = chunkMarkdown( + paragraphs, + "test.md", + mkConfig({ target_tokens: 60, overlap_tokens: overlapTokens }), + ); + const overlapChars = overlapTokens * 4; + // For each chunk[i>=1], the leading prepended fragment (before the "\n\n" + // that separates it from the chunk's real content) must be ≤ overlapChars + // plus modest slack for word-boundary snapping. 
+ for (let i = 1; i < chunks.length; i++) { + const content = chunks[i].content; + const sep = content.indexOf("\n\n"); + if (sep === -1) continue; // no overlap prepended on this boundary + const lead = content.slice(0, sep); + // The prepended overlap must not blow past the requested window. + expect(lead.length).toBeLessThanOrEqual(overlapChars + 20); + } + }); + + // The tail's START must snap to a word boundary so the embedded/served text + // does not begin in the middle of a word. The source is built from a VARIED- + // LENGTH vocabulary so an arbitrary overlapChars window almost never lands on + // a word boundary by accident — a uniform "alphabeta " filler aligns to a + // boundary at certain overlap_tokens and hides a dead word-boundary snap. We + // parametrize over several overlap_tokens (incl. the production default 50) so + // a genuine mid-word cut is exercised. The FIRST whitespace-delimited token of + // every prepended overlap must be a COMPLETE word from the vocabulary — never + // a suffix like "enta" cut from the middle of "documentation". 
+ for (const overlapTokens of [16, 18, 24, 50]) { + it(`overlap does not start mid-word for single-line prose (overlap_tokens=${overlapTokens})`, () => { + const vocab = new Set([ + "Para", + "the", + "quick", + "brown", + "fox", + "jumps", + "over", + "lazy", + "dog", + "documentation", + "configuration", + "alphabeta", + "x", + ]); + const words = [ + "the", + "quick", + "brown", + "fox", + "jumps", + "over", + "the", + "lazy", + "dog", + "documentation", + "x", + "configuration", + "alphabeta", + ]; + const paragraphs = Array.from({ length: 8 }, (_, i) => { + const body = Array.from( + { length: 90 }, + (_, j) => words[(i + j) % words.length], + ).join(" "); + return `Para ${i} ${body}`; + }).join("\n\n"); + const chunks = chunkMarkdown( + paragraphs, + "test.md", + mkConfig({ target_tokens: 60, overlap_tokens: overlapTokens }), + ); + for (let i = 1; i < chunks.length; i++) { + const content = chunks[i].content; + const sep = content.indexOf("\n\n"); + if (sep === -1) continue; + const lead = content.slice(0, sep).trim(); + if (lead === "") continue; + const firstToken = lead.split(/\s+/)[0]; + // A purely numeric token (the paragraph index) is also a whole word. + const isWholeWord = vocab.has(firstToken) || /^\d+$/.test(firstToken); + expect( + isWholeWord, + `overlap lead began mid-word: "${firstToken}"`, + ).toBe(true); + } + }); + } + + it("strips literal PUA sentinels from input so they cannot corrupt chunks", () => { + // The chunker masks inline-code spans with U+E000/U+E001 and heading lines + // with U+E002/U+E003. A hostile/exotic source that contains those literal + // code points — especially in a sentinel SHAPE like `0` — would + // collide with the placeholder namespace: the heading-restore pass would + // rewrite the prose sequence into heading index 0's text. chunkMarkdown must + // strip all four code points up front so no sentinel survives into a served + // chunk and no collision can occur. 
+ const OPEN_H = String.fromCharCode(0xe002); + const CLOSE_H = String.fromCharCode(0xe003); + const OPEN_C = String.fromCharCode(0xe000); + const CLOSE_C = String.fromCharCode(0xe001); + const doc = [ + "# Real Heading", + "", + // A literal heading-sentinel SHAPE pointing at index 0 — the exact + // collision the input strip must neutralize. + `Prose with a literal ${OPEN_H}0${CLOSE_H} sentinel and a ${OPEN_C}1${CLOSE_C} code sentinel.`, + ].join("\n"); + const chunks = chunkMarkdown( + doc, + "test.md", + mkConfig({ target_tokens: 60, overlap_tokens: 0 }), + ); + expect(chunks.length).toBeGreaterThan(0); + for (const chunk of chunks) { + // No PUA sentinel may survive into the served chunk text. + expect(/[\u{E000}-\u{E003}]/u.test(chunk.content)).toBe(false); + // The prose must NOT have been rewritten into the real heading's text. + if (chunk.content.includes("Prose with a literal")) { + expect(chunk.content).not.toContain("Real Heading sentinel"); + } + } + // The genuine heading still binds correctly (the strip only removed the + // literal sentinels, not the real `#` heading). + expect(chunks[0].headingPath).toContain("Real Heading"); + }); + // ── Chunk config parameters ───────────────────────────────────────── it("uses default target_tokens when not specified", () => { @@ -319,6 +1359,51 @@ describe("chunkMarkdown", () => { expect(chunks[0].content).toContain("Body text"); }); + it("detects headings and fences in a CRLF-authored body (no frontmatter)", () => { + // CRITICAL: the single-line predicates use `$`/`.` (which do not match `\r`) + // and the chunker splits on `\n` only, leaving a trailing `\r` on every + // line. Without normalization a CRLF doc gets title=filename, headingPath=[] + // on every chunk, and a code fence runs to EOF. chunkMarkdown must normalize + // `\r\n` → `\n` once up front so heading + fence detection work. 
Each section + // is padded past the target so the splitter cuts on the headings and the + // post-fence `## Second` opens its own chunk (its heading then enters that + // chunk's path) — proving the CRLF fence closed rather than running to EOF. + const big = "Word ".repeat(120).trim(); + const content = [ + "## Title", + "", + big, + "", + "```js", + "const x = 1;", + "```", + "", + "## Second", + "", + big, + ].join("\r\n"); + const chunks = chunkMarkdown( + content, + "test.md", + mkConfig({ target_tokens: 100, overlap_tokens: 0 }), + ); + // (a) Title comes from the first heading, not the filename. + expect(chunks[0].title).toBe("Title"); + // (b) headingPath is populated (not the degraded []). + const allHeadings = new Set(chunks.flatMap((c) => c.headingPath ?? [])); + expect(allHeadings.has("Title")).toBe(true); + // (c) The CRLF fence is closed/atomic — the following heading surfaces as + // its own boundary instead of being collapsed into the fence-to-EOF. + expect(allHeadings.has("Second")).toBe(true); + // No chunk carries a half-open fence. + for (const chunk of chunks) { + const fenceDelims = ( + chunk.content.match(/^ {0,3}(?:`{3,}|~{3,})/gm) ?? [] + ).length; + expect(fenceDelims % 2).toBe(0); + } + }); + // ── Line splitting fallback ───────────────────────────────────────── it("falls back to line splitting for very long paragraphs", () => { @@ -335,6 +1420,42 @@ describe("chunkMarkdown", () => { expect(chunks.length).toBeGreaterThan(1); }); + it("preserves single-newline structure and heading path in the line-split fallback", () => { + // A single heading-tagged paragraph with NO blank-line boundaries that is + // long enough (well over targetChars) to force the line-split fallback. 
+ // The fallback must NOT double the source single newlines (content fidelity + // — embeddings/snippets must match source), and because the produced chunk + // text must remain a verbatim substring of the source, indexOf still binds + // the heading path (otherwise it degrades to []). + const body = Array.from({ length: 200 }, (_, i) => `Line ${i} text.`).join( + "\n", + ); + const content = `## Long Section\n\n${body}`; + const chunks = chunkMarkdown( + content, + "test.md", + mkConfig({ target_tokens: 100, overlap_tokens: 0 }), + ); + + // Must have actually split into multiple chunks via the line-split path. + expect(chunks.length).toBeGreaterThan(1); + + // (a) Content fidelity: adjacent source lines stay single-newline-joined in + // the produced chunk text — never blank-line ("\n\n") separated, which + // is the newline-doubling signature. + const joined = chunks.map((c) => c.content).join("\n"); + expect(joined).toContain("Line 1 text.\nLine 2 text."); + expect(joined).not.toContain("Line 1 text.\n\nLine 2 text."); + + // (b) Heading retention: the line-split body chunks must keep their heading + // path (the section heading), not degrade to []. + const bodyChunks = chunks.filter((c) => /Line \d+ text\./.test(c.content)); + expect(bodyChunks.length).toBeGreaterThan(0); + for (const chunk of bodyChunks) { + expect(chunk.headingPath).toContain("Long Section"); + } + }); + // ── Very long single line ─────────────────────────────────────────── it("handles a very long single line", () => { @@ -365,3 +1486,1633 @@ describe("chunkMarkdown", () => { expect(chunks).toEqual([]); }); }); + +// ── Heading-path detection precision ────────────────────────────────────── +// +// The chunker embeds headingPath into the retrieval vector, so a missed or +// fabricated heading directly degrades search. 
// These pin the `\s+`-after-hashes
// imprecision in getHeadingPathAtPosition: `\s` includes `\n`, so a `##` line
// with no inline text skips the newline and captures the NEXT line as the
// heading text, injecting a fake heading into the path.

describe("chunkMarkdown heading-path precision", () => {
  // A chunk's headingPath only includes a heading it opens with or one that
  // precedes it, so every section is padded until the document has to split:
  // the next `##` heading then begins a fresh chunk whose path can be checked.
  const SECTION_PAD = "Filler sentence. ".repeat(120);

  // Joins the given source lines into one markdown document and chunks it with
  // the small, overlap-free configuration both cases below rely on.
  const chunkLines = (lines: string[]) =>
    chunkMarkdown(
      lines.join("\n"),
      "test.md",
      mkConfig({ target_tokens: 100, overlap_tokens: 0 }),
    );

  it("does not inject the next line as a heading for a bare `##` line", () => {
    // The middle "heading" is nothing but `##`. A `\s+` separator would step
    // over the newline and adopt the body line below as heading text; requiring
    // `[ \t]+\S` rejects the line instead.
    const injected = "Injected body line should NOT become a heading.";
    const chunks = chunkLines([
      "## Real Heading",
      "",
      SECTION_PAD,
      "",
      "##", // bare hashes, no text
      injected,
      "",
      SECTION_PAD,
      "",
      "## Tail Heading",
      "",
      "Tail body.",
    ]);

    // The body line under the bare `##` must not show up in any heading path —
    // that would be the fake-heading injection signature.
    chunks.forEach((chunk) => {
      expect(chunk.headingPath).not.toContain(injected);
    });

    // Sanity: the genuine headings on either side are still captured somewhere.
    const seenHeadings = chunks.flatMap((c) => c.headingPath ?? []);
    expect(seenHeadings.includes("Real Heading")).toBe(true);
    expect(seenHeadings.includes("Tail Heading")).toBe(true);
  });

  it("does not treat a `##` line with only trailing spaces as a heading", () => {
    // Hashes followed by nothing but spaces/tabs carry whitespace-only text and
    // are not a real heading; the `\S.*` requirement (non-space first
    // character) excludes them.
    const chunks = chunkLines([
      "## Anchor Heading",
      "",
      SECTION_PAD,
      "",
      "## ", // hashes + only trailing spaces
      "Spaces-only-heading body line.",
      "",
      SECTION_PAD,
      "",
      "## Closing Heading",
      "",
      "Closing body.",
    ]);

    // Neither an empty heading text nor the following body line may appear in
    // any heading path.
    chunks.forEach((chunk) => {
      expect(chunk.headingPath).not.toContain("");
      expect(chunk.headingPath).not.toContain("Spaces-only-heading body line.");
    });
  });
});

// ── CHUNKER STRUCTURAL INVARIANTS (the convergence lever) ─────────────────
//
// The chunker has multiple split + heading-extraction paths (splitOnHeading,
// splitPreservingCodeBlocks, recursiveSplit's line-split fallback,
// getHeadingPathAtPosition, extractFirstHeading) that each have to enforce the
// same invariants. Prior rounds patched one path at a time and the next round
// found another. Rather than pin one point, this table-driven property test runs
// chunkMarkdown over a corpus of varied markdown and asserts, for EVERY produced
// chunk, the WHOLE class of invariants with an INDEPENDENT oracle (the oracle
// re-derives heading/fence facts directly, never via the production functions):
//
// (I1) FENCE INTEGRITY: no chunk contains an unbalanced/half-open code fence,
// and no fenced code block (``` or ~~~) is split across chunk boundaries.
// (I2) VERBATIM FIDELITY: each chunk's raw (pre-overlap) text is a verbatim
// substring of the cleaned body, so no "heading-path lookup failed"
// degradation warning ever fires.
+// (I3) HEADING SOUNDNESS + COMPLETENESS: every headingPath entry is a real ATX +// heading (CommonMark: 0–3 leading spaces, 1–6 `#`, space/tab, non-space +// text) located OUTSIDE fenced code; and every real heading that owns +// indexable body is captured by at least one chunk. +// (I4) TITLE SOUNDNESS: the derived title is a real ATX heading outside fences +// (never a `#`-comment inside a fence), or the filename fallback. +// +// headingPath and title are embedded into the retrieval vector, and a severed +// fence corrupts stored chunk text, so each invariant directly guards search +// quality. Setext headings (`Title\n====`) are a KNOWN unsupported limitation +// and are intentionally NOT asserted here (out of scope). +describe("chunkMarkdown structural invariants", () => { + // Independent oracle helpers re-derive CommonMark heading/fence facts + // line-by-line. They MUST NOT import or reuse the module's internal predicate + // (that would re-couple oracle and production and let production bugs hide). + // The oracle encodes CORRECT CommonMark so it DISAGREES with buggy production + // (RED) and AGREES with fixed production (GREEN). + + // An OPENING fence is 0–3 leading spaces, then a run of ≥3 backticks or ≥3 + // tildes (CommonMark allows the same 0–3-space indent as a heading; column 0 + // is not required). Returns the fence char + run length, or null. + // + // CommonMark §4.5: the info string after a BACKTICK fence run may NOT contain + // a backtick (otherwise the line is an inline code span, not a fence opener); + // a TILDE fence's info string may contain backticks but not tildes. The oracle + // encodes this so it does not share production's historical blind spot of + // opening a phantom fence on a col-0 inline ```lang``` text span. 
function matchFenceOpen(line: string): { char: string; len: number } | null {
  // Group 1 is the fence run (>= 3 backticks or >= 3 tildes) after at most
  // three spaces of indent; group 2 is everything after the run (the info
  // string, possibly empty).
  const m = line.match(/^ {0,3}(`{3,}|~{3,})([^\n]*)$/);
  if (!m) return null;
  const run = m[1] ?? "";
  const info = m[2] ?? "";
  const char = run.charAt(0);
  // An info string that repeats the fence character disqualifies the line as
  // an opener. For backticks this keeps an inline ```code``` span from opening
  // a phantom fence.
  // NOTE(review): CommonMark §4.5 only forbids backticks after a BACKTICK
  // fence; a tilde fence's info string may contain tildes. This check is
  // therefore stricter than the spec for `~~~` openers — confirm that is the
  // behavior production is meant to have before relaxing it.
  if (info.includes(char)) return null;
  return { char, len: run.length };
}

// A CLOSING fence is the SAME fence char, 0–3-space indent, a run of LENGTH
// ≥ the opener (CommonMark permits a longer closing fence), and only trailing
// spaces/tabs after the run (no info string on a closing fence).
function isFenceClose(
  line: string,
  open: { char: string; len: number },
): boolean {
  // Anything that is not a backtick opener is treated as a tilde opener.
  const fenceChar = open.char === "`" ? "`" : "~";
  // The fence char is backslash-escaped so it is always matched literally; the
  // `{len,}` quantifier enforces "at least as long as the opening run".
  const re = new RegExp(`^ {0,3}(\\${fenceChar}{${open.len},})[ \\t]*$`);
  return re.test(line);
}

// Strip a CommonMark trailing closing-`#` sequence and a Docusaurus/Nextra
// `{#anchor}` from captured heading text, mirroring the production predicate
// but RE-DERIVED here (not imported). `## H ##` → "H"; `## C {#cfg}` → "C".
// The closing `#`-run must be preceded by whitespace (or be the whole text);
// `foo###` (no preceding space) keeps its hashes per CommonMark.
function normalizeHeadingText(text: string): string {
  let t = text.trim();
  // Order matters and MUST match production stripHeadingText: strip the
  // trailing closing-`#` sequence FIRST, then the `{#anchor}`. Reversing the
  // order disagrees on `## X {#a} ##` (anchor-first leaves "X {#a}", hash-first
  // yields "X"), which would make I3-soundness falsely fail against correct
  // production output.
  // Trailing closing-`#` sequence, only when preceded by a space/tab. Uses
  // `[ \t]` (NOT `\s`) to mirror production stripHeadingText EXACTLY — a `\s`
  // class would also match exotic whitespace (\f, \v, unicode spaces) before a
  // closing `#`/{#anchor}, diverging from production on those inputs.
  t = t.replace(/(^|[ \t])#+[ \t]*$/, "$1").trimEnd();
  // Trailing {#anchor} (optionally followed by spaces/tabs) — docs convention.
  t = t.replace(/[ \t]*\{#[^}]*\}[ \t]*$/, "").trimEnd();
  return t;
}

// Independent oracle: compute the set of REAL ATX heading texts in a document,
// masking fenced code blocks. A heading is a 0–3-space-indented line of 1–6
// `#`, then ≥1 space/tab, then non-space text (4+ leading spaces is a code
// line, not a heading). An UNCLOSED opening fence runs to END OF INPUT, so a
// `#`-line after it is NOT a heading. Captured text has its closing-`#` run
// and `{#anchor}` stripped. Drives both the I3 heading check and I4 title.
//
// Returns the distinct heading texts; a heading whose text normalizes to the
// empty string (e.g. `## ##`) is not reported.
function realHeadings(doc: string): Set<string> {
  // Group 1: the hash run; group 2: the heading text, starting at its first
  // non-space character with trailing whitespace excluded.
  const atxHeading = /^ {0,3}(#{1,6})[ \t]+(\S.*?)\s*$/;
  const out = new Set<string>();
  let fence: { char: string; len: number } | null = null;
  for (const line of doc.split("\n")) {
    if (fence) {
      // Inside a fence: only a same-char run of length ≥ opener closes it.
      if (isFenceClose(line, fence)) fence = null;
      continue;
    }
    const open = matchFenceOpen(line);
    if (open) {
      fence = open;
      continue;
    }
    const m = line.match(atxHeading);
    if (m) {
      const text = normalizeHeadingText(m[2] ?? "");
      if (text) out.add(text);
    }
  }
  return out;
}

// Independent oracle: is the fence state balanced (every opened fence closed)
// at the END of `text`? A chunk whose code fence is half-open means a fenced
// block was severed across the chunk boundary. Re-derived line-by-line, not
// via segmentCodeBlocks; honors 0–3-space-indented fences and ≥-length close.
+ function fenceBalanced(text: string): boolean { + let fence: { char: string; len: number } | null = null; + for (const line of text.split("\n")) { + if (fence) { + if (isFenceClose(line, fence)) fence = null; + } else { + const open = matchFenceOpen(line); + if (open) fence = open; + } + } + return fence === null; + } + + // Independent oracle: extract every fenced code block's INTERIOR content + // (the lines BETWEEN the opening and closing fence, joined by "\n"), masking + // by the same CommonMark fence rules. Used by I6 to assert fenced content + // survives stripMdx/chunking verbatim. An unclosed fence runs to EOF. + function fencedInteriors(doc: string): string[] { + const blocks: string[] = []; + let fence: { char: string; len: number } | null = null; + let body: string[] = []; + for (const line of doc.split("\n")) { + if (fence) { + if (isFenceClose(line, fence)) { + blocks.push(body.join("\n")); + fence = null; + body = []; + } else { + body.push(line); + } + } else { + const open = matchFenceOpen(line); + if (open) { + fence = open; + body = []; + } + } + } + // Unclosed fence: its accumulated interior still counts (runs to EOF). + if (fence) blocks.push(body.join("\n")); + return blocks; + } + + interface Case { + name: string; + doc: string; + // Real headings that MUST be captured (have body after them). Optional — + // omit for cases that only assert "no fake heading / fence break leaks in". + mustCapture?: string[]; + // Headings that MUST OPEN a chunk (I5 split-boundary completeness). Only the + // levels recursiveSplit cuts on (h2/h3) are split boundaries, and only when + // the section is large enough to force a split — so this is an explicit, + // per-case opt-in (NOT "every heading"): an h4+ heading or a small section + // legitimately stays fused. Populated for the separator-bug cases (TAB / + // indent) where a boundary that production wrongly dropped must reappear. 
+ mustOpenChunk?: string[]; + // Substrings that MUST survive verbatim in some chunk (I6 reinforcement for + // fenced code / inline spans that stripMdx must not gut). Beyond the generic + // fencedInteriors() I6 check, these pin specific tokens (imports, JSX) that + // the over-broad strip passes historically destroyed. + mustPreserveVerbatim?: string[]; + // Substrings that MUST NOT be a heading-path entry / title in any chunk + // (fence-interior `#`-lines, prose that looked import-like, etc.). + mustNotBeHeading?: string[]; + // Expected derived title (frontmatter-free), when the case pins it (e.g. + // closing-`#`/anchor stripping). Omit to use the generic I4 soundness check. + expectTitle?: string; + } + + const PAD = "Filler sentence. ".repeat(120); // forces a split per section + // An oversized (> 2400-char) GAPLESS fenced block (no blank lines inside) that + // also contains a `#`-comment line — the F1 line-split-fallback bug shreds + // this across chunks (severing the fence, collapsing internal newlines) unless + // the fallback treats the code block as one atomic unit. + const OVERSIZED_GAPLESS_CODE = [ + "```python", + "# configure_client is a comment, not a heading", + Array.from( + { length: 120 }, + (_, i) => `setting_${i} = value_${i} * ${i}`, + ).join("\n"), + "```", + ].join("\n"); + + const cases: Case[] = [ + { + name: "oversized gapless fenced block with an inner `#`-comment line", + doc: [ + "## Sigma", + "", + OVERSIZED_GAPLESS_CODE, + "", + "Body after the block.", + ].join("\n"), + mustCapture: ["Sigma"], + }, + { + name: "doc STARTS with a fenced block whose first line is `# something`", + doc: [ + "```md", + "# something that only looks like a heading", + "more example markdown", + "```", + "", + "Prose body following the leading fenced block.", + ].join("\n"), + // No real heading owns body here — the only `#`-line is inside the fence, + // so the title must fall back to the filename (I4), not the in-fence line. 
+ }, + { + name: "`#`-line inside a tilde (~~~) fence is not a heading", + doc: [ + "## Eta", + "", + PAD, + "", + "~~~", + "# Tilde-fenced comment, not a heading", + "~~~", + "", + "Body after the tilde block.", + ].join("\n"), + mustCapture: ["Eta"], + }, + { + name: "1-3-space-indented ATX heading is captured", + doc: [ + " ### Indented Three Spaces", + "", + "Body under the indented heading here.", + ].join("\n"), + mustCapture: ["Indented Three Spaces"], + }, + { + name: "indented headings deep enough to force a split are all captured", + doc: [ + " ## Two Space Heading", + "", + PAD, + "", + " ## Another Indented Heading", + "", + PAD, + ].join("\n"), + mustCapture: ["Two Space Heading", "Another Indented Heading"], + mustOpenChunk: ["Another Indented Heading"], + }, + { + name: "bare `##`-only line does not inject the next line", + doc: [ + "## Alpha", + "", + PAD, + "", + "##", + "Body after bare hashes.", + "", + PAD, + "", + "## Beta", + "", + "Beta body.", + ].join("\n"), + mustCapture: ["Alpha", "Beta"], + }, + { + name: "`##` with only trailing spaces is not a heading", + doc: [ + "## Gamma", + "", + PAD, + "", + "## ", + "Body after spaces-only hashes.", + "", + PAD, + "", + "## Delta", + "", + "Delta body.", + ].join("\n"), + mustCapture: ["Gamma", "Delta"], + }, + { + name: "`## Heading` directly after a closing backtick fence (single newline)", + doc: [ + "## Epsilon", + "", + PAD, + "", + "```js", + "const x = 1;", + "```", + "## After Backtick Fence", + "", + "Body under the post-fence heading.", + ].join("\n"), + mustCapture: ["Epsilon", "After Backtick Fence"], + }, + { + // The Bug-1 repro shape: a fenced code block that ENDS a section, then a + // blank line, then the next heading. With the production-default overlap + // (overlap_tokens > 0), the section's chunk ENDS with the lone closing + // fence delimiter line, and applyOverlap prepends that delimiter to the + // FOLLOWING chunk — opening it with a fence that never closes (half-open). 
+ // Under overlap_tokens: 0 this case is benign; the parametrized run at the + // production default (50) is what makes it bite. The filler is sized so the + // fence lands at a chunk boundary at target_tokens: 100. + name: "fenced block ENDS a section, immediately followed by a heading (overlap repro)", + doc: [ + "## Kappa", + "", + "Word ".repeat(100).trim(), + "", + "```js", + "const a = 1;", + "const b = 2;", + "const c = 3;", + "```", + "", + "## Lambda", + "", + "Word ".repeat(60).trim(), + ].join("\n"), + mustCapture: ["Kappa", "Lambda"], + }, + { + name: "`#`-line inside a backtick fence is not a heading", + doc: [ + "## Zeta", + "", + PAD, + "", + "```sh", + "# This is a shell comment, not a heading", + "## Neither is this", + "```", + "", + "Body after the fenced block.", + ].join("\n"), + mustCapture: ["Zeta"], + }, + { + name: "4-space-indented `#`-line is not a heading", + doc: [ + "## Theta", + "", + PAD, + "", + " # Indented four spaces — a code line, not a heading", + "", + "Body after the indented line.", + ].join("\n"), + mustCapture: ["Theta"], + }, + { + name: "deeply nested `#`/`##`/`###` hierarchy", + doc: [ + "# Top", + "", + PAD, + "", + "## Middle", + "", + PAD, + "", + "### Leaf", + "", + "Leaf body content here.", + ].join("\n"), + mustCapture: ["Top", "Middle", "Leaf"], + }, + { + name: "duplicate body text under different headings binds the right path", + doc: [ + "## Common", + "", + "A shared block of prose appearing more than once. ".repeat(3).trim(), + "", + "## Outer", + "", + "Repeated paragraph bound to its own section. ".repeat(2).trim(), + "", + "#### Inner", + "", + PAD, + "", + "Repeated paragraph bound to its own section. ".repeat(2).trim(), + ].join("\n"), + mustCapture: ["Common", "Outer", "Inner"], + }, + { + // A legitimately nested heading whose text equals its parent's: `# Setup` + // containing `## Setup`. 
The body under the inner heading correctly binds + // headingPath ["Setup", "Setup"] — two consecutive same-text entries that + // are a REAL ancestor chain, not a fabricated repeat. The I3 soundness + // check must ACCEPT this (it cannot distinguish legit same-text nesting + // from a fabricated duplicate, so it must not reject consecutive repeats). + name: "same-named nested heading binds a same-text ancestor chain", + doc: [ + "# Setup", + "", + PAD, + "", + "## Setup", + "", + "Inner setup body content here.", + ].join("\n"), + mustCapture: ["Setup"], + }, + { + name: "chunk boundary landing mid-fence (large code block)", + doc: [ + "## Iota", + "", + "```js", + // A code block far larger than targetChars so the splitter is forced to + // cut inside it — no `#`-line in here may ever surface as a heading and + // the fence must never be severed. + Array.from( + { length: 80 }, + (_, i) => `// line ${i} # not a heading inside code`, + ).join("\n"), + "```", + "", + "Body after the oversized fenced block.", + ].join("\n"), + mustCapture: ["Iota"], + }, + { + name: "normal prose with no tricky structure", + doc: [ + "# Plain Title", + "", + "Just some ordinary prose with no fences or odd headings. " + .repeat(5) + .trim(), + ].join("\n"), + mustCapture: ["Plain Title"], + }, + { + // stripMdx CRITICAL: a fenced ```tsx block containing an `import` and a + // self-closing JSX tag whose attribute value contains a `>` must pass + // through VERBATIM — the import/JSX strip passes must be masked inside + // fences. Historically stripMdx ran its regexes over the whole body first, + // gutting this highest-value retrieval content. 
+ name: "fenced tsx block with import + JSX (attr contains `>`) survives verbatim", + doc: [ + "## Usage", + "", + "Render the component like so:", + "", + "```tsx", + "import { Widget } from 'my-lib';", + 'import "./styles.css";', + 'export const App = () => ;', + "```", + "", + "Done.", + ].join("\n"), + mustCapture: ["Usage"], + mustPreserveVerbatim: [ + "import { Widget } from 'my-lib';", + 'import "./styles.css";', + 'export const App = () => ;', + ], + }, + { + // An UNCLOSED opening fence runs to END OF INPUT: every line after it + // (including a `#`-line) is code, not a heading. The chunker must not + // inject the in-fence `#`-line as a fake heading, and must not sever the + // (unclosed) fenced region. The oracle treats the fence as open to EOF. + name: "unclosed fence runs to EOF; inner `#`-line is not a heading", + doc: [ + "## Mu", + "", + "Intro prose before the unterminated fence block here.", + "", + "```js", + "// no closing fence below — everything to EOF is code", + "# this is not a heading, it is inside the open fence", + "const x = 1;", + ].join("\n"), + mustCapture: ["Mu"], + mustNotBeHeading: ["this is not a heading, it is inside the open fence"], + }, + { + // A LONGER closing fence (CommonMark allows the close to be ≥ the opener): + // a ```` (4-backtick) opener closed by a ````` (5-backtick) line. The + // chunker must recognize this close (not run the fence to EOF) and keep the + // following `## After Long Fence` heading as a real boundary/heading. 
+ name: "longer closing fence (4-backtick open, 5-backtick close)", + doc: [ + "## Nu", + "", + PAD, + "", + "````md", + "```js still inside the outer fence```", + "# inner line is not a heading", + "`````", + "", + "## After Long Fence", + "", + "Body under the post-long-fence heading.", + ].join("\n"), + mustCapture: ["Nu", "After Long Fence"], + mustOpenChunk: ["After Long Fence"], + mustNotBeHeading: ["inner line is not a heading"], + }, + { + // TAB-separated heading as the ONLY split boundary: `##\tHeading` uses a + // TAB (not a space) after the hashes. CommonMark accepts a tab separator, + // so it is a real heading AND must be a split boundary (splitOnHeading + // historically required a literal space). No blank lines, so paragraph + // splitting cannot pre-empt; the tab heading is the sole boundary. + name: "TAB-separated heading is a real boundary (splitOnHeading)", + // No blank lines, so paragraph splitting yields a single part and cannot + // pre-empt the heading split — the TAB-separated `##\t…` heading is the + // SOLE available boundary. Each section is sized well past targetChars + // (target_tokens:100 ⇒ 400 chars) so a split is actually forced; a + // column-0-and-space-only splitter leaves the whole doc fused and both + // completeness (I3) and boundary (I5) fail. + doc: [ + "##\tFirst Tab Section", + ...Array.from( + { length: 16 }, + (_, i) => + `Alpha line ${i} carries enough content to grow the section.`, + ), + "##\tSecond Tab Section", + ...Array.from( + { length: 16 }, + (_, i) => + `Beta line ${i} carries enough content to grow the section.`, + ), + ].join("\n"), + mustCapture: ["First Tab Section", "Second Tab Section"], + mustOpenChunk: ["Second Tab Section"], + }, + { + // 0–3-space INDENTED oversized fence: an indented (2-space) ``` fence that + // is far larger than targetChars. 
The fence must be recognized despite the + // indent (segmentCodeBlocks historically required column 0), kept atomic + // (not severed), and its inner `#`-line must not become a heading. + name: "indented (2-space) oversized fence kept atomic, inner `#` not a heading", + doc: [ + "## Xi", + "", + " ```python", + " # indented-fence comment is not a heading", + ...Array.from({ length: 80 }, (_, i) => ` value_${i} = ${i} * 2`), + " ```", + "", + "Body after the indented fenced block.", + ].join("\n"), + mustCapture: ["Xi"], + mustNotBeHeading: ["indented-fence comment is not a heading"], + }, + { + // Heading-text precision: a closing-`#` sequence (`## Heading ##`) and a + // docs `{#anchor}` (`## Config {#configuration}`) must be stripped from the + // captured heading text (title + headingPath). The oracle independently + // strips them, so production must agree. + name: "closing-`#` sequence and `{#anchor}` are stripped from heading text", + doc: [ + "## Heading ##", + "", + PAD, + "", + "## Config {#configuration}", + "", + "Body under the anchored heading here.", + ].join("\n"), + mustCapture: ["Heading", "Config"], + mustNotBeHeading: [ + "Heading ##", + "Config {#configuration}", + "{#configuration}", + ], + }, + { + // Title-precision variant: the document's FIRST heading carries a closing + // `#` sequence; the derived title must be the stripped text "Welcome", not + // "Welcome #". Pins extractFirstHeading's use of the shared predicate. + name: "title from a heading with a trailing closing-`#` is stripped", + doc: [ + "# Welcome #", + "", + "Some introductory prose under the welcome heading.", + ].join("\n"), + mustCapture: ["Welcome"], + expectTitle: "Welcome", + }, + { + // stripMdx side-effect-import bug: a side-effect import (`import "./x.css";` + // with no `from`) before a heading must NOT cause the over-broad lazy + // `[\s\S]*?from` to swallow everything up to the NEXT import's `from`, + // deleting the heading + prose in between. 
Heading + prose must survive. + name: "side-effect import before a heading does not delete the heading/prose", + doc: [ + 'import "./globals.css";', + "", + "## Configuration", + "", + "Prose that must survive the side-effect import strip.", + "", + "import { Helper } from '@/components/Helper';", + "", + "More prose after the second (from-)import.", + ].join("\n"), + mustCapture: ["Configuration"], + mustPreserveVerbatim: [ + "Prose that must survive the side-effect import strip.", + "More prose after the second (from-)import.", + ], + }, + { + // stripMdx prose-import bug: an ordinary English sentence that happens to + // start with "import" and contain " from " ("import a value from ...") is + // PROSE, not an MDX import statement, and must NOT be deleted. The import + // regex must match a single logical import statement, not arbitrary prose. + name: "prose line that looks import-like is not deleted", + doc: [ + "## Notes", + "", + 'You can import a value from "the library" in your own code.', + "", + "And the rest of the prose continues normally afterwards.", + ].join("\n"), + mustCapture: ["Notes"], + mustPreserveVerbatim: [ + 'You can import a value from "the library" in your own code.', + ], + }, + { + // Inline code span: `
x
` inside single backticks is inline code + // and must survive verbatim — the JSX strip must not run inside inline + // spans. Secondary to fenced masking but in scope. + name: "inline code span with JSX-looking content survives verbatim", + doc: [ + "## Inline", + "", + "Use the `
x
` element, and also `import X from 'y'` inline.", + "", + "Trailing prose.", + ].join("\n"), + mustCapture: ["Inline"], + mustPreserveVerbatim: ["`
x
`", "`import X from 'y'`"], + }, + { + // Bug 1: a col-0 line ```js``` text is an INLINE code span (CommonMark + // §4.5: a backtick fence info string may not contain a backtick), NOT a + // fence opener. The buggy production opened a phantom fence that ran to + // EOF, masking the SECOND heading as in-fence "code" and dropping it. The + // second heading must be captured and no half-open fence may leak. + name: "col-0 inline triple-backtick span is not a fence (later heading survives)", + doc: [ + "## Alpha", + "", + PAD, + "", + "```js``` quick inline example, prose after.", + "", + "## Beta", + "", + PAD, + ].join("\n"), + mustCapture: ["Alpha", "Beta"], + mustOpenChunk: ["Beta"], + }, + { + // Bug 5: a heading carrying BOTH a `{#anchor}` and a trailing closing-`#` + // sequence (`## X {#a} ##`). Production stripHeadingText and the oracle + // normalizeHeadingText must agree on the stripped text "X" (both strip the + // closing-`#` run FIRST, then the anchor). Before the alignment the oracle + // produced "X {#a}" and disagreed with production's "X". + name: "heading with both `{#anchor}` and trailing closing-`#` strips to bare text", + doc: [ + "## X {#a} ##", + "", + "Body under the anchored, hash-closed heading here.", + ].join("\n"), + mustCapture: ["X"], + mustNotBeHeading: ["X {#a}", "X {#a} ##", "{#a}"], + }, + { + // Bug-S6: a doc whose FIRST content line is a 4-space-indented fence + // (` ```lang`). CommonMark treats 4+ leading spaces as INDENTED CODE, so + // this is NOT a column-0 fence opener — it must not be promoted to a + // half-open phantom fence in the SERVED chunk text (the final chunk trim + // historically stripped the doc's leading indent, turning ` ```lang` + // into a column-0 ` ```lang ` whose still-indented closing ` ``` ` no + // longer closes it). The heading after the indented block must still be a + // real captured boundary. PAD forces a split so `## Heading…` opens its own + // chunk and enters that chunk's headingPath. 
+ name: "doc-start 4-space-indented fence is not a column-0 fence (heading still captured)", + doc: [ + " ```lang", + " indented code-ish content here", + " ```", + "", + "## Heading After Indented Block", + "", + PAD, + ].join("\n"), + mustCapture: ["Heading After Indented Block"], + mustOpenChunk: ["Heading After Indented Block"], + }, + ]; + + // Run the WHOLE corpus under both overlap settings: 0 (overlap disabled) AND + // the production default of 50 (overlap_tokens default × 4 = 200 overlap + // chars). Earlier rounds only ever exercised the corpus at overlap_tokens: 0, + // so applyOverlap was never run against fences — a fenced block ending a + // section had its lone closing-fence line prepended onto the next chunk by + // overlap, injecting a half-open fence that the I1 oracle catches only when + // overlap is actually applied. Parametrizing over both keeps the soundness / + // completeness / fidelity checks honest on the real production path too. + const OVERLAP_SETTINGS = [0, 50]; + + // Run the WHOLE corpus under BOTH line-ending conventions: the canonical LF + // form AND a `\r\n`-joined CRLF variant (Windows / core.autocrlf authoring). + // CRLF historically broke EVERY heading + fence detector ($/. do not match + // `\r`, and the chunker split on `\n` only), so a CRLF doc degraded to + // title=filename and headingPath=[] on every chunk and any fence ran to EOF. + // chunkMarkdown must normalize `\r\n` → `\n` up front, after which the CRLF + // variant must satisfy the identical structural invariants. The oracle is + // always computed on the normalized (LF) doc — it models the post-normalization + // content — while chunkMarkdown receives the raw (possibly CRLF) variant. 
// Line-ending variants applied to each corpus doc before it is chunked: the
// identity transform (LF) and one that rewrites every "\n" to "\r\n" (CRLF).
const LINE_ENDINGS: Array<{ label: string; apply: (doc: string) => string }> =
  [
    { label: "LF", apply: (doc) => doc },
    { label: "CRLF", apply: (doc) => doc.replace(/\n/g, "\r\n") },
  ];

// Register one test per (overlap setting × line ending × corpus case); the
// test name carries all three so a failure pinpoints the exact combination.
for (const overlap of OVERLAP_SETTINGS) {
  for (const eol of LINE_ENDINGS) {
    for (const tc of cases) {
      it(`invariant holds (overlap_tokens=${overlap}, ${eol.label}): ${tc.name}`, () => {
        // Oracle facts come from the NORMALIZED (LF) doc — chunkMarkdown
        // normalizes CRLF → LF before any detection, so the expected headings,
        // fence balance, and interiors are those of the LF form.
        const allowed = realHeadings(tc.doc);

        // Capture any degradation warning the chunker emits (I2): the
        // heading-path lookup only fails when a chunk's raw text is NOT a
        // verbatim substring of the cleaned body, so a fired warning is a
        // direct verbatim-fidelity break.
        const warnings: string[] = [];
        // console.warn is swapped for a collector only for the duration of the
        // chunkMarkdown call; the `finally` below restores the original even
        // when the call throws, so later tests see the real console.warn.
        const originalWarn = console.warn;
        console.warn = (...args: unknown[]) => {
          warnings.push(args.join(" "));
        };
        let chunks;
        try {
          // The chunker receives the RAW variant (possibly CRLF); every oracle
          // in this test is computed from tc.doc (LF).
          chunks = chunkMarkdown(
            eol.apply(tc.doc),
            "test.md",
            mkConfig({ target_tokens: 100, overlap_tokens: overlap }),
          );
        } finally {
          console.warn = originalWarn;
        }
        expect(chunks.length).toBeGreaterThan(0);

        // (I1) FENCE INTEGRITY: when the SOURCE doc has balanced fences, every
        //      chunk must too (no half-open fence ⇒ no fenced block severed
        //      across a chunk boundary, and no overlap-injected lone fence
        //      delimiter opening a chunk). A doc with a deliberately UNCLOSED
        //      fence (runs to EOF) legitimately yields one trailing unbalanced
        //      chunk — that is the correct atomic behavior, not a severed
        //      block — so this per-chunk balance check is skipped there; that
        //      case is guarded instead by I3 soundness, mustNotBeHeading, I6.
        if (fenceBalanced(tc.doc)) {
          for (const chunk of chunks) {
            expect(fenceBalanced(chunk.content)).toBe(true);
          }
        }

        // (I2) VERBATIM FIDELITY: no "heading-path lookup failed" warning
        //      fired, i.e. every raw chunk remained a verbatim substring of
        //      the body.
        const degraded = warnings.filter((w) =>
          w.includes("heading-path lookup failed"),
        );
        expect(degraded).toEqual([]);

        // (I3) SOUNDNESS: no chunk's headingPath contains anything that is not
        //      a real ATX heading outside fences. This catches fence-leak,
        //      bare-`##`, whitespace-only, and indented-`#` fabrications.
        // `seen` accumulates every heading-path entry observed across all
        // chunks; the completeness and mustNotBeHeading checks below read it.
        const seen = new Set();
        for (const chunk of chunks) {
          const path = chunk.headingPath ?? [];
          for (const h of path) {
            expect(h).not.toBe("");
            expect(allowed.has(h)).toBe(true);
            seen.add(h);
          }
          // NOTE: we intentionally do NOT assert path[i] !== path[i-1]. A
          // legitimately nested heading whose text equals its parent's (e.g.
          // `# Setup` containing `## Setup`) yields a REAL ancestor chain
          // ["Setup", "Setup"]; a consecutive-duplicate check cannot tell that
          // legit same-text nesting apart from a fabricated repeat, so it would
          // false-FAIL on correct output. Soundness is already enforced by the
          // allowed-set membership check above.
        }

        // (I3) COMPLETENESS: every real heading that owns body content is
        //      captured by at least one chunk (none silently missed).
        for (const h of tc.mustCapture ?? []) {
          expect(allowed.has(h)).toBe(true); // oracle self-check
          expect(seen.has(h)).toBe(true);
        }

        // (I4) TITLE SOUNDNESS: the derived title (with no frontmatter, this
        //      is extractFirstHeading's result or the filename fallback) is
        //      either a real ATX heading outside fences or the filename —
        //      never an in-fence `#`-comment line, and never empty/undefined.
        const title = chunks[0].title;
        expect(typeof title).toBe("string");
        expect(allowed.has(title ?? "") || title === "test.md").toBe(true);
        if (tc.expectTitle !== undefined) {
          expect(title).toBe(tc.expectTitle);
        }

        // (I5) SPLIT-BOUNDARY COMPLETENESS: each heading named in
        //      mustOpenChunk OPENS some chunk (it is a clean section boundary,
        //      not fused mid-chunk). A chunk "opens with" a heading when,
        //      after trimming leading blank lines, its first non-empty line is
        //      that heading (modulo CommonMark indent / closing-`#` /
        //      `{#anchor}`). This catches the splitOnHeading separator bug: a
        //      TAB- or indent-separated heading that feeds the path but is NOT
        //      used as a boundary stays fused inside a chunk and fails here.
        //
        //      Overlap prepends a cleaned tail line to the FOLLOWING chunk,
        //      which can push the opening heading off line 1 of the stored
        //      content, so this boundary check runs only at overlap=0 (the
        //      boundary is unambiguous there). I3-completeness already guards
        //      capture under both overlap settings.
        if (overlap === 0) {
          // True when the first non-blank line of `chunk` is an ATX heading
          // (0–3 leading spaces, 1–6 hashes, space/tab separator) whose
          // normalized text equals `heading`.
          const opensWith = (chunk: string, heading: string): boolean => {
            const lines = chunk.split("\n");
            let i = 0;
            while (i < lines.length && lines[i].trim() === "") i++;
            if (i >= lines.length) return false;
            const m = lines[i].match(/^ {0,3}(#{1,6})[ \t]+(\S.*?)\s*$/);
            if (!m) return false;
            return normalizeHeadingText(m[2]) === heading;
          };
          for (const h of tc.mustOpenChunk ?? []) {
            expect(allowed.has(h)).toBe(true); // oracle self-check
            const opened = chunks.some((c) => opensWith(c.content, h));
            expect(opened, `heading "${h}" should open some chunk`).toBe(
              true,
            );
          }
        }

        // (I6) FENCED-CONTENT PRESERVATION: for each fenced code block in the
        //      input, its exact interior (imports, JSX, inline-looking text)
        //      appears VERBATIM in some chunk's output. This is the direct
        //      guard on the stripMdx CRITICAL: the import/JSX strip passes
        //      must be masked inside fences so code survives untouched. We
        //      assert each non-empty interior LINE is a substring of some
        //      chunk (line-level keeps the check robust to atomic-block
        //      re-joining while still proving no in-fence line was deleted or
        //      rewritten). Chunk content is always normalized to LF, so the
        //      LF-derived interior lines match under the CRLF variant too.
        for (const interior of fencedInteriors(tc.doc)) {
          for (const line of interior.split("\n")) {
            if (line.trim() === "") continue;
            const present = chunks.some((c) => c.content.includes(line));
            expect(present, `fenced line not preserved: ${line}`).toBe(true);
          }
        }

        // Per-case reinforcement: explicit verbatim tokens (imports / JSX /
        // inline spans) the over-broad strip historically destroyed.
        for (const frag of tc.mustPreserveVerbatim ?? []) {
          const present = chunks.some((c) => c.content.includes(frag));
          expect(present, `must preserve verbatim: ${frag}`).toBe(true);
        }

        // Per-case reinforcement: strings that must NEVER be a heading-path
        // entry or the title (fence-interior `#`-lines, un-stripped
        // closing-`#`/anchor text, import-looking prose, etc.).
        for (const notHeading of tc.mustNotBeHeading ?? []) {
          expect(seen.has(notHeading)).toBe(false);
          expect(title).not.toBe(notHeading);
        }
      });
    }
  }
}
// Closes a callback that was opened before this excerpt begins.
});

// ── Regression LEVERS: invariant property + timing guards ───────────────────
//
// These three levers exist to make four load-bearing bugs UNREINTRODUCIBLE:
//   1. ReDoS in the MDX JSX-strip regexes (catastrophic backtracking on a
//      prop-heavy paired/self-closing tag) + the `{expr > val}` strip gap.
//   2. The multi-line overlap branch prepending an unbalanced inline-code tail.
//   3. A side-effect `import "x";` whose `\s+` spanned a newline (content loss).
//   4. The line-split fallback severing an inline code span across a soft break.
// Each lever is RED at the pre-fix HEAD and GREEN once the four fixes land.
describe("chunkMarkdown regression levers", () => {
  // ── Independent oracles (NOT imported from production) ────────────────────
  //
  // Every helper below re-derives the CommonMark fence / inline-code-span rules
  // locally, so a production bug cannot hide behind a shared predicate.

  // Recognize a fence OPENER: up to three leading spaces, a run of three or
  // more backticks or tildes, then an optional info string. Returns the fence
  // character and the run length, or null when the line opens no fence.
  function fenceOpenLen(line: string): { char: string; len: number } | null {
    const m = line.match(/^ {0,3}(`{3,}|~{3,})([^\n]*)$/);
    if (!m) return null;
    const [, run = "", info = ""] = m;
    const char = run.charAt(0);
    // CommonMark §4.5: the info string of a BACKTICK fence may not contain a
    // backtick (such a line is an inline code span, not a fence opener). The
    // spec places no such restriction on tilde fences — their info strings may
    // contain backticks and tildes — so only the backtick case is rejected.
    // NOTE(review): the previous version also rejected a tilde fence whose info
    // string contains a tilde and described that as mirroring production
    // matchFenceOpen — confirm production follows the spec here as well.
    if (char === "`" && info.includes("`")) return null;
    return { char, len: run.length };
  }

  // Does `line` CLOSE the fence described by `open`? A closer is up to three
  // leading spaces, a run of the SAME fence character at least as long as the
  // opener, then only spaces/tabs. Any `open.char` other than a backtick is
  // treated as a tilde fence. Uses one static pattern instead of compiling a
  // RegExp per scanned line; assumes open.len >= 1 (fenceOpenLen always yields
  // a length of at least 3).
  function fenceCloses(
    line: string,
    open: { char: string; len: number },
  ): boolean {
    const m = line.match(/^ {0,3}(`+|~+)[ \t]*$/);
    if (!m) return false;
    const run = m[1] ?? "";
    const fc = open.char === "`" ? "`" : "~";
    return run.charAt(0) === fc && run.length >= open.len;
  }

  // Shared fence walk for the three oracles below. Returns the input's lines
  // with every fence-delimiter line and every in-fence line blanked to ""
  // (their backticks are fence delimiters / verbatim code, not inline spans),
  // plus whether a fence was still open when the text ended.
  function maskFencedRegions(text: string): {
    lines: string[];
    unclosed: boolean;
  } {
    let open: { char: string; len: number } | null = null;
    const lines: string[] = [];
    for (const line of text.split("\n")) {
      if (open !== null) {
        if (fenceCloses(line, open)) open = null;
        lines.push(""); // in-fence content or the closing delimiter
        continue;
      }
      const opener = fenceOpenLen(line);
      if (opener) {
        open = opener;
        lines.push(""); // opening delimiter
        continue;
      }
      lines.push(line);
    }
    return { lines, unclosed: open !== null };
  }

  // Parity oracle: count inline-code backticks that fall OUTSIDE fenced-code
  // regions. A balanced source inline span (`` `x` `` — two backticks) must
  // never be SEVERED by splitting/overlap such that one chunk carries an odd
  // number of backticks in its served text (i.e. opens a span it never
  // closes).
  function inlineBacktickCountOutsideFences(text: string): number {
    let count = 0;
    for (const line of maskFencedRegions(text).lines) {
      for (const ch of line) if (ch === "`") count++;
    }
    return count;
  }

  // Exact-run oracle: is EVERY inline code span OUTSIDE fenced regions closed
  // by EOF, pairing backtick runs by EXACT length? The parity oracle above only
  // catches a SINGLE-backtick imbalance — a double-backtick span severed into
  // two chunks leaves each chunk with 2 backticks (even parity) yet one chunk
  // has a dangling opener and the other a dangling closer. CommonMark §6.1: a
  // run of N backticks opens a span closed only by a run of EXACTLY N; a run of
  // a different length inside an open span is literal content.
  //
  // CommonMark's blank-line/heading span boundaries are deliberately NOT
  // honored: this oracle's job is to detect a DELIMITER left dangling in served
  // chunk text, and the strictest "is there any unclosed run" check is the one
  // that flags a severed multi-backtick span. Returns true when balanced.
  function inlineCodeBalancedExactRun(text: string): boolean {
    const masked = maskFencedRegions(text).lines.join("\n");
    // Lengths of the maximal backtick runs, in document order. A run never
    // crosses a line break, so matching over the joined text is exact.
    const runs = (masked.match(/`+/g) ?? []).map((run) => run.length);
    let i = 0;
    while (i < runs.length) {
      const want = runs[i];
      // The first later run of the same length closes this opener; runs of
      // other lengths in between are literal content of the span.
      let j = i + 1;
      while (j < runs.length && runs[j] !== want) j++;
      if (j === runs.length) return false; // opener with no exact-length closer
      i = j + 1;
    }
    return true;
  }

  // Fence-balance oracle: is every opened fence closed by the END of `text`? A
  // chunk whose code fence is half-open means a fenced block was severed
  // across the chunk boundary.
  function fenceBalancedOracle(text: string): boolean {
    return !maskFencedRegions(text).unclosed;
  }

  // ── LEVER 1: inline-backtick balance is preserved across split + overlap ──
  //
  // Corpus of docs whose SOURCE inline code spans are all balanced (even
  // backticks). For each doc × overlap_tokens{0,50} × line-ending{LF,CRLF},
  // every produced chunk must carry an EVEN number of inline backticks OUTSIDE
  // fenced regions — no chunk may open an inline span it never closes in its
  // served text. The corpus MUST include the S4 (overlap) and S7 (line-split)
  // shapes; both sever a complete source span at the pre-fix HEAD.
  //
  // S4 — a soft-wrapped (single-newline-joined, no blank-line) paragraph large
  // enough to split, with a COMPLETE inline span in the MIDDLE.
At + // overlap_tokens=50 the multi-line overlap branch retains whole lines from a + // window that BEGINS inside the span, prepending its lone CLOSING backtick to + // the next chunk (Bug 2). At overlap_tokens=0 the span stays atomic. + const SPAN_S4 = [ + "## Intro", + "", + ...Array.from({ length: 60 }, (_, i) => + i === 30 + ? "Open the span here `SPAN_OPEN_MARKER and it keeps going on this line with words." + : i === 31 + ? "Still inside the span on this line with more words before it eventually closes up." + : i === 32 + ? "Finally the span SPAN_CLOSE_MARKER` closes right here and then prose continues fine." + : `Soft wrapped line ${i} carries prose to grow this paragraph along nicely here ok.`, + ), + ].join("\n"); + + // S7 — an oversized (> targetChars), GAPLESS (no blank lines) block with a + // COMPLETE inline span crossing a soft line break, whose two halves are each + // long enough that the line-split fallback's merge MUST break between them, + // landing the opening and closing backticks in adjacent chunks (Bug 4). RED at + // overlap_tokens=0 (the line-split fallback is independent of overlap). + const SPAN_HALF_A = "word ".repeat(50).trim(); + const SPAN_HALF_B = "term ".repeat(50).trim(); + const SPAN_S7 = [ + "## CodeTalk", + "", + "Lead prose line that opens the gapless block with content before the span begins okay here now.", + `Inline span begins now \`SPANOPEN ${SPAN_HALF_A}`, + `${SPAN_HALF_B} SPANCLOSE\` and then the span has closed and prose keeps going onward after it here.`, + "Trailing prose line continues the gapless block with more content after the inline span closes.", + ].join("\n"); + + // A2 — a DOUBLE-backtick inline span (`` `` … `` ``, CommonMark §6.1, used when + // the inline code itself contains a backtick) crossing a soft line break in an + // oversized GAPLESS paragraph. 
The two halves are each long enough that the + // line-split fallback's merge MUST break between them, landing the `` `` `` + // opener and the `` `` `` closer in adjacent chunks. At the pre-fix HEAD the + // grouping/guard parity decision (`backtickCount % 2`) sees TWO backticks on + // the opening line → parity EVEN immediately → the unit is flushed mid-span, so + // each chunk carries a balanced-PARITY (2) but unbalanced-RUN dangling `` `` + // delimiter. The parity oracle CANNOT see this; the exact-run oracle does. RED + // at overlap_tokens=0 (line-split is overlap-independent) AND 50. + const DBL_HALF_A = "alpha ".repeat(45).trim(); + const DBL_HALF_B = "omega ".repeat(45).trim(); + const SPAN_DBL = [ + "## DoubleTick", + "", + "Lead prose that opens the gapless block before the double-backtick span begins here for sure.", + `Here the span opens \`\`DBLOPEN ${DBL_HALF_A}`, + `${DBL_HALF_B} DBLCLOSE\`\` and now the double-backtick span has closed while prose keeps flowing on.`, + "Trailing prose line continues the gapless block with more content after the double-backtick span.", + ].join("\n"); + + // A1 — a SINGLE interior code-fence delimiter (a lone ```` ``` ````) inside a + // compact `~~~`-wrapped block (the real Markdown way to DISPLAY a fence + // delimiter: wrap it in `~~~`), followed GAPLESSLY by a trailing prose line and + // preceded gaplessly by a short intro line. The block is small enough that, at + // overlap_tokens=50 (window = 200 chars), the multi-line overlap window for the + // FOLLOWING chunk spans the WHOLE `~~~ … ``` … ~~~` block: the retained lines + // are fence-BALANCED (the `~~~` opens and closes), yet carry the interior + // ```` ``` ```` (3 backticks, ODD total) — and the window's LAST line is the + // trailing prose, NOT a fence delimiter, so guard (a) (last-line-only) misses + // it. 
At the pre-fix HEAD the FENCE-UNAWARE parity guard (b) misclassifies the + // interior ```` ``` ```` as an open inline span and DROPS up to and including + // it, severing the outer fence's opening `~~~` and leaving the closing `~~~` as + // a phantom opener → a HALF-OPEN fence in the served chunk. After the drop the + // backtick count is even, so the "if still odd" fallback never fires. The + // interior backticks are all INSIDE the `~~~` fence, so the inline-backtick + // oracles correctly see ZERO inline backticks. RED at overlap_tokens=50, + // balanced at 0. + const A1_PROSE = Array.from( + { length: 14 }, + (_, i) => + `Lead prose line ${i} fills this section out so the chunk boundary lands well.`, + ); + const SPAN_INTERIOR_FENCE = [ + "## FenceDisplay", + "", + ...A1_PROSE, + "", + "Intro line right before the tilde block here.", + "~~~md", + "Type this to close a fence:", + "```", + "done now", + "~~~", + "Tail prose right after the closing tilde fence keeps going on a gapless line here for sure.", + "", + "## NextSection", + "", + "Body prose for the next section so it owns indexable content of its own here now.", + ].join("\n"); + + const balanceCorpus: Array<{ name: string; doc: string }> = [ + { name: "S4 overlap-tail carries a lone backtick", doc: SPAN_S4 }, + { name: "S7 line-split severs an inline span", doc: SPAN_S7 }, + { + name: "A2 double-backtick span severed across a soft break", + doc: SPAN_DBL, + }, + { + name: "A1 interior ``` fence inside a ~~~ block (overlap window opens in-fence)", + doc: SPAN_INTERIOR_FENCE, + }, + { + // A complete inline span on a single short line — the simplest balanced + // shape; must remain balanced regardless of overlap / line-ending. 
+ name: "single-line complete inline span", + doc: [ + "## Simple", + "", + "Use the `inline_code` token in the middle of this prose sentence here.", + "", + "More prose to follow afterward in a second paragraph for substance.", + ].join("\n"), + }, + { + // A complete span adjacent to a fenced block: the fence's backticks must + // be excluded by the oracle, and the inline span must stay balanced. + name: "inline span next to a fenced block", + doc: [ + "## Mixed", + "", + "Inline `tok` before a fence.", + "", + "```js", + "const x = 1;", + "```", + "", + "Inline `tok2` after the fence.", + ].join("\n"), + }, + ]; + + const BALANCE_OVERLAPS = [0, 50]; + const BALANCE_EOLS: Array<{ label: string; apply: (s: string) => string }> = [ + { label: "LF", apply: (s) => s }, + { label: "CRLF", apply: (s) => s.replace(/\n/g, "\r\n") }, + ]; + + for (const overlap of BALANCE_OVERLAPS) { + for (const eol of BALANCE_EOLS) { + for (const tc of balanceCorpus) { + it(`inline-backtick balance holds (overlap_tokens=${overlap}, ${eol.label}): ${tc.name}`, () => { + // Oracle self-check: the SOURCE doc has balanced inline backticks + // (parity AND exact-run) and balanced fences, so any imbalance in a + // produced chunk is a SEVERANCE introduced by splitting/overlap, never + // pre-existing source imbalance. 
+ expect( + inlineBacktickCountOutsideFences(tc.doc) % 2, + "source corpus must have balanced inline backticks (parity)", + ).toBe(0); + expect( + inlineCodeBalancedExactRun(tc.doc), + "source corpus must have balanced inline code spans (exact run)", + ).toBe(true); + expect( + fenceBalancedOracle(tc.doc), + "source corpus must have balanced fences", + ).toBe(true); + + const chunks = chunkMarkdown( + eol.apply(tc.doc), + "test.md", + mkConfig({ target_tokens: 100, overlap_tokens: overlap }), + ); + expect(chunks.length).toBeGreaterThan(0); + for (const chunk of chunks) { + const n = inlineBacktickCountOutsideFences(chunk.content); + expect( + n % 2, + `chunk has unbalanced inline backticks parity (${n}): ${JSON.stringify( + chunk.content.slice(0, 120), + )}`, + ).toBe(0); + // A2: exact-run-length pairing catches a severed multi-backtick span + // that parity misses (each half has an even count but a dangling + // run). RED at HEAD for the double-backtick corpus shape. + expect( + inlineCodeBalancedExactRun(chunk.content), + `chunk leaves a multi-backtick delimiter unbalanced: ${JSON.stringify( + chunk.content.slice(0, 160), + )}`, + ).toBe(true); + // A1: no chunk may carry a half-open code fence (the guard-(b) drop + // must never sever a balanced fence). RED at HEAD for the interior- + // ```-fence corpus shape at overlap_tokens=50. + expect( + fenceBalancedOracle(chunk.content), + `chunk carries a half-open fence: ${JSON.stringify( + chunk.content.slice(0, 200), + )}`, + ).toBe(true); + } + }); + } + } + } + + // ── LEVER 2: ReDoS timing + `{expr > val}` strip completeness ──────────── + // + // A doc carrying BOTH a paired JSX component AND a self-closing JSX component + // each with 30–50 well-formed attributes must chunk in well under a small + // budget. 
At the pre-fix HEAD the paired regex `(?:"[^"]*"|'[^']*'|[^>])*` is + // ambiguous (a quoted attr matches BOTH alternatives), so a prop-heavy paired + // tag backtracks exponentially (~doubling per attribute: 17 attrs ≈ 0.6s, ~20 + // hangs) — far past 500ms. The MAX_STRIP_PASSES cap does NOT help: the blowup + // is inside ONE .replace(). The fix is a linear, single-pass JSX tag scanner. + it("strips prop-heavy paired AND self-closing JSX tags without catastrophic backtracking", () => { + const manyAttrs = (n: number): string => + Array.from({ length: n }, (_, i) => `attr${i}="value${i}"`).join(" "); + const paired = `inner paired content`; + const selfClosing = ``; + // Order is load-bearing for the RED proof: the self-closing tag comes FIRST + // (the self-closing pass consumes it cleanly), then the PAIRED tag with NO + // trailing `/>` after it. With the order reversed, the self-closing pass' + // global scan would greedily match from `` and + // delete the paired tag as a side effect BEFORE the paired regex runs, + // masking the hang. As written, the paired regex faces the 40-attr paired + // tag and backtracks exponentially (well past minutes) at the pre-fix HEAD. + const content = [ + "## Heavy", + "", + "Intro prose before the components.", + "", + selfClosing, + "", + paired, + "", + "Outro prose after the components.", + ].join("\n"); + + const start = Date.now(); + const chunks = chunkMarkdown(content, "test.mdx", mkConfig()); + const elapsed = Date.now() - start; + + // Linear scanner ⇒ trivially under budget. The exponential regex blows + // well past this (it hangs for minutes) on a 40-attr paired tag, so at the + // pre-fix HEAD this test fails by TIMING OUT under vitest's testTimeout + // before ever reaching the assertions below. 
+ expect( + elapsed, + `stripping prop-heavy JSX took ${elapsed}ms (catastrophic backtracking?)`, + ).toBeLessThan(500); + + const joined = chunks.map((c) => c.content).join("\n"); + // Both component tags are stripped; the paired tag's inner content is kept. + expect(joined).not.toContain("`", () => { + // ` c} />` — the `>` lives inside an unquoted JSX EXPRESSION, + // not a quoted string. At the pre-fix HEAD `[^>]*` truncates at that `>`, so + // the tag is NOT matched and SURVIVES into the served text. The linear + // scanner tracks `{…}` expression depth so the inner `>` no longer ends the + // tag, and the whole self-closing tag is stripped. + const content = [ + "Before the foo component here.", + "", + " c} />", + "", + "After the foo component here.", + ].join("\n"); + const chunks = chunkMarkdown(content, "test.mdx", mkConfig()); + const joined = chunks.map((c) => c.content).join("\n"); + expect(joined).not.toContain(" c"); + expect(joined).toContain("Before the foo component here."); + expect(joined).toContain("After the foo component here."); + }); + + // ── LEVER 3: a side-effect import spanning a newline must not eat prose ─── + // + // Mirrors the existing from-import blank-line test: a side-effect + // `import "x";` (no `from`) on its OWN line, then a blank line, then a + // quoted-string line, with prose on both sides. At the pre-fix HEAD the + // side-effect-import regex used `import\s+['"]…`, and `\s` matches a newline, + // so `import\n\n"./x.css";` was treated as ONE statement and the whole span + // (including any masked prose between, and the surrounding blank lines) was + // collapsed — destroying content. The fix is `[ \t]+`, keeping the statement + // on a single logical line. The dangling `import` / quoted-string lines (which + // are NOT a single import) and the surrounding prose must all survive. 
+ it("does not let a side-effect import strip span a blank line", () => { + const content = [ + "First real paragraph that must survive the side-effect import strip.", + "", + "import", + "", + '"./styles.css";', + "", + "Second real paragraph that must survive the side-effect import strip.", + ].join("\n"); + const chunks = chunkMarkdown(content, "test.mdx", mkConfig()); + const joined = chunks.map((c) => c.content).join("\n"); + expect(joined).toContain( + "First real paragraph that must survive the side-effect import strip.", + ); + expect(joined).toContain( + "Second real paragraph that must survive the side-effect import strip.", + ); + // The `import` / `"./styles.css";` lines span a blank line, so they are NOT + // a single side-effect import statement and must NOT be stripped — the + // over-broad `import\s+['"]…` (with `\s` matching `\n`) deleted the span. + expect(joined).toContain("import"); + expect(joined).toContain('"./styles.css";'); + }); + + it("still strips a normal single-line side-effect import", () => { + // Regression guard for the blank-line fix: a real single-line side-effect + // import must still be stripped. + const content = + 'import "./globals.css";\n\nProse after the side-effect import.'; + const chunks = chunkMarkdown(content, "test.mdx", mkConfig()); + const joined = chunks.map((c) => c.content).join("\n"); + expect(joined).not.toContain('import "./globals.css";'); + expect(joined).toContain("Prose after the side-effect import."); + }); +}); + +// ── Inlined-snippet byte normalization ────────────────────────────────────── +// +// chunkMarkdown normalizes the HOST content up front: CRLF → LF and the four PUA +// sentinels U+E000–U+E003 are stripped BEFORE any parsing/masking. But snippet +// imports (`@/snippets/*.mdx`) are inlined AFTER that, by inlineSnippetImports, +// which reads each snippet file RAW from disk (fs.readFileSync, no line-ending +// normalization, no PUA strip) and injects those bytes into the body. 
stripMdx +// does not strip `\r` either, so a CRLF- or PUA-authored snippet would otherwise +// bypass BOTH host normalizations: +// - CRLF: the single-line heading/fence predicates ($/. do not match `\r`) +// fail on the inlined snippet lines, so the snippet's headings degrade to +// headingPath=[], the title falls back to the filename, and `\r` leaks into +// served chunk content — SILENTLY (the inlined text is still a verbatim +// substring of the post-inline body, so the heading-path warning never +// fires). +// - PUA: a literal sentinel in a snippet survives into the masking passes, +// breaking the "placeholder namespace is exclusively ours downstream" +// guarantee and potentially surviving into a served chunk. +// chunkMarkdown must re-apply BOTH normalizations to the inlined body so +// reinjected snippet bytes match the host's normalization. +// +// These tests exercise the REAL snippet path: a temp dir holds a `snippets/` +// subtree (so inlineSnippetImports' findAliasRoot locates the `@/` alias root) +// plus a host doc that imports the snippet. The bad bytes are written RAW to the +// snippet file so they only ever enter the pipeline through fs.readFileSync — +// exactly the path that bypasses the host normalization. A real temp file (over +// a module mock) verifies the on-disk read end-to-end and matches this suite's +// mock-free convention. +describe("chunkMarkdown inlined-snippet byte normalization", () => { + const tmpDirs: string[] = []; + + function makeProject( + snippetRelPath: string, + snippetBytes: string, + ): { + hostAbsPath: string; + hostBody: string; + } { + // A unique project root containing snippets/ (the alias root marker) and a + // docs/ subdir holding the host page. findAliasRoot walks up from the host + // dir until it finds an ancestor containing snippets/, so the host must live + // BELOW the root that holds snippets/. 
+ const root = fs.mkdtempSync(path.join(os.tmpdir(), "pf-snippet-test-")); + tmpDirs.push(root); + const snippetAbs = path.join(root, "snippets", snippetRelPath); + fs.mkdirSync(path.dirname(snippetAbs), { recursive: true }); + // Write RAW bytes — no normalization — so the bad bytes only enter via the + // inlineSnippetImports fs.readFileSync path. + fs.writeFileSync(snippetAbs, snippetBytes, "utf-8"); + const docsDir = path.join(root, "docs"); + fs.mkdirSync(docsDir, { recursive: true }); + const hostAbsPath = path.join(docsDir, "host.mdx"); + const hostBody = [ + `import Snippet from "@/snippets/${snippetRelPath}";`, + "", + "Host intro paragraph before the snippet.", + "", + "", + "", + "Host outro paragraph after the snippet.", + ].join("\n"); + return { hostAbsPath, hostBody }; + } + + afterEach(() => { + while (tmpDirs.length) { + const dir = tmpDirs.pop(); + if (dir) fs.rmSync(dir, { recursive: true, force: true }); + } + }); + + it("normalizes CRLF in an inlined snippet (headings captured, title from snippet, no \\r leaks)", () => { + // The snippet is CRLF-authored: two `##` headings + a fenced code block, all + // with `\r\n` line endings. Padding pushes each section past the target so + // the splitter cuts on the snippet's headings (and the post-fence heading + // opens its own chunk, proving the fence closed rather than running to EOF). + const big = "Word ".repeat(120).trim(); + const snippetBytes = [ + "## Snippet Alpha", + "", + big, + "", + "```js", + "const fromSnippet = 1;", + "```", + "", + "## Snippet Beta", + "", + big, + ].join("\r\n"); + const { hostAbsPath, hostBody } = makeProject( + "shared/crlf-snippet.mdx", + snippetBytes, + ); + + const chunks = chunkMarkdown( + hostBody, + "host.mdx", + mkConfig({ target_tokens: 100, overlap_tokens: 0 }), + hostAbsPath, + ); + expect(chunks.length).toBeGreaterThan(0); + + // (a) The snippet's headings ARE captured — headingPath is NOT the degraded + // [] that a trailing-`\r` line produces. 
+ const allHeadings = new Set(chunks.flatMap((c) => c.headingPath ?? [])); + expect(allHeadings.has("Snippet Alpha")).toBe(true); + expect(allHeadings.has("Snippet Beta")).toBe(true); + + // (b) The derived title is the snippet's first heading (the host has no + // frontmatter and no heading of its own), NOT the filename fallback. + expect(chunks[0].title).toBe("Snippet Alpha"); + expect(chunks[0].title).not.toBe("host.mdx"); + + // (c) No `\r` survives into any served chunk content. + for (const chunk of chunks) { + expect(chunk.content.includes("\r")).toBe(false); + } + }); + + it("strips a literal PUA sentinel that arrives via an inlined snippet", () => { + // The snippet prose carries the four masking sentinels (U+E000–U+E003), + // including a heading-sentinel SHAPE pointing at index 0 — the same + // collision the host-side strip neutralizes, but arriving through the + // raw-read snippet path that bypasses it. + const OPEN_H = String.fromCharCode(0xe002); + const CLOSE_H = String.fromCharCode(0xe003); + const OPEN_C = String.fromCharCode(0xe000); + const CLOSE_C = String.fromCharCode(0xe001); + const snippetBytes = [ + "## Snippet Heading", + "", + `Snippet prose with a literal ${OPEN_H}0${CLOSE_H} sentinel and a ${OPEN_C}1${CLOSE_C} code sentinel.`, + ].join("\n"); + const { hostAbsPath, hostBody } = makeProject( + "shared/pua-snippet.mdx", + snippetBytes, + ); + + const chunks = chunkMarkdown( + hostBody, + "host.mdx", + mkConfig({ target_tokens: 80, overlap_tokens: 0 }), + hostAbsPath, + ); + expect(chunks.length).toBeGreaterThan(0); + + // No PUA sentinel may survive into any served chunk content. 
+ for (const chunk of chunks) { + expect(/[\u{E000}-\u{E003}]/u.test(chunk.content)).toBe(false); + } + }); +}); diff --git a/src/__tests__/orchestrator-dedup-parallel.test.ts b/src/__tests__/orchestrator-dedup-parallel.test.ts index 16eadf5..6ceb05c 100644 --- a/src/__tests__/orchestrator-dedup-parallel.test.ts +++ b/src/__tests__/orchestrator-dedup-parallel.test.ts @@ -126,8 +126,10 @@ vi.mock("../indexing/embeddings.js", () => { vi.mock("../indexing/pipeline.js", () => { return { IndexingPipeline: class MockIndexingPipeline { - indexItems = vi.fn().mockResolvedValue(undefined); - removeItems = vi.fn().mockResolvedValue(undefined); + // indexItems/removeItems now return { failedIds } so the orchestrator can + // hold the state token back on per-item failure (C1). + indexItems = vi.fn().mockResolvedValue({ failedIds: [] }); + removeItems = vi.fn().mockResolvedValue({ failedIds: [] }); }, }; }); diff --git a/src/__tests__/orchestrator-source-reindex.test.ts b/src/__tests__/orchestrator-source-reindex.test.ts index d17e5bb..ce04736 100644 --- a/src/__tests__/orchestrator-source-reindex.test.ts +++ b/src/__tests__/orchestrator-source-reindex.test.ts @@ -91,8 +91,10 @@ vi.mock("../indexing/embeddings.js", () => { vi.mock("../indexing/pipeline.js", () => { return { IndexingPipeline: class MockIndexingPipeline { - indexItems = vi.fn().mockResolvedValue(undefined); - removeItems = vi.fn().mockResolvedValue(undefined); + // indexItems/removeItems now return { failedIds } so the orchestrator can + // hold the state token back on per-item failure (C1). 
+ indexItems = vi.fn().mockResolvedValue({ failedIds: [] }); + removeItems = vi.fn().mockResolvedValue({ failedIds: [] }); }, }; }); diff --git a/src/__tests__/orchestrator-state-token-hold.test.ts b/src/__tests__/orchestrator-state-token-hold.test.ts new file mode 100644 index 0000000..fdf7e66 --- /dev/null +++ b/src/__tests__/orchestrator-state-token-hold.test.ts @@ -0,0 +1,229 @@ +import { describe, it, expect, vi, beforeEach } from "vitest"; + +// Regression for C1 (silent data loss): when the pipeline reports that one or +// more items failed to index/remove, the orchestrator must NOT advance the +// index state token (last_commit_sha). Advancing it would leave the failed +// items behind the new token so the next incremental run never re-diffs them — +// permanent silent loss. Instead the orchestrator holds the prior token and +// marks the run errored so the next run reprocesses the failed items. +// +// These mocks let each test control what the pipeline's indexItems/removeItems +// return (the failedIds) and assert on what gets written to index_state. 
+ +const { + mockGetIndexState, + mockUpsertIndexState, + mockIndexItems, + mockRemoveItems, + mockFullAcquire, + mockIncrementalAcquire, +} = vi.hoisted(() => ({ + mockGetIndexState: vi.fn(), + mockUpsertIndexState: vi.fn(), + mockIndexItems: vi.fn(), + mockRemoveItems: vi.fn(), + mockFullAcquire: vi.fn(), + mockIncrementalAcquire: vi.fn(), +})); + +vi.mock("../config.js", () => ({ + getConfig: vi.fn().mockReturnValue({ + databaseUrl: "postgresql://test", + openaiApiKey: "test-key", + githubToken: "", + githubWebhookSecret: "", + port: 3001, + nodeEnv: "test", + logLevel: "info", + cloneDir: "/tmp/test", + slackBotToken: "", + slackSigningSecret: "", + discordBotToken: "", + notionToken: "", + }), + getServerConfig: vi.fn().mockReturnValue({ + server: { name: "test", version: "1.0" }, + sources: [ + { + name: "docs", + type: "markdown", + path: "/tmp/docs", + file_patterns: ["**/*.md"], + chunk: {}, + }, + ], + tools: [ + { + name: "search", + type: "search", + description: "Search", + source: "docs", + default_limit: 5, + max_limit: 20, + result_format: "docs", + }, + ], + embedding: { + provider: "openai", + model: "text-embedding-3-small", + dimensions: 1536, + }, + indexing: { + auto_reindex: false, + reindex_hour_utc: 3, + stale_threshold_hours: 24, + }, + }), + getIndexableSourceNames: vi.fn().mockReturnValue(new Set(["docs"])), + getAnalyticsConfig: vi.fn().mockReturnValue(undefined), +})); + +vi.mock("../db/queries.js", () => ({ + getIndexState: (...args: unknown[]) => mockGetIndexState(...args), + upsertIndexState: (...args: unknown[]) => mockUpsertIndexState(...args), + cleanupOldWebhookDeliveries: vi.fn().mockResolvedValue(0), +})); + +vi.mock("../db/analytics.js", () => ({ + cleanupOldQueryLogs: vi.fn().mockResolvedValue(0), +})); + +vi.mock("../indexing/embeddings.js", () => { + class MockEmbeddingProvider { + embed = vi.fn().mockResolvedValue([0.1, 0.2]); + embedBatch = vi.fn().mockResolvedValue([[0.1, 0.2]]); + } + return { + EmbeddingClient: 
MockEmbeddingProvider, + createEmbeddingProvider: () => new MockEmbeddingProvider(), + }; +}); + +vi.mock("../indexing/pipeline.js", () => ({ + IndexingPipeline: class MockIndexingPipeline { + indexItems = mockIndexItems; + removeItems = mockRemoveItems; + }, +})); + +vi.mock("../indexing/providers/index.js", () => ({ + getProvider: vi.fn().mockReturnValue(() => ({ + fullAcquire: mockFullAcquire, + incrementalAcquire: mockIncrementalAcquire, + getCurrentStateToken: vi.fn().mockResolvedValue("new-token"), + })), +})); + +import { IndexingOrchestrator } from "../indexing/orchestrator.js"; + +/** Drive a source-reindex job to completion and return when drain settles. */ +async function runSourceReindex( + orchestrator: IndexingOrchestrator, +): Promise { + const done = new Promise((resolve) => { + orchestrator.onReindexComplete = () => resolve(); + }); + orchestrator.queueSourceReindex("docs"); + // The job resolves onReindexComplete only when affectedSourceNames is + // non-empty (always true for "docs"); fall back to a bounded poll so a + // path that returns early still lets the test proceed. + await Promise.race([ + done, + (async () => { + for (let i = 0; i < 50; i++) { + await new Promise((r) => setTimeout(r, 50)); + if (!orchestrator.isIndexing()) return; + } + })(), + ]); + // Give the final microtasks (status writes) a tick to flush. + await new Promise((r) => setTimeout(r, 50)); +} + +describe("IndexingOrchestrator state-token hold on item failure (C1)", () => { + let orchestrator: IndexingOrchestrator; + + beforeEach(() => { + vi.clearAllMocks(); + orchestrator = new IndexingOrchestrator(); + // Prior indexed state with an OLD token; incremental path will be taken. 
+ mockGetIndexState.mockResolvedValue({ + source_type: "markdown", + source_key: "docs", + last_commit_sha: "old-token", + last_indexed_at: new Date(), + status: "idle", + error_message: null, + }); + mockIndexItems.mockResolvedValue({ failedIds: [] }); + mockRemoveItems.mockResolvedValue({ failedIds: [] }); + }); + + it("does NOT advance the state token when an item fails to index", async () => { + mockIncrementalAcquire.mockResolvedValue({ + items: [ + { id: "docs/ok.md", content: "a" }, + { id: "docs/bad.md", content: "b" }, + ], + removedIds: [], + stateToken: "new-token", + }); + // One item failed. + mockIndexItems.mockResolvedValue({ failedIds: ["docs/bad.md"] }); + + const errSpy = vi.spyOn(console, "error").mockImplementation(() => {}); + await runSourceReindex(orchestrator); + errSpy.mockRestore(); + + // The success-path upsert (which would persist last_commit_sha:"new-token") + // must NOT have been called. + const advancedToNewToken = mockUpsertIndexState.mock.calls.some( + (c) => c[0]?.last_commit_sha === "new-token", + ); + expect(advancedToNewToken).toBe(false); + + // The run is marked errored while PRESERVING the prior token, so the next + // incremental run re-diffs from "old-token" and reprocesses docs/bad.md. 
+ const errorWrite = mockUpsertIndexState.mock.calls + .map((c) => c[0]) + .find((s) => s?.status === "error"); + expect(errorWrite).toBeDefined(); + expect(errorWrite.last_commit_sha).toBe("old-token"); + }); + + it("does NOT advance the state token when a removal fails", async () => { + mockIncrementalAcquire.mockResolvedValue({ + items: [], + removedIds: ["docs/gone.md"], + stateToken: "new-token", + }); + mockRemoveItems.mockResolvedValue({ failedIds: ["docs/gone.md"] }); + + const errSpy = vi.spyOn(console, "error").mockImplementation(() => {}); + await runSourceReindex(orchestrator); + errSpy.mockRestore(); + + const advancedToNewToken = mockUpsertIndexState.mock.calls.some( + (c) => c[0]?.last_commit_sha === "new-token", + ); + expect(advancedToNewToken).toBe(false); + }); + + it("ADVANCES the state token when every item indexes successfully", async () => { + mockIncrementalAcquire.mockResolvedValue({ + items: [{ id: "docs/ok.md", content: "a" }], + removedIds: ["docs/gone.md"], + stateToken: "new-token", + }); + mockIndexItems.mockResolvedValue({ failedIds: [] }); + mockRemoveItems.mockResolvedValue({ failedIds: [] }); + + await runSourceReindex(orchestrator); + + const advanced = mockUpsertIndexState.mock.calls + .map((c) => c[0]) + .find((s) => s?.last_commit_sha === "new-token"); + expect(advanced).toBeDefined(); + expect(advanced.status).toBe("idle"); + }); +}); diff --git a/src/__tests__/pipeline.test.ts b/src/__tests__/pipeline.test.ts index fa63f56..5806fc3 100644 --- a/src/__tests__/pipeline.test.ts +++ b/src/__tests__/pipeline.test.ts @@ -3,17 +3,28 @@ import { IndexingPipeline } from "../indexing/pipeline.js"; import type { ContentItem } from "../indexing/providers/types.js"; import type { SourceConfig } from "../types.js"; -// Mock the dependencies -vi.mock("../indexing/chunking/index.js", () => ({ - getChunker: vi - .fn() - .mockReturnValue((content: string, _filePath: string, _config: unknown) => [ +// Mock the dependencies. 
The inner chunker is a vi.fn (hoisted so the vi.mock +// factory can close over it) so tests can assert the full argument list — the +// real ChunkerFn and call site pass a 4th arg, item.absolutePath, in addition +// to content/filePath/config. +const { mockChunkerFn } = vi.hoisted(() => ({ + mockChunkerFn: vi.fn( + ( + content: string, + _filePath: string, + _config: unknown, + _absolutePath?: string, + ) => [ { content, title: "Test Title", chunkIndex: 0, }, - ]), + ], + ), +})); +vi.mock("../indexing/chunking/index.js", () => ({ + getChunker: vi.fn().mockReturnValue(mockChunkerFn), })); vi.mock("../indexing/embeddings.js", () => { @@ -26,7 +37,7 @@ vi.mock("../indexing/embeddings.js", () => { }); vi.mock("../db/queries.js", () => ({ - upsertChunks: vi.fn().mockResolvedValue(undefined), + replaceChunksForFile: vi.fn().mockResolvedValue(undefined), deleteChunksByFile: vi.fn().mockResolvedValue(undefined), })); @@ -34,7 +45,8 @@ vi.mock("../indexing/url-derivation.js", () => ({ deriveUrl: () => "https://example.com/test", })); -const { upsertChunks, deleteChunksByFile } = await import("../db/queries.js"); +const { replaceChunksForFile, deleteChunksByFile } = + await import("../db/queries.js"); const { EmbeddingClient } = await import("../indexing/embeddings.js"); const testConfig: SourceConfig = { @@ -50,20 +62,31 @@ describe("IndexingPipeline", () => { const embeddingClient = new EmbeddingClient("key", "model", 1536); const pipeline = new IndexingPipeline(embeddingClient, testConfig); + mockChunkerFn.mockClear(); const items: ContentItem[] = [ { id: "docs/test.md", + absolutePath: "/abs/clone/docs/test.md", content: "# Hello\nSome content here", }, ]; await pipeline.indexItems(items, "abc123"); - expect(deleteChunksByFile).toHaveBeenCalledWith( - "test-source", + // The chunker receives item.absolutePath as its 4th argument (some chunkers + // need the on-disk path, e.g. for language/extension-aware splitting). 
+ expect(mockChunkerFn).toHaveBeenCalledWith( + "# Hello\nSome content here", "docs/test.md", + testConfig, + "/abs/clone/docs/test.md", ); - expect(upsertChunks).toHaveBeenCalledWith( + + // The delete+upsert is now a SINGLE atomic call so a failed upsert cannot + // leave the file's chunks deleted-but-not-replaced (data loss). + expect(replaceChunksForFile).toHaveBeenCalledWith( + "test-source", + "docs/test.md", expect.arrayContaining([ expect.objectContaining({ source_name: "test-source", @@ -74,23 +97,38 @@ describe("IndexingPipeline", () => { ); }); - it("skips items that produce zero chunks", async () => { + it("clears stale chunks for items that now produce zero chunks", async () => { + // A file that previously indexed N chunks but now yields zero (and is + // routed through `items`, not `removedIds`) must have its stale chunks + // cleared — NOT left in the index forever. The zero-chunk path calls + // replaceChunksForFile(name, id, []) (the delete-only transaction) instead + // of early-returning. Embedding is skipped (no chunks to embed). const { getChunker } = await import("../indexing/chunking/index.js"); vi.mocked(getChunker).mockReturnValueOnce(() => []); const embeddingClient = new EmbeddingClient("key", "model", 1536); const pipeline = new IndexingPipeline(embeddingClient, testConfig); - vi.mocked(upsertChunks).mockClear(); + vi.mocked(replaceChunksForFile).mockClear(); + vi.mocked(embeddingClient.embedBatch).mockClear(); await pipeline.indexItems([{ id: "empty.md", content: "" }], "abc"); - expect(upsertChunks).not.toHaveBeenCalled(); + + // Delete-only call with an EMPTY chunk array clears any prior chunks. + expect(replaceChunksForFile).toHaveBeenCalledWith( + "test-source", + "empty.md", + [], + ); + // No embedding round-trip when there are no chunks to embed. 
+ expect(embeddingClient.embedBatch).not.toHaveBeenCalled(); }); it("removes items by ID", async () => { const embeddingClient = new EmbeddingClient("key", "model", 1536); const pipeline = new IndexingPipeline(embeddingClient, testConfig); - vi.mocked(deleteChunksByFile).mockClear(); + vi.mocked(deleteChunksByFile).mockReset(); + vi.mocked(deleteChunksByFile).mockResolvedValue(undefined); await pipeline.removeItems(["docs/old.md", "docs/deleted.md"]); expect(deleteChunksByFile).toHaveBeenCalledTimes(2); @@ -104,11 +142,65 @@ describe("IndexingPipeline", () => { ); }); + it("continues removing remaining ids when one delete fails and reports the failed id", async () => { + const embeddingClient = new EmbeddingClient("key", "model", 1536); + const pipeline = new IndexingPipeline(embeddingClient, testConfig); + + const errSpy = vi.spyOn(console, "error").mockImplementation(() => {}); + vi.mocked(deleteChunksByFile).mockReset(); + // First id fails; the batch must NOT abort — the remaining ids still run. + vi.mocked(deleteChunksByFile) + .mockRejectedValueOnce(new Error("delete boom")) + .mockResolvedValueOnce(undefined) + .mockResolvedValueOnce(undefined); + + // The failed id MUST be returned so the caller holds the state token back. + const { failedIds } = await pipeline.removeItems([ + "docs/bad.md", + "docs/ok1.md", + "docs/ok2.md", + ]); + expect(failedIds).toEqual(["docs/bad.md"]); + + expect(deleteChunksByFile).toHaveBeenCalledTimes(3); + expect(deleteChunksByFile).toHaveBeenCalledWith( + "test-source", + "docs/ok1.md", + ); + expect(deleteChunksByFile).toHaveBeenCalledWith( + "test-source", + "docs/ok2.md", + ); + // The failure was logged via the pipeline's logPrefix and the FULL error + // object (not just err.message) so the stack survives. 
+ expect(errSpy).toHaveBeenCalledWith( + expect.stringContaining( + "[pipeline:test-source] Failed to remove docs/bad.md", + ), + expect.any(Error), + ); + errSpy.mockRestore(); + }); + + it("reports an empty failedIds array when all removes succeed", async () => { + const embeddingClient = new EmbeddingClient("key", "model", 1536); + const pipeline = new IndexingPipeline(embeddingClient, testConfig); + + vi.mocked(deleteChunksByFile).mockReset(); + vi.mocked(deleteChunksByFile).mockResolvedValue(undefined); + + const { failedIds } = await pipeline.removeItems([ + "docs/a.md", + "docs/b.md", + ]); + expect(failedIds).toEqual([]); + }); + it("passes sourceUrl from ContentItem when provided", async () => { const embeddingClient = new EmbeddingClient("key", "model", 1536); const pipeline = new IndexingPipeline(embeddingClient, testConfig); - vi.mocked(upsertChunks).mockClear(); + vi.mocked(replaceChunksForFile).mockClear(); await pipeline.indexItems( [ { @@ -120,7 +212,9 @@ describe("IndexingPipeline", () => { "abc", ); - expect(upsertChunks).toHaveBeenCalledWith( + expect(replaceChunksForFile).toHaveBeenCalledWith( + "test-source", + "docs/test.md", expect.arrayContaining([ expect.objectContaining({ source_url: "https://custom.url/test", @@ -128,4 +222,176 @@ describe("IndexingPipeline", () => { ]), ); }); + + it("embeds the chunk title and headingPath alongside the content", async () => { + const { getChunker } = await import("../indexing/chunking/index.js"); + vi.mocked(getChunker).mockReturnValueOnce(() => [ + { + content: "The body of the chunk", + title: "useCopilotAction", + headingPath: ["Reference", "Hooks"], + chunkIndex: 0, + }, + ]); + + const embeddingClient = new EmbeddingClient("key", "model", 1536); + const pipeline = new IndexingPipeline(embeddingClient, testConfig); + + await pipeline.indexItems( + [{ id: "docs/hooks.md", content: "irrelevant" }], + "abc", + ); + + const embedBatch = vi.mocked(embeddingClient.embedBatch); + 
expect(embedBatch).toHaveBeenCalledTimes(1); + const embeddedText = embedBatch.mock.calls[0][0][0]; + expect(embeddedText).toContain("useCopilotAction"); + expect(embeddedText).toContain("Reference"); + expect(embeddedText).toContain("Hooks"); + expect(embeddedText).toContain("The body of the chunk"); + }); + + it("embeds content gracefully when a code chunk has no heading", async () => { + const { getChunker } = await import("../indexing/chunking/index.js"); + vi.mocked(getChunker).mockReturnValueOnce(() => [ + { + content: "export function foo() {}", + chunkIndex: 0, + }, + ]); + + const embeddingClient = new EmbeddingClient("key", "model", 1536); + const pipeline = new IndexingPipeline(embeddingClient, testConfig); + + await pipeline.indexItems( + [{ id: "src/foo.ts", content: "irrelevant" }], + "abc", + ); + + const embedBatch = vi.mocked(embeddingClient.embedBatch); + expect(embedBatch).toHaveBeenCalledTimes(1); + const embeddedText = embedBatch.mock.calls[0][0][0]; + // No leading/trailing newlines from absent title/headingPath. + expect(embeddedText).toBe("export function foo() {}"); + }); + + it("keeps the chunk's headingPath even when item.metadata supplies one", async () => { + const { getChunker } = await import("../indexing/chunking/index.js"); + vi.mocked(getChunker).mockReturnValueOnce(() => [ + { + content: "The body of the chunk", + title: "useCopilotAction", + headingPath: ["Reference", "Hooks"], + chunkIndex: 0, + }, + ]); + + const embeddingClient = new EmbeddingClient("key", "model", 1536); + const pipeline = new IndexingPipeline(embeddingClient, testConfig); + + vi.mocked(replaceChunksForFile).mockClear(); + await pipeline.indexItems( + [ + { + id: "docs/hooks.md", + content: "irrelevant", + // A provider that (incorrectly) sets headingPath must NOT clobber the + // chunk-derived headingPath, which is embedded into the vector and is + // load-bearing for retrieval. 
+ metadata: { headingPath: ["Wrong", "Provider", "Path"], custom: "x" }, + }, + ], + "abc", + ); + + const upserted = vi.mocked(replaceChunksForFile).mock.calls[0][2]; + expect(upserted[0].metadata).toMatchObject({ + headingPath: ["Reference", "Hooks"], + custom: "x", + }); + }); + + it("throws when embedBatch returns fewer embeddings than texts", async () => { + const { getChunker } = await import("../indexing/chunking/index.js"); + // Two chunks → two texts to embed. + vi.mocked(getChunker).mockReturnValueOnce(() => [ + { content: "chunk one", chunkIndex: 0 }, + { content: "chunk two", chunkIndex: 1 }, + ]); + + const embeddingClient = new EmbeddingClient("key", "model", 1536); + // Stub the provider to return only ONE embedding for the two texts. + vi.mocked(embeddingClient.embedBatch).mockResolvedValueOnce([ + [0.1, 0.2, 0.3], + ]); + + const pipeline = new IndexingPipeline(embeddingClient, testConfig); + + // indexItems swallows per-item errors, so exercise indexItem directly to + // assert the loud failure on the embedding-count mismatch. + const indexItem = ( + pipeline as unknown as { + indexItem(item: ContentItem, stateToken: string): Promise; + } + ).indexItem.bind(pipeline); + + await expect( + indexItem({ id: "docs/two.md", content: "irrelevant" }, "abc"), + ).rejects.toThrow(/Embedding count mismatch for item docs\/two\.md/); + }); + + it("swallows a replaceChunksForFile failure per item, continues the batch, and reports the failed id", async () => { + const errSpy = vi.spyOn(console, "error").mockImplementation(() => {}); + vi.mocked(replaceChunksForFile).mockReset(); + // First item's atomic replace throws; the loop must log + continue so the + // second item is still indexed. 
+ vi.mocked(replaceChunksForFile) + .mockRejectedValueOnce(new Error("replace boom")) + .mockResolvedValueOnce(undefined); + + const embeddingClient = new EmbeddingClient("key", "model", 1536); + const pipeline = new IndexingPipeline(embeddingClient, testConfig); + + // The failed id is RETURNED (so the orchestrator holds the token back) and + // the successful item still indexes. + const { failedIds } = await pipeline.indexItems( + [ + { id: "docs/bad.md", content: "a" }, + { id: "docs/good.md", content: "b" }, + ], + "abc", + ); + expect(failedIds).toEqual(["docs/bad.md"]); + + expect(replaceChunksForFile).toHaveBeenCalledTimes(2); + // The successful item still wrote its chunks. + expect(replaceChunksForFile).toHaveBeenCalledWith( + "test-source", + "docs/good.md", + expect.any(Array), + ); + // The failure logs the FULL error object (not just err.message). + expect(errSpy).toHaveBeenCalledWith( + expect.stringContaining("Failed to index docs/bad.md"), + expect.any(Error), + ); + errSpy.mockRestore(); + // Restore the default resolved behavior for any later tests. 
+ vi.mocked(replaceChunksForFile).mockReset(); + vi.mocked(replaceChunksForFile).mockResolvedValue(undefined); + }); + + it("reports an empty failedIds array when all items index successfully", async () => { + vi.mocked(replaceChunksForFile).mockReset(); + vi.mocked(replaceChunksForFile).mockResolvedValue(undefined); + + const embeddingClient = new EmbeddingClient("key", "model", 1536); + const pipeline = new IndexingPipeline(embeddingClient, testConfig); + + const { failedIds } = await pipeline.indexItems( + [{ id: "docs/ok.md", content: "a" }], + "abc", + ); + expect(failedIds).toEqual([]); + }); }); diff --git a/src/__tests__/queries-numeric-coercion.test.ts b/src/__tests__/queries-numeric-coercion.test.ts new file mode 100644 index 0000000..ccb5a0b --- /dev/null +++ b/src/__tests__/queries-numeric-coercion.test.ts @@ -0,0 +1,140 @@ +import { describe, it, expect, vi, beforeEach } from "vitest"; + +// Mock the db client to intercept pool.query calls. Mirrors faq-queries.test.ts. +const mockQuery = vi.fn(); +vi.mock("../db/client.js", () => ({ + getPool: () => ({ query: mockQuery }), +})); + +// pgvector.toSql is called by searchChunks on the embedding param; stub it so +// the mapper-under-test runs without a real pgvector dependency. +vi.mock("pgvector", () => ({ + default: { toSql: (v: unknown) => v }, +})); + +import { + searchChunks, + getIndexStats, + getWebhookDeliveryStats, +} from "../db/queries.js"; + +// These guards exist because node-postgres deserializes numeric columns as +// STRINGS, and Number() of a non-numeric string (e.g. "high") yields NaN — a +// NaN similarity corrupts sort order / top_score, and a string count leaks into +// the health-endpoint stats. (Number(null) and Number("") are 0, the desired +// default; only non-numeric strings reach the NaN→0 guard.) Both are coerced +// through a toFiniteNumber discipline (mirroring getAnalyticsSummary). These +// tests pin the value-level coercion the mock-pool FAQ tests don't probe. 
+ +function row(overrides: Record = {}): Record { + return { + id: 1, + source_name: "docs", + source_url: null, + title: null, + content: "x", + repo_url: null, + file_path: "a.md", + start_line: null, + end_line: null, + language: null, + similarity: "0.42", + ...overrides, + }; +} + +describe("searchChunks similarity coercion", () => { + beforeEach(() => mockQuery.mockReset()); + + it("parses a string similarity into a finite number", async () => { + mockQuery.mockResolvedValueOnce({ rows: [row({ similarity: "0.42" })] }); + const [r] = await searchChunks([0.1], 5); + expect(r.similarity).toBe(0.42); + expect(Number.isFinite(r.similarity)).toBe(true); + }); + + it("coerces a null/non-numeric similarity to 0 instead of NaN", async () => { + mockQuery.mockResolvedValueOnce({ + rows: [ + row({ id: 1, similarity: null }), + row({ id: 2, similarity: "not-a-number" }), + ], + }); + const results = await searchChunks([0.1], 5); + // A NaN here would corrupt the downstream sort / top_score. + expect(results[0].similarity).toBe(0); + expect(results[1].similarity).toBe(0); + expect(results.every((r) => Number.isFinite(r.similarity))).toBe(true); + }); +}); + +describe("getIndexStats numeric coercion", () => { + beforeEach(() => mockQuery.mockReset()); + + it("coerces string counts (node-postgres int-as-string) to finite numbers", async () => { + // The four Promise.all queries resolve in order: total, by-source, repos, + // index_state. node-postgres returns count(*)::int as a STRING. 
+ mockQuery + .mockResolvedValueOnce({ rows: [{ count: "1234" }] }) // total + .mockResolvedValueOnce({ + rows: [{ source_name: "docs", count: "42" }], + }) // by source + .mockResolvedValueOnce({ rows: [{ count: "7" }] }) // repos + .mockResolvedValueOnce({ rows: [] }); // index_state + + const stats = await getIndexStats(); + + expect(stats.totalChunks).toBe(1234); + expect(typeof stats.totalChunks).toBe("number"); + expect(stats.indexedRepos).toBe(7); + expect(stats.bySource[0].count).toBe(42); + expect(typeof stats.bySource[0].count).toBe("number"); + }); + + it("defaults a missing/non-numeric count to 0", async () => { + mockQuery + .mockResolvedValueOnce({ rows: [{}] }) // total — no count key + .mockResolvedValueOnce({ rows: [] }) // by source + .mockResolvedValueOnce({ rows: [{ count: null }] }) // repos — null + .mockResolvedValueOnce({ rows: [] }); // index_state + + const stats = await getIndexStats(); + expect(stats.totalChunks).toBe(0); + expect(stats.indexedRepos).toBe(0); + }); +}); + +describe("getWebhookDeliveryStats numeric coercion", () => { + beforeEach(() => mockQuery.mockReset()); + + it("coerces by_decision string counts (node-postgres int-as-string) to numbers", async () => { + // The three Promise.all queries resolve in order: per-decision counts, + // last delivery, error rows. node-postgres returns count(*)::int as a + // STRING, so the per-decision map must coerce — by_decision is declared + // Record and serialized into the /health endpoint, so a + // raw string ("accept": "5") is a user-facing type violation. + mockQuery + .mockResolvedValueOnce({ + rows: [ + { decision: "accept", count: "5" }, + { decision: "reject", count: "2" }, + ], + }) // per-decision counts + .mockResolvedValueOnce({ rows: [] }) // last delivery + .mockResolvedValueOnce({ rows: [] }); // error rows + + const stats = await getWebhookDeliveryStats(); + + // Every by_decision value must be a NUMBER, not the raw driver string. 
+ for (const [decision, count] of Object.entries(stats.by_decision)) { + expect(typeof count).toBe("number"); + expect(Number.isFinite(count)).toBe(true); + void decision; + } + expect(stats.by_decision.accept).toBe(5); + expect(stats.by_decision.reject).toBe(2); + // total_24h sums the same coerced counts. + expect(stats.total_24h).toBe(7); + expect(typeof stats.total_24h).toBe("number"); + }); +}); diff --git a/src/__tests__/replace-chunks.test.ts b/src/__tests__/replace-chunks.test.ts new file mode 100644 index 0000000..382fd37 --- /dev/null +++ b/src/__tests__/replace-chunks.test.ts @@ -0,0 +1,106 @@ +import { describe, it, expect, vi, beforeEach } from "vitest"; +import type { Chunk } from "../types.js"; + +// Mock the db client to intercept the pooled-client lifecycle. replaceChunksForFile +// must acquire a single client via pool.connect() and run BEGIN → DELETE → INSERT… +// → COMMIT (or ROLLBACK on failure) on THAT client, mirroring upsertChunks. This +// follows the faq-queries.test.ts pattern of mocking ../db/client.js. + +const clientQuery = vi.fn(); +const clientRelease = vi.fn(); +const connect = vi.fn(async () => ({ + query: clientQuery, + release: clientRelease, +})); + +vi.mock("../db/client.js", () => ({ + getPool: () => ({ connect }), +})); + +// Import AFTER mocking so queries.ts binds to the mocked getPool. 
+import { replaceChunksForFile } from "../db/queries.js"; + +function mkChunk(overrides: Partial = {}): Chunk { + return { + source_name: "docs", + source_url: null, + title: "T", + content: "body", + embedding: [0.1, 0.2, 0.3], + repo_url: null, + file_path: "docs/a.md", + start_line: null, + end_line: null, + language: null, + chunk_index: 0, + metadata: {}, + commit_sha: "sha", + version: null, + ...overrides, + }; +} + +describe("replaceChunksForFile (atomic delete + insert)", () => { + beforeEach(() => { + clientQuery.mockReset(); + clientRelease.mockReset(); + connect.mockClear(); + clientQuery.mockResolvedValue({ rows: [], rowCount: 0 }); + }); + + it("runs DELETE then INSERTs inside a single BEGIN/COMMIT on one client", async () => { + await replaceChunksForFile("docs", "docs/a.md", [ + mkChunk({ chunk_index: 0 }), + mkChunk({ chunk_index: 1 }), + ]); + + // One client acquired and released. + expect(connect).toHaveBeenCalledTimes(1); + expect(clientRelease).toHaveBeenCalledTimes(1); + + const issued = clientQuery.mock.calls.map((c) => String(c[0])); + expect(issued[0]).toBe("BEGIN"); + // DELETE is scoped to the (source_name, file_path) pair and runs before any insert. + expect(issued[1]).toContain("DELETE FROM chunks"); + expect(clientQuery.mock.calls[1][1]).toEqual(["docs", "docs/a.md"]); + // Two inserts for two chunks. + const insertCount = issued.filter((s) => + s.includes("INSERT INTO chunks"), + ).length; + expect(insertCount).toBe(2); + // Commits, never rolls back, on the happy path. + expect(issued).toContain("COMMIT"); + expect(issued).not.toContain("ROLLBACK"); + }); + + it("ROLLS BACK and rethrows when an INSERT fails, leaving pre-existing chunks intact", async () => { + // BEGIN ok, DELETE ok, first INSERT throws. 
+ clientQuery + .mockResolvedValueOnce({ rows: [] }) // BEGIN + .mockResolvedValueOnce({ rows: [], rowCount: 1 }) // DELETE + .mockRejectedValueOnce(new Error("insert exploded")) // INSERT #1 + .mockResolvedValue({ rows: [] }); // ROLLBACK + + await expect( + replaceChunksForFile("docs", "docs/a.md", [mkChunk()]), + ).rejects.toThrow("insert exploded"); + + const issued = clientQuery.mock.calls.map((c) => String(c[0])); + // The transaction must be rolled back (NOT committed) so the DELETE never + // becomes durable — the file's PRE-EXISTING chunks survive intact, not zero. + expect(issued).toContain("ROLLBACK"); + expect(issued).not.toContain("COMMIT"); + // Client is always released even on the error path. + expect(clientRelease).toHaveBeenCalledTimes(1); + }); + + it("performs a delete-only transaction when given an empty chunk array", async () => { + await replaceChunksForFile("docs", "docs/gone.md", []); + + const issued = clientQuery.mock.calls.map((c) => String(c[0])); + expect(issued[0]).toBe("BEGIN"); + expect(issued[1]).toContain("DELETE FROM chunks"); + expect(issued.some((s) => s.includes("INSERT INTO chunks"))).toBe(false); + expect(issued).toContain("COMMIT"); + }); +}); diff --git a/src/__tests__/schema.test.ts b/src/__tests__/schema.test.ts index 32df670..7134448 100644 --- a/src/__tests__/schema.test.ts +++ b/src/__tests__/schema.test.ts @@ -187,6 +187,7 @@ describe("generatePostSchemaMigration", () => { "latency_ms", "source_name", "session_id", + "request_source", "created_at", ]) { expect(sql).toContain(col); @@ -202,4 +203,45 @@ describe("generatePostSchemaMigration", () => { "CREATE INDEX IF NOT EXISTS idx_query_log_tool_name ON query_log (tool_name)", ); }); + + it("adds request_source via idempotent ADD COLUMN IF NOT EXISTS for back-compat", () => { + // Installs whose query_log predates the request_source column must pick it + // up without a destructive migration. The ALTER ... 
ADD COLUMN IF NOT + // EXISTS is what makes re-running the post-schema migration safe on both + // fresh and existing databases. + const sql = generatePostSchemaMigration(); + expect(sql).toContain( + "ALTER TABLE query_log ADD COLUMN IF NOT EXISTS request_source TEXT", + ); + }); + + it("indexes request_source for audience-filtered analytics reads", () => { + const sql = generatePostSchemaMigration(); + expect(sql).toContain( + "CREATE INDEX IF NOT EXISTS idx_query_log_request_source ON query_log (request_source)", + ); + }); + + it("creates the webhook_deliveries table and its indexes", () => { + // The JSDoc on generatePostSchemaMigration claims it also creates + // webhook_deliveries (alongside query_log) — lock that claim so the + // doc and the DDL stay in sync. + const sql = generatePostSchemaMigration(); + expect(sql).toContain("CREATE TABLE IF NOT EXISTS webhook_deliveries"); + expect(sql).toContain( + "CREATE INDEX IF NOT EXISTS idx_webhook_deliveries_source ON webhook_deliveries (source)", + ); + expect(sql).toContain( + "CREATE INDEX IF NOT EXISTS idx_webhook_deliveries_delivered_at ON webhook_deliveries (delivered_at)", + ); + }); + + it("does NOT append the tsvector trigger DDL (returned separately)", () => { + // generateTsvTriggerDdl() is the source of the trigger DDL; the + // post-schema migration returns only the core (PGlite-safe) SQL. The + // JSDoc must not claim the trigger is "appended" here. 
+ const sql = generatePostSchemaMigration(); + expect(sql).not.toContain("CREATE TRIGGER"); + expect(sql).not.toContain("chunks_tsv_trigger"); + }); }); diff --git a/src/__tests__/search-analytics.test.ts b/src/__tests__/search-analytics.test.ts index 37650d5..4a4ba8d 100644 --- a/src/__tests__/search-analytics.test.ts +++ b/src/__tests__/search-analytics.test.ts @@ -246,4 +246,109 @@ describe("search tool analytics instrumentation", () => { const [entry] = mockLogQuery.mock.calls[0]; expect(entry.top_score).toBeCloseTo(0.95); }); + + it("logs null session_id / request_source when no accessors are wired", async () => { + // The default registration (no options) must still produce a valid row — + // the writer defaults a null request_source to 'user', and session_id + // stays null when there's no session context to thread. + mockGetAnalyticsConfig.mockReturnValue({ + enabled: true, + log_queries: true, + retention_days: 90, + }); + mockEmbed.mockResolvedValueOnce([0.1]); + mockSearchChunks.mockResolvedValueOnce([makeChunkResult()]); + mockLogQuery.mockResolvedValueOnce(undefined); + + await client.callTool({ + name: "search-docs", + arguments: { query: "test" }, + }); + await new Promise((r) => setTimeout(r, 10)); + + const [entry] = mockLogQuery.mock.calls[0]; + expect(entry.session_id).toBeNull(); + expect(entry.request_source).toBeNull(); + }); +}); + +// --------------------------------------------------------------------------- +// session_id + request_source threading from the MCP session context +// +// Regression for the observability gap: session_id was hardcoded null on every +// query_log row and there was no request-origin tag at all. The tool handler +// must thread both through from the accessors createMcpServer passes in +// (getSessionId from the transport, getRequestSource from X-Pathfinder-Source). 
+// --------------------------------------------------------------------------- + +describe("search tool threads session_id and request_source into logQuery", () => { + let client: Client; + let server: McpServer; + let currentSessionId: string | undefined; + let currentRequestSource: string | undefined; + + beforeAll(async () => { + server = new McpServer({ name: "test", version: "1.0.0" }); + registerSearchTool( + server as never, + { embed: mockEmbed } as never, + toolConfig, + { + // Late-bound accessors, mirroring how server.ts wires the real ones: + // the session id isn't known until the transport connects, and the + // request source is captured from the init request header. + getSessionId: () => currentSessionId, + getRequestSource: () => currentRequestSource, + }, + ); + + const [clientTransport, serverTransport] = + InMemoryTransport.createLinkedPair(); + await server.connect(serverTransport); + + client = new Client({ name: "test-client", version: "1.0.0" }); + await client.connect(clientTransport); + }); + + beforeEach(() => { + vi.clearAllMocks(); + mockGetAnalyticsConfig.mockReturnValue({ + enabled: true, + log_queries: true, + retention_days: 90, + }); + }); + + afterAll(async () => { + await client.close(); + await server.close(); + }); + + it("persists the resolved session_id (not null)", async () => { + currentSessionId = "mcp-session-abc"; + currentRequestSource = "user"; + mockEmbed.mockResolvedValueOnce([0.1]); + mockSearchChunks.mockResolvedValueOnce([makeChunkResult()]); + mockLogQuery.mockResolvedValueOnce(undefined); + + await client.callTool({ name: "search-docs", arguments: { query: "q" } }); + await new Promise((r) => setTimeout(r, 10)); + + const [entry] = mockLogQuery.mock.calls[0]; + expect(entry.session_id).toBe("mcp-session-abc"); + }); + + it("persists the resolved request_source tag", async () => { + currentSessionId = "mcp-session-xyz"; + currentRequestSource = "synthetic"; + mockEmbed.mockResolvedValueOnce([0.1]); + 
mockSearchChunks.mockResolvedValueOnce([makeChunkResult()]); + mockLogQuery.mockResolvedValueOnce(undefined); + + await client.callTool({ name: "search-docs", arguments: { query: "q" } }); + await new Promise((r) => setTimeout(r, 10)); + + const [entry] = mockLogQuery.mock.calls[0]; + expect(entry.request_source).toBe("synthetic"); + }); }); diff --git a/src/__tests__/snippet-inline.test.ts b/src/__tests__/snippet-inline.test.ts new file mode 100644 index 0000000..3de2823 --- /dev/null +++ b/src/__tests__/snippet-inline.test.ts @@ -0,0 +1,273 @@ +import { describe, it, expect, beforeEach, afterEach } from "vitest"; +import fs from "node:fs"; +import path from "node:path"; +import os from "node:os"; +import { chunkMarkdown } from "../indexing/chunking/markdown.js"; +import { inlineSnippetImports } from "../indexing/chunking/snippets.js"; +import type { SourceConfig } from "../types.js"; + +// Helper to build a minimal markdown SourceConfig. +function mkConfig( + overrides: { target_tokens?: number; overlap_tokens?: number } = {}, +): SourceConfig { + return { + name: "test", + type: "markdown", + path: "/tmp", + file_patterns: ["**/*.mdx"], + chunk: { + target_tokens: overrides.target_tokens, + overlap_tokens: overrides.overlap_tokens, + }, + } as SourceConfig; +} + +/** + * Build a CopilotKit-style docs tree in a temp dir: + * + * /docs/snippets/ + * /docs/content/docs/ + * + * The `@/` alias maps to the docs project root (`/docs`), so + * `@/snippets/foo.mdx` resolves to `/docs/snippets/foo.mdx`. + * + * Returns the absolute path of the host file. 
+ */ +function buildDocsTree( + root: string, + hostRelPath: string, + hostContent: string, + snippets: Record, +): string { + const docsRoot = path.join(root, "docs"); + for (const [rel, body] of Object.entries(snippets)) { + const abs = path.join(docsRoot, "snippets", rel); + fs.mkdirSync(path.dirname(abs), { recursive: true }); + fs.writeFileSync(abs, body, "utf-8"); + } + const hostAbs = path.join(docsRoot, "content", "docs", hostRelPath); + fs.mkdirSync(path.dirname(hostAbs), { recursive: true }); + fs.writeFileSync(hostAbs, hostContent, "utf-8"); + return hostAbs; +} + +describe("inlineSnippetImports", () => { + let tmp: string; + + beforeEach(() => { + tmp = fs.mkdtempSync(path.join(os.tmpdir(), "pf-snippet-")); + }); + + afterEach(() => { + fs.rmSync(tmp, { recursive: true, force: true }); + }); + + it("inlines a self-closing snippet component referenced via @/ alias", () => { + const snippetBody = + "## Overview\n\nCopilotKit V2 consolidates the frontend into a single package."; + const hostContent = [ + "---", + "title: Migrate to V2", + "---", + 'import MigrateToV2 from "@/snippets/shared/troubleshooting/migrate-to-v2.mdx";', + "", + "", + "", + ].join("\n"); + + const hostAbs = buildDocsTree( + tmp, + "(root)/migration-guides/migrate-to-v2.mdx", + hostContent, + { "shared/troubleshooting/migrate-to-v2.mdx": snippetBody }, + ); + + const result = inlineSnippetImports(hostContent, hostAbs); + + expect(result).toContain( + "CopilotKit V2 consolidates the frontend into a single package", + ); + // The import statement and JSX usage should no longer be present + expect(result).not.toContain('from "@/snippets'); + expect(result).not.toContain(" { + const inner = "### Inner Snippet\n\nDeeply nested inlined content here."; + const outer = [ + "## Outer Snippet", + "", + 'import Inner from "@/snippets/inner.mdx";', + "", + "", + "", + ].join("\n"); + const hostContent = [ + "---", + "title: Host", + "---", + 'import Outer from "@/snippets/outer.mdx";', + "", + 
"", + ].join("\n"); + + const hostAbs = buildDocsTree(tmp, "guide.mdx", hostContent, { + "outer.mdx": outer, + "inner.mdx": inner, + }); + + const result = inlineSnippetImports(hostContent, hostAbs); + + expect(result).toContain("Outer Snippet"); + expect(result).toContain("Deeply nested inlined content here"); + }); + + it("leaves the original usage and skips when the snippet file is missing", () => { + const hostContent = [ + 'import Missing from "@/snippets/does-not-exist.mdx";', + "", + "", + "", + "Body text after.", + ].join("\n"); + + const hostAbs = buildDocsTree(tmp, "guide.mdx", hostContent, {}); + + // Must not throw, and must not lose the rest of the document. + const result = inlineSnippetImports(hostContent, hostAbs); + expect(result).toContain("Body text after."); + }); + + it("preserves prose between a self-closing and a paired use of the same snippet", () => { + const snippetBody = "## Snippet Body\n\nReusable inlined snippet content."; + const hostContent = [ + "---", + "title: Host", + "---", + 'import Reused from "@/snippets/reused.mdx";', + "", + "", + "", + "PROSE-BETWEEN-USES should not be deleted.", + "", + "ignored inner", + "", + ].join("\n"); + + const hostAbs = buildDocsTree(tmp, "guide.mdx", hostContent, { + "reused.mdx": snippetBody, + }); + + const result = inlineSnippetImports(hostContent, hostAbs); + + // The between-text must survive (regression: paired regex used to swallow + // from the self-closing tag through the first closing tag, deleting it). + expect(result).toContain("PROSE-BETWEEN-USES should not be deleted."); + // Both uses must be replaced by the snippet body. 
+ expect(result).not.toContain(" { + const a = ['import B from "@/snippets/b.mdx";', "", "A-body", ""].join( + "\n", + ); + const b = ['import A from "@/snippets/a.mdx";', "", "B-body", ""].join( + "\n", + ); + const hostContent = ['import A from "@/snippets/a.mdx";', "", ""].join( + "\n", + ); + + const hostAbs = buildDocsTree(tmp, "guide.mdx", hostContent, { + "a.mdx": a, + "b.mdx": b, + }); + + // Should terminate (not infinite-loop) and include the bodies it can. + const result = inlineSnippetImports(hostContent, hostAbs); + expect(result).toContain("A-body"); + }); + + it("removes ALL occurrences of a duplicated identical import line", () => { + const snippetBody = "## Dup Snippet\n\nInlined duplicate snippet body."; + // The SAME import line appears twice (a real hazard when pages are + // assembled/merged), separated by prose so each is captured identically and + // collapses to a single decl. First-occurrence string replace removes only + // one copy, leaving the other as a dangling import in the stripped output; + // the global removal must strip every copy. + const importLine = 'import Dup from "@/snippets/dup.mdx";'; + const hostContent = [ + "---", + "title: Host", + "---", + importLine, + "", + "Some prose between the two imports.", + "", + importLine, + "", + "", + "", + ].join("\n"); + + const hostAbs = buildDocsTree(tmp, "guide.mdx", hostContent, { + "dup.mdx": snippetBody, + }); + + const result = inlineSnippetImports(hostContent, hostAbs); + + expect(result).toContain("Inlined duplicate snippet body."); + // NEITHER import line may survive — not even the duplicate copy. + expect(result).not.toContain('from "@/snippets/dup.mdx"'); + expect(result).not.toMatch(/^import Dup/m); + // The prose between the imports must be preserved. 
+ expect(result).toContain("Some prose between the two imports."); + }); +}); + +describe("chunkMarkdown with snippet inlining", () => { + let tmp: string; + + beforeEach(() => { + tmp = fs.mkdtempSync(path.join(os.tmpdir(), "pf-snippet-chunk-")); + }); + + afterEach(() => { + fs.rmSync(tmp, { recursive: true, force: true }); + }); + + it("produces chunk text containing the inlined snippet body", () => { + const snippetBody = + "## Overview\n\nCopilotKit V2 consolidates the frontend into a single package. Both hooks and UI components are now exported from one place."; + const hostContent = [ + "---", + "title: Migrate to V2", + "---", + 'import MigrateToV2 from "@/snippets/shared/troubleshooting/migrate-to-v2.mdx";', + "", + "", + "", + ].join("\n"); + + const hostAbs = buildDocsTree( + tmp, + "(root)/migration-guides/migrate-to-v2.mdx", + hostContent, + { "shared/troubleshooting/migrate-to-v2.mdx": snippetBody }, + ); + + const chunks = chunkMarkdown(hostContent, hostAbs, mkConfig()); + + // Before the fix this page indexes as empty (0 chunks); after, the snippet + // body must be present in the produced chunk text. + expect(chunks.length).toBeGreaterThanOrEqual(1); + const joined = chunks.map((c) => c.content).join("\n"); + expect(joined).toContain( + "CopilotKit V2 consolidates the frontend into a single package", + ); + }); +}); diff --git a/src/db/analytics.ts b/src/db/analytics.ts index 26560ad..2fcf0bd 100644 --- a/src/db/analytics.ts +++ b/src/db/analytics.ts @@ -12,6 +12,30 @@ import { getPool } from "./client.js"; */ export const REDACTED_QUERY_TEXT = ""; +/** + * Sentinel written to `query_log.query_text` by the knowledge-tool browse + * path (empty query → "return all FAQ entries above confidence"). It is a + * synthetic marker, NOT a real user search, so the empty-queries reader + * excludes it the same way it excludes {@link REDACTED_QUERY_TEXT} — an empty + * browse call should not surface as a literal `` row in the + * Empty-Result dashboard. 
The value MUST stay in sync with the literal logged + * in `src/mcp/tools/knowledge.ts`. + */ +export const BROWSE_QUERY_TEXT = ""; + +/** + * Sentinel `days` value meaning "all time" — no lower time bound at all. The + * dashboard's "All time" preset sends this (docs/analytics.html + * ALL_TIME_DAYS) and the server's `days` parser admits it (server.ts + * MAX_DAYS=100000 stays above it). When a windowed reader is asked for a + * window `>= ALL_TIME_DAYS`, {@link buildDateWindow} OMITS the lower-bound + * `created_at >=` clause entirely instead of clamping to + * {@link ROLLING_WINDOW_CAP_DAYS}, so the summary/aggregate cards genuinely + * span every row — including history older than the rolling cap. Exported so + * the server-side comment and tests share one source of truth. + */ +export const ALL_TIME_DAYS = 99999; + /** * Cap on the number of latency rows fetched for p95 computation. PGlite * doesn't support `percentile_cont`, so we pull latencies to JS and sort @@ -28,6 +52,71 @@ export const REDACTED_QUERY_TEXT = ""; */ export const P95_LATENCY_ROW_CAP = 100000; +/** + * Score threshold below which a non-empty result set is considered + * "low confidence". A query that returned rows but whose best match scored + * under this value is surfaced separately so operators can spot content gaps + * that look like hits but aren't actually relevant. Exported so tool handlers, + * readers, and tests share a single source of truth. + * + * Predicate (matches the brief): `result_count > 0 AND top_score < 0.5`. + * `top_score IS NULL` (e.g. browse/keyword rows that never compute a cosine + * score) is intentionally NOT low-confidence — absence of a score is not a + * low score. + */ +export const LOW_CONFIDENCE_SCORE_THRESHOLD = 0.5; + +/** + * Canonical request-origin tags persisted on `query_log.request_source`. + * Sourced from the `X-Pathfinder-Source` request header on the MCP init + * request. 
Anything outside this set (including a missing header) is coerced + * to {@link DEFAULT_REQUEST_SOURCE} at the edge so the column only ever holds + * a known value going forward. + */ +export const REQUEST_SOURCE_VALUES = ["user", "synthetic", "analysis"] as const; +export type RequestSource = (typeof REQUEST_SOURCE_VALUES)[number]; + +/** + * HTTP header that carries the request-origin tag on the MCP init request. + * Lower-cased because Node/Express normalize header names to lower case on + * `req.headers`. Exported so the server-side capture site and tests share one + * literal. + */ +export const REQUEST_SOURCE_HEADER = "x-pathfinder-source"; + +/** + * Default request source when the `X-Pathfinder-Source` header is absent or + * not one of {@link REQUEST_SOURCE_VALUES}. New traffic without an explicit + * tag is treated as a real user — the conservative choice that keeps the + * default KPIs (which exclude synthetic/analysis) honest. + */ +export const DEFAULT_REQUEST_SOURCE: RequestSource = "user"; + +/** + * Request-source values that count as "real users" for the default analytics + * KPIs. Includes NULL (historical rows predating the column) via the SQL in + * {@link buildRequestSourceClause}. Synthetic/analysis traffic is excluded by + * default and only included when the caller explicitly asks for an + * all-sources view (see {@link AnalyticsFilter.request_source}). + */ +export const REAL_USER_REQUEST_SOURCES: readonly RequestSource[] = ["user"]; + +/** + * Normalize an arbitrary `X-Pathfinder-Source` header value to a known + * {@link RequestSource}. Unknown/empty/missing values fall back to + * {@link DEFAULT_REQUEST_SOURCE}. Case-insensitive and whitespace-trimmed so + * `"Synthetic"` / `" analysis "` still tag correctly. 
+ */ +export function normalizeRequestSource( + value: string | null | undefined, +): RequestSource { + if (typeof value !== "string") return DEFAULT_REQUEST_SOURCE; + const v = value.trim().toLowerCase(); + return (REQUEST_SOURCE_VALUES as readonly string[]).includes(v) + ? (v as RequestSource) + : DEFAULT_REQUEST_SOURCE; +} + // --------------------------------------------------------------------------- // Types // --------------------------------------------------------------------------- @@ -40,6 +129,13 @@ export interface QueryLogEntry { latency_ms: number; source_name: string | null; session_id: string | null; + /** + * Request-origin tag (user|synthetic|analysis) from X-Pathfinder-Source. + * Optional on the entry so existing call sites that don't tag still compile; + * the writer coerces an absent/unknown value to {@link DEFAULT_REQUEST_SOURCE} + * so the persisted column is always a known value for new rows. + */ + request_source?: RequestSource | string | null; } export interface AnalyticsSummary { @@ -47,6 +143,22 @@ export interface AnalyticsSummary { total_queries_window: number; empty_result_count_window: number; empty_result_rate_window: number; + /** + * Count of "low confidence" queries in the window: rows that returned at + * least one result but whose top_score fell below + * {@link LOW_CONFIDENCE_SCORE_THRESHOLD}. Computed over the SAME population + * as the other windowed cards (backfilled + redacted rows excluded, default + * request-source filter applied) so it's directly comparable to + * total_queries_window. Rows with a NULL top_score are NOT counted — + * absence of a score is not a low score. + */ + low_confidence_count_window: number; + /** + * low_confidence_count_window / total_queries_window (0 when the window is + * empty). Surfaced alongside empty_result_rate_window so the dashboard can + * show "looks like a hit but isn't relevant" as its own signal. 
+ */ + low_confidence_rate_window: number; avg_latency_ms_window: number; p95_latency_ms_window: number; /** @@ -109,10 +221,28 @@ export interface AnalyticsFilter { * gardening/probe traffic does not inflate human/agent usage metrics. */ include_service_traffic?: boolean; + /** + * Request-origin filter for the analytics readers. + * + * - `undefined` (the default): restrict to REAL USER traffic — + * `request_source IN ('user') OR request_source IS NULL`. This is what + * makes the dashboard's KPIs default to real users while still counting + * historical rows (NULL) that predate the column. + * - `"all"`: no request-source restriction — every row regardless of origin. + * Use for the explicit "all sources" dashboard view. + * - a specific {@link RequestSource} (`"user"` | `"synthetic"` | `"analysis"`): + * restrict to exactly that origin. `"user"` here ALSO includes NULL rows + * (they're real users); `"synthetic"`/`"analysis"` match the literal value + * only. + */ + request_source?: RequestSource | "all"; /** * Optional inclusive date range. When both `from` and `to` are set the * underlying queries filter on `created_at >= from AND created_at <= to` - * instead of the default `NOW() - INTERVAL ' days'` window. + * instead of the default rolling window — a UTC-calendar-day-aligned + * `created_at >= (NOW() AT TIME ZONE 'UTC')::date - (LEAST(days, cap) - 1)` + * (see {@link buildDateWindow}), or no lower bound at all when + * `days >= ALL_TIME_DAYS`. * * Callers should ensure both are provided together. Endpoints reject * half-specified ranges, calendar-invalid dates (e.g. Feb 30), @@ -139,10 +269,16 @@ export async function logQuery( ): Promise { const pool = getPool(); const text = logQueryText ? entry.query_text : REDACTED_QUERY_TEXT; + // Coerce the request source to a known value at the write boundary so the + // column only ever holds user|synthetic|analysis going forward (an absent or + // unrecognized tag becomes DEFAULT_REQUEST_SOURCE = 'user'). 
Historical rows + // written before this column existed stay NULL and are read back as real + // users by the analytics layer. + const requestSource = normalizeRequestSource(entry.request_source); try { await pool.query( - `INSERT INTO query_log (tool_name, query_text, result_count, top_score, latency_ms, source_name, session_id) - VALUES ($1, $2, $3, $4, $5, $6, $7)`, + `INSERT INTO query_log (tool_name, query_text, result_count, top_score, latency_ms, source_name, session_id, request_source) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8)`, [ entry.tool_name, text, @@ -151,6 +287,7 @@ export async function logQuery( entry.latency_ms, entry.source_name, entry.session_id, + requestSource, ], ); } catch (err) { @@ -229,6 +366,53 @@ function whereAnd(baseClauses: string[], filterClauses: string[]): string { return all.length > 0 ? "WHERE " + all.join(" AND ") : ""; } +/** + * Build the request-source WHERE fragment + params for a reader. + * + * Semantics (see {@link AnalyticsFilter.request_source}): + * - `undefined` → default to real users: + * `(request_source IN ('user') OR request_source IS NULL)`. NULL is folded + * in so rows predating the column (which have no tag) still count as real + * user traffic — this is the back-compat guarantee. + * - `"all"` → no clause (every row, regardless of origin). + * - `"user"` → same as the default (real users incl. NULL). + * - `"synthetic"` | `"analysis"` → exact-match on the literal value + * (`request_source = $N`); NULL rows are NOT synthetic/analysis so they're + * excluded. + * + * Returns `{ clauses, params, nextIdx }` shaped like {@link buildFilterClauses} + * so callers can splice it into their base clauses + param list uniformly. 
+ */ +function buildRequestSourceClause( + filter: AnalyticsFilter, + startIdx: number, +): { clauses: string[]; params: unknown[]; nextIdx: number } { + const rs = filter.request_source; + + if (rs === "all") { + return { clauses: [], params: [], nextIdx: startIdx }; + } + + // Default (undefined) and explicit "user" both mean real users, which + // includes the untagged historical rows (request_source IS NULL). + if (rs === undefined || rs === "user") { + return { + clauses: [`(request_source = $${startIdx} OR request_source IS NULL)`], + params: ["user"], + nextIdx: startIdx + 1, + }; + } + + // Specific non-user origin: exact literal match. NULL rows are real users, + // not synthetic/analysis, so the bare equality (which is NULL-rejecting in + // SQL three-valued logic) correctly excludes them. + return { + clauses: [`request_source = $${startIdx}`], + params: [rs], + nextIdx: startIdx + 1, + }; +} + /** * Rolling-window cap, applied to every days-based windowed aggregate * (summary totals, latency, by-source, per-day, top/empty-queries, tool @@ -237,9 +421,13 @@ function whereAnd(baseClauses: string[], filterClauses: string[]): string { * with no UI benefit. Other aggregates share the cap so `days=1000` on a * rolling window produces consistent results across every card on the * dashboard. Explicit from/to ranges are user-chosen bounds and pass - * through uncapped. + * through uncapped (the per-day series width is still capped — see + * {@link buildPerDayWindow}). The all-time sentinel + * ({@link ALL_TIME_DAYS}) bypasses this cap for the summary/aggregate window + * entirely (no lower bound). Exported so the per-day cap and tests reference + * the same constant. 
*/ -const ROLLING_WINDOW_CAP_DAYS = 366; +export const ROLLING_WINDOW_CAP_DAYS = 366; /** * Build a date-window clause + params for the given filter, falling back to @@ -247,6 +435,10 @@ const ROLLING_WINDOW_CAP_DAYS = 366; * * - When `filter.from` and `filter.to` are both set, returns * `created_at >= $N AND created_at <= $N+1` with the two Date params. + * - When `days >= ALL_TIME_DAYS` (the "All time" sentinel) and no from/to is + * set, returns NO clause and binds NO params — the window has no lower + * bound, so the summary/aggregate cards span every row regardless of how + * far back history goes (the rolling cap does not apply to "all time"). * - Otherwise, returns a UTC-calendar-day-bounded rolling window: * `created_at >= (NOW() AT TIME ZONE 'UTC')::date - (LEAST($N, 366) - 1)` * with the `days` number param. Rolling mode semantics: "last N days" @@ -277,6 +469,14 @@ function buildDateWindow( nextIdx: startIdx + 2, }; } + // "All time": omit the lower bound entirely so the summary/aggregate cards + // cover every row, not just the last ROLLING_WINDOW_CAP_DAYS. No clause and + // no param are emitted, so the caller's other clauses (latency_ms >= 0, + // redacted filter, request-source) still apply unchanged and the `$` + // placeholder numbering carries on from startIdx untouched. + if (days >= ALL_TIME_DAYS) { + return { clauses: [], params: [], nextIdx: startIdx }; + } // Rolling: UTC-calendar-day-aligned. LEAST caps the span at // ROLLING_WINDOW_CAP_DAYS so `days=huge` can't produce an unbounded // subtraction. `-1` gives an inclusive N-day window ending today @@ -304,12 +504,20 @@ function buildDateWindow( * my summary card"; reusing buildDateWindow removes the whole class of bug. * * The SERIES is emitted separately as UTC-midnight calendar days so that the - * LEFT JOIN renders one bar per day even when no rows were logged. 
Rolling - * mode caps the series at ROLLING_WINDOW_CAP_DAYS (shared with - * buildDateWindow) so a huge `days` value can't bloat the payload. Range - * mode passes through uncapped (user explicitly chose bounds) and forces - * UTC on the series ::date cast so a non-UTC session TimeZone GUC can't - * shift the series one day earlier than intended. + * LEFT JOIN renders one bar per day even when no rows were logged. The series + * width is ALWAYS capped at ROLLING_WINDOW_CAP_DAYS so the payload can never + * exceed a year of daily bars: + * - Rolling mode caps via `LEAST($days, cap)` (shared with buildDateWindow), + * so sum-of-bars == total_queries_window exactly. + * - Range mode caps the UPPER bound at `from + (cap - 1)` days; the summary + * aggregate still spans the full user-chosen range, but the chart only + * renders the first `cap` days of it (a multi-thousand-day range would + * otherwise emit one JSON row per day — payload bloat / DoS). + * - All-time mode (days >= ALL_TIME_DAYS) emits a literal `cap`-day series + * ending today and binds NO placeholder (buildDateWindow returns no date + * clause for all-time, so the inner aggregate counts every row). + * All three force UTC on the series ::date cast so a non-UTC session TimeZone + * GUC can't shift the series one day earlier than intended. * * `startIdx` is the next available `$` placeholder index; `nextIdx` is the * next index to use after this helper's params. @@ -326,23 +534,46 @@ function buildPerDayWindow( } { // Inner WHERE is byte-identical to the summary window. For rolling mode // buildDateWindow binds $startIdx = days; for range mode it binds - // $startIdx = from and $startIdx+1 = to. The series below reuses those - // exact placeholders so the two sides can never drift apart. + // $startIdx = from and $startIdx+1 = to; for all-time it binds nothing. + // The series below reuses those exact placeholders so the two sides can + // never drift apart. 
const dw = buildDateWindow(filter, days, startIdx); const whereClause = dw.clauses.join(" AND "); if (filter.from && filter.to) { + // Range: lower bound is `($from AT TIME ZONE 'UTC')::date`; upper bound is + // capped at `lower + (cap - 1)` days via LEAST so the series width never + // exceeds ROLLING_WINDOW_CAP_DAYS even for an enormous range (the endpoint + // allows a from/to span up to MAX_DAYS, which would otherwise emit ~100k + // daily rows). The `AT TIME ZONE 'UTC'` forces the timestamptz to be + // interpreted in UTC before the ::date cast; without it, a non-UTC session + // TimeZone GUC (common on managed Postgres that inherit a regional + // default) coerces `'2026-04-15T00:00:00Z'::date` to the session-local day + // `2026-04-14`, shifting the series one day earlier than the caller + // intended. Combined with the UTC-normalized date_trunc in the inner + // aggregate (see getAnalyticsSummary), this keeps bars aligned regardless + // of TZ. The summary/aggregate WHERE (dw.clauses) still spans the FULL + // range — only the chart series is capped. + const fromDate = `($${startIdx}::timestamptz AT TIME ZONE 'UTC')::date`; + const toDate = `($${startIdx + 1}::timestamptz AT TIME ZONE 'UTC')::date`; + const cappedTo = `LEAST(${toDate}, ${fromDate} + (${ROLLING_WINDOW_CAP_DAYS} - 1))`; return { - // Range: `($from AT TIME ZONE 'UTC')::date .. ($to AT TIME ZONE - // 'UTC')::date`. The `AT TIME ZONE 'UTC'` forces the timestamptz to - // be interpreted in UTC before the ::date cast; without it, a - // non-UTC session TimeZone GUC (common on managed Postgres that - // inherit a regional default) coerces `'2026-04-15T00:00:00Z'::date` - // to the session-local day `2026-04-14`, shifting the series one - // day earlier than the caller intended. Combined with the UTC- - // normalized date_trunc in the inner aggregate (see - // getAnalyticsSummary), this keeps bars aligned regardless of TZ. 
- seriesExpr: `generate_series(($${startIdx}::timestamptz AT TIME ZONE 'UTC')::date, ($${startIdx + 1}::timestamptz AT TIME ZONE 'UTC')::date, '1 day'::interval)`, + seriesExpr: `generate_series(${fromDate}, ${cappedTo}, '1 day'::interval)`, + whereClause, + params: dw.params, + nextIdx: dw.nextIdx, + }; + } + + if (days >= ALL_TIME_DAYS) { + // All-time. buildDateWindow emitted no date clause (whereClause is ""), + // so the inner aggregate counts every row. The series is a literal + // cap-day window ending today — bounded so years of history don't bloat + // the chart — and binds no placeholder. Rows older than the cap are still + // counted by the summary cards (no lower bound there); they simply don't + // render as bars. + return { + seriesExpr: `generate_series((NOW() AT TIME ZONE 'UTC')::date - (${ROLLING_WINDOW_CAP_DAYS} - 1), (NOW() AT TIME ZONE 'UTC')::date, '1 day'::interval)`, whereClause, params: dw.params, nextIdx: dw.nextIdx, @@ -374,9 +605,15 @@ function buildPerDayWindow( * PGlite does NOT support percentile_cont(), so we fetch all latencies * and compute the percentile in JS. * - * Uses nearest-rank (index = floor(n * 0.95)), not linear interpolation — - * results may differ by one sample from Postgres' `percentile_cont(0.95)` - * query. + * Index = `floor(n * 0.95)`, clamped to the last element. This is NOT the + * standard nearest-rank method (`ceil(0.95 * n) - 1`): for small n the two + * differ, and `floor` skews toward the high end. Concretely, for n = 20 + * `floor(20 * 0.95) = 19`, which selects the MAXIMUM sample (index 19), + * whereas nearest-rank would pick index 18. We keep this behavior + * deliberately — the dashboard has been calibrated against it and the + * difference is at most one sample — and the boundary is pinned by a test so + * the comment and behavior cannot drift. Either way, results may differ by + * one sample from Postgres' `percentile_cont(0.95)` (which interpolates). 
*/ function computeP95(latencies: number[]): number { if (latencies.length === 0) return 0; @@ -437,21 +674,41 @@ export async function getAnalyticsSummary( // the divergence is deliberate. const { clauses: fc2, params: fp2, nextIdx: n2 } = buildFilterClauses(filter); const dw2 = buildDateWindow(filter, days, n2); - const redactedIdx2 = dw2.nextIdx; + const rs2 = buildRequestSourceClause(filter, dw2.nextIdx); + const redactedIdx2 = rs2.nextIdx; + const lowConfIdx2 = redactedIdx2 + 1; const summaryBase = [ ...dw2.clauses, + ...rs2.clauses, "latency_ms >= 0", `query_text != $${redactedIdx2}`, ]; const summaryWhere = whereAnd(summaryBase, fc2); + // low_confidence shares this subquery (and therefore the exact same + // population as total/empty/avg_latency) via a FILTER, so the + // low_confidence_rate denominator lines up with total_queries_window. The + // threshold is bound, not inlined, so LOW_CONFIDENCE_SCORE_THRESHOLD stays + // the single source of truth. `top_score IS NOT NULL` is part of the FILTER + // so NULL-score rows (browse/keyword) never count as low confidence. const summaryRes = await pool.query( `SELECT count(*)::int AS total, count(*) FILTER (WHERE result_count = 0)::int AS empty, + count(*) FILTER ( + WHERE result_count > 0 + AND top_score IS NOT NULL + AND top_score < $${lowConfIdx2} + )::int AS low_confidence, COALESCE(avg(latency_ms)::int, 0) AS avg_latency FROM query_log ${summaryWhere}`, - [...fp2, ...dw2.params, REDACTED_QUERY_TEXT], + [ + ...fp2, + ...dw2.params, + ...rs2.params, + REDACTED_QUERY_TEXT, + LOW_CONFIDENCE_SCORE_THRESHOLD, + ], ); // Latencies for p95 (exclude backfilled rows where latency_ms < 0, AND @@ -466,9 +723,11 @@ export async function getAnalyticsSummary( // reading is known to be sampled rather than exact. 
const { clauses: fc3, params: fp3, nextIdx: n3 } = buildFilterClauses(filter); const dw3 = buildDateWindow(filter, days, n3); - const redactedIdxLatency = dw3.nextIdx; + const rs3 = buildRequestSourceClause(filter, dw3.nextIdx); + const redactedIdxLatency = rs3.nextIdx; const latencyBase = [ ...dw3.clauses, + ...rs3.clauses, "latency_ms >= 0", `query_text != $${redactedIdxLatency}`, ]; @@ -481,7 +740,13 @@ export async function getAnalyticsSummary( // slice back to the cap for the actual p95 computation. const latencyRes = await pool.query( `SELECT latency_ms FROM query_log ${latencyWhere} ORDER BY random() LIMIT $${latencyLimitIdx}`, - [...fp3, ...dw3.params, REDACTED_QUERY_TEXT, P95_LATENCY_ROW_CAP + 1], + [ + ...fp3, + ...dw3.params, + ...rs3.params, + REDACTED_QUERY_TEXT, + P95_LATENCY_ROW_CAP + 1, + ], ); const p95Sampled = latencyRes.rows.length > P95_LATENCY_ROW_CAP; if (p95Sampled) { @@ -494,9 +759,11 @@ export async function getAnalyticsSummary( // doughnut totals line up with summary + per-day. const { clauses: fc4, params: fp4, nextIdx: n4 } = buildFilterClauses(filter); const dw4 = buildDateWindow(filter, days, n4); + const rs4 = buildRequestSourceClause(filter, dw4.nextIdx); const sourceBase = [ "source_name IS NOT NULL", ...dw4.clauses, + ...rs4.clauses, "latency_ms >= 0", ]; const sourceWhere = whereAnd(sourceBase, fc4); @@ -506,7 +773,7 @@ export async function getAnalyticsSummary( ${sourceWhere} GROUP BY source_name ORDER BY count DESC`, - [...fp4, ...dw4.params], + [...fp4, ...dw4.params, ...rs4.params], ); // Per day (filtered). LEFT JOIN against generate_series so every day in @@ -526,9 +793,14 @@ export async function getAnalyticsSummary( // zero-count days the gap-fill exists to surface. 
const { clauses: fc5, params: fp5, nextIdx: n5 } = buildFilterClauses(filter); const pdw = buildPerDayWindow(filter, days, n5); - const redactedIdx5 = pdw.nextIdx; + const rs5 = buildRequestSourceClause(filter, pdw.nextIdx); + const redactedIdx5 = rs5.nextIdx; + // pdw.whereClause is empty in all-time mode (buildDateWindow omits the date + // bound). Drop the empty fragment so whereAnd doesn't splice a dangling + // `AND` into the inner aggregate's WHERE. const dayBase = [ - pdw.whereClause, + ...(pdw.whereClause ? [pdw.whereClause] : []), + ...rs5.clauses, "latency_ms >= 0", `query_text != $${redactedIdx5}`, ]; @@ -553,7 +825,7 @@ export async function getAnalyticsSummary( GROUP BY day ) q ON q.day = d.day::date ORDER BY d.day`, - [...fp5, ...pdw.params, REDACTED_QUERY_TEXT], + [...fp5, ...pdw.params, ...rs5.params, REDACTED_QUERY_TEXT], ); // Earliest query day (UNFILTERED — the UI uses this to label windows @@ -573,10 +845,26 @@ export async function getAnalyticsSummary( `SELECT min(created_at AT TIME ZONE 'UTC')::date::text AS earliest_day FROM query_log`, ); - const totalQueries = totalRes.rows[0]?.count ?? 0; + // Coerce a DB numeric to a finite JS number, defaulting to 0. The summary + // columns are `::int`-cast in SQL, but PGlite and node-postgres disagree on + // whether integer/numeric columns deserialize as `number` or `string` + // (node-postgres returns `bigint`/`numeric` as strings). Trusting the driver + // typing risks a string leaking into total_queries_window or a NaN into the + // *_rate fields when the value is unexpectedly non-numeric. getTopQueries + // already guards its parseFloat results this way; mirror it here so the + // summary numerics stay consistent. Defensive given the `::int` casts — + // intentionally minimal. + const toFiniteNumber = (v: unknown): number => { + const n = Number(v); + return Number.isFinite(n) ? n : 0; + }; + + const totalQueries = toFiniteNumber(totalRes.rows[0]?.count); const s = summaryRes.rows[0] ?? 
{}; - const totalWindow = s.total ?? 0; - const emptyWindow = s.empty ?? 0; + const totalWindow = toFiniteNumber(s.total); + const emptyWindow = toFiniteNumber(s.empty); + const lowConfidenceWindow = toFiniteNumber(s.low_confidence); + const avgLatencyWindow = toFiniteNumber(s.avg_latency); // Normalize undefined (truly missing) to null so consumers get a // consistent shape regardless of whether the DB returned an empty // row, a row with NULL, or no row at all. @@ -584,10 +872,14 @@ export async function getAnalyticsSummary( (earliestRes.rows[0]?.earliest_day as string | null | undefined) ?? null; // Compute p95 in application code. Slice back to the cap so the extra - // "overflow probe" row fetched above doesn't skew the sample size. + // "overflow probe" row fetched above doesn't skew the sample size. Coerce + // each latency through toFiniteNumber: node-postgres deserializes a numeric + // column as a STRING, so trusting `as number` here would leave computeP95 + // sorting/returning strings (wrong/NaN p95). Mirrors the summary numerics + // above which already coerce defensively. const latencies = latencyRes.rows .slice(0, P95_LATENCY_ROW_CAP) - .map((r: Record) => r.latency_ms as number); + .map((r: Record) => toFiniteNumber(r.latency_ms)); const p95Latency = computeP95(latencies); return { @@ -595,7 +887,10 @@ export async function getAnalyticsSummary( total_queries_window: totalWindow, empty_result_count_window: emptyWindow, empty_result_rate_window: totalWindow > 0 ? emptyWindow / totalWindow : 0, - avg_latency_ms_window: s.avg_latency ?? 0, + low_confidence_count_window: lowConfidenceWindow, + low_confidence_rate_window: + totalWindow > 0 ? lowConfidenceWindow / totalWindow : 0, + avg_latency_ms_window: avgLatencyWindow, p95_latency_ms_window: p95Latency, // Only set when the cap was actually hit so existing consumers (tests, // older UI builds) can treat the absence of the flag as "exact". 
@@ -633,12 +928,14 @@ export async function getTopQueries( const { clauses: fc, params: fp, nextIdx } = buildFilterClauses(filter); const dw = buildDateWindow(filter, days, nextIdx); + const rs = buildRequestSourceClause(filter, dw.nextIdx); // Bind REDACTED_QUERY_TEXT rather than interpolating the literal so the // sentinel has a single source of truth (the module constant) and the // SQL stays shielded from the value. - const redactedIdx = dw.nextIdx; + const redactedIdx = rs.nextIdx; const baseClauses = [ ...dw.clauses, + ...rs.clauses, `query_text != $${redactedIdx}`, "latency_ms >= 0", ]; @@ -657,7 +954,7 @@ export async function getTopQueries( HAVING bool_or(result_count > 0) ORDER BY count DESC LIMIT $${redactedIdx + 1}`, - [...fp, ...dw.params, REDACTED_QUERY_TEXT, limit], + [...fp, ...dw.params, ...rs.params, REDACTED_QUERY_TEXT, limit], ); return rows.map((r: Record) => { @@ -684,6 +981,13 @@ export async function getTopQueries( * Get queries that returned zero results. Grouped by * (query_text, tool_name, source_name); results with the same query text * but different tool/source appear separately. + * + * Both synthetic sentinels are excluded: REDACTED_QUERY_TEXT (log_queries: + * false) and BROWSE_QUERY_TEXT (the knowledge-tool browse path, which logs + * "" for an empty-query "return all FAQ entries" call). A browse call + * that happens to return zero entries is not a real "user searched and found + * nothing" gap, so surfacing a literal `` row in the Empty-Result + * dashboard would be misleading noise. */ export async function getEmptyQueries( days: number = 7, @@ -694,13 +998,19 @@ export async function getEmptyQueries( const { clauses: fc, params: fp, nextIdx } = buildFilterClauses(filter); const dw = buildDateWindow(filter, days, nextIdx); - // Same rationale as getTopQueries: bind the REDACTED_QUERY_TEXT sentinel - // so the SQL literal isn't duplicated across reads. 
- const redactedIdx = dw.nextIdx; + const rs = buildRequestSourceClause(filter, dw.nextIdx); + // Same rationale as getTopQueries: bind the sentinels so the SQL literals + // aren't duplicated across reads. Both REDACTED_QUERY_TEXT and + // BROWSE_QUERY_TEXT are filtered out (see JSDoc). + const redactedIdx = rs.nextIdx; + const browseIdx = redactedIdx + 1; + const limitIdx = browseIdx + 1; const baseClauses = [ "result_count = 0", ...dw.clauses, + ...rs.clauses, `query_text != $${redactedIdx}`, + `query_text != $${browseIdx}`, "latency_ms >= 0", ]; const where = whereAnd(baseClauses, fc); @@ -716,8 +1026,15 @@ export async function getEmptyQueries( ${where} GROUP BY query_text, tool_name, source_name ORDER BY count DESC - LIMIT $${redactedIdx + 1}`, - [...fp, ...dw.params, REDACTED_QUERY_TEXT, limit], + LIMIT $${limitIdx}`, + [ + ...fp, + ...dw.params, + ...rs.params, + REDACTED_QUERY_TEXT, + BROWSE_QUERY_TEXT, + limit, + ], ); return rows.map((r: Record) => ({ @@ -764,9 +1081,13 @@ export async function getToolCounts( nextIdx, } = buildFilterClauses(sourceOnlyFilter); const dw = buildDateWindow(sourceOnlyFilter, days, nextIdx); + // request_source survives in `rest` (only tool_type is stripped above), so + // the tool-counts donut honors the same real-users-by-default behavior as + // every other windowed aggregate. + const rs = buildRequestSourceClause(sourceOnlyFilter, dw.nextIdx); // Exclude backfilled rows (latency_ms < 0) so tool counts match the // windowed aggregates used elsewhere (summary, latency, per-day). 
- const baseClauses = [...dw.clauses, "latency_ms >= 0"]; + const baseClauses = [...dw.clauses, ...rs.clauses, "latency_ms >= 0"]; const where = whereAnd(baseClauses, fc); const { rows } = await pool.query( `SELECT @@ -776,7 +1097,7 @@ export async function getToolCounts( ${where} GROUP BY tool_type ORDER BY count DESC`, - [...fp, ...dw.params], + [...fp, ...dw.params, ...rs.params], ); return rows.map((r: Record) => ({ diff --git a/src/db/queries.ts b/src/db/queries.ts index ae58ab2..8f3ce68 100644 --- a/src/db/queries.ts +++ b/src/db/queries.ts @@ -8,6 +8,23 @@ import type { IndexStatus, } from "../types.js"; +/** + * Coerce a DB-returned value to a finite JS number, defaulting to 0. Mirrors + * the `toFiniteNumber` discipline in analytics.ts (getAnalyticsSummary): + * node-postgres deserializes numeric/`count(*)::int` columns as STRINGS (and + * `Number()` of a non-numeric value such as "high" or undefined yields NaN), + * so trusting `as number` / a raw `Number()` risks a string or NaN leaking into + * similarity sort order, top_score, or the index-stats counts. The + * `Number.isFinite` guard maps any NaN (and ±Infinity) back to 0. (`Number()` + * also coerces "" and null to 0, which is the desired default here.) Replicated + * here rather than imported to avoid coupling queries.ts to an + * analytics-internal closure. + */ +function toFiniteNumber(v: unknown): number { + const n = Number(v); + return Number.isFinite(n) ? n : 0; +} + // --------------------------------------------------------------------------- // Search // --------------------------------------------------------------------------- @@ -73,7 +90,9 @@ export async function searchChunks( start_line: (r.start_line as number) ?? null, end_line: (r.end_line as number) ?? null, language: (r.language as string) ?? 
null, - similarity: parseFloat(r.similarity as string), + // Coerce to a finite number: a non-numeric similarity would Number() to + // NaN and corrupt the similarity sort order / top_score downstream. + similarity: toFiniteNumber(r.similarity), })); } @@ -146,7 +165,9 @@ export async function textSearchChunks( start_line: (r.start_line as number) ?? null, end_line: (r.end_line as number) ?? null, language: (r.language as string) ?? null, - similarity: parseFloat(r.similarity as string), + // Coerce to a finite number: a non-numeric similarity would Number() to + // NaN and corrupt the similarity sort order / top_score downstream. + similarity: toFiniteNumber(r.similarity), })); } @@ -159,8 +180,12 @@ export async function textSearchChunks( * This is faster than a single SQL query because each query uses its * respective index (HNSW for vector, GIN for tsvector). * - * min_score filtering is applied to vector candidates BEFORE merging, - * preserving the semantic quality floor per the spec. + * min_score gates ONLY the vector candidates, and does so BEFORE the RRF + * merge. It is a cosine-similarity floor, so it is meaningful only for the + * vector list; the keyword list has no comparable score. A hit that surfaces + * via keyword search but is NOT in the surviving vector set therefore enters + * the fused output UNGATED by min_score — min_score raises the semantic floor + * of the vector contribution, it does not filter keyword-only matches. */ export async function hybridSearchChunks( embedding: number[], @@ -179,7 +204,9 @@ export async function hybridSearchChunks( textSearchChunks(queryText, candidateLimit, sourceName, version), ]); - // Apply min_score filter to vector candidates before merging + // Apply min_score to the VECTOR candidates only, before merging. Keyword-only + // hits (present in keywordResults but not in the surviving vector set) are not + // score-gated here — they still enter the RRF merge below. 
const filteredVectorResults = minScore != null ? vectorResults.filter((r) => r.similarity >= minScore) @@ -252,19 +279,11 @@ export function rrfMerge( // --------------------------------------------------------------------------- /** - * Batch upsert chunks. Uses ON CONFLICT to update existing rows matched by - * (source_name, file_path, chunk_index). + * SQL for inserting a single chunk row, updating in place on the + * (source_name, file_path, chunk_index) conflict. Shared by upsertChunks and + * replaceChunksForFile so the column list and tsv derivation stay in lockstep. */ -export async function upsertChunks(chunks: Chunk[]): Promise { - if (chunks.length === 0) return; - - const pool = getPool(); - const client = await pool.connect(); - - try { - await client.query("BEGIN"); - - const sql = ` +const INSERT_CHUNK_SQL = ` INSERT INTO chunks (source_name, source_url, title, content, embedding, repo_url, file_path, start_line, end_line, language, chunk_index, @@ -288,28 +307,90 @@ export async function upsertChunks(chunks: Chunk[]): Promise { tsv = EXCLUDED.tsv `; +/** Positional params for INSERT_CHUNK_SQL, in column order. */ +function chunkInsertParams(chunk: Chunk): unknown[] { + return [ + chunk.source_name, + chunk.source_url ?? null, + chunk.title ?? null, + chunk.content, + pgvector.toSql(chunk.embedding), + chunk.repo_url, + chunk.file_path, + chunk.start_line ?? null, + chunk.end_line ?? null, + chunk.language ?? null, + chunk.chunk_index, + JSON.stringify(chunk.metadata ?? {}), + chunk.commit_sha ?? null, + chunk.version ?? null, + ]; +} + +/** + * Atomically replace all chunks for a (source_name, file_path) pair: delete the + * file's existing chunks and insert the new set on a SINGLE pooled client inside + * one BEGIN/COMMIT, rolling back on any error. + * + * This is the durable form of "delete old chunks, then upsert new ones". 
Running + * the DELETE and the INSERTs as separate awaits risks permanent data loss: if an + * INSERT throws after the DELETE has committed, the file is left with zero chunks + * (and the caller typically advances its index-state token, so the gap is never + * re-filled). Wrapping both in a transaction guarantees the pre-existing chunks + * survive intact when any insert fails. + * + * Passing an empty `chunks` array performs the delete only (used to drop a file + * that no longer produces any chunks). + */ +export async function replaceChunksForFile( + sourceName: string, + filePath: string, + chunks: Chunk[], +): Promise { + const pool = getPool(); + const client = await pool.connect(); + + try { + await client.query("BEGIN"); + await client.query( + "DELETE FROM chunks WHERE source_name = $1 AND file_path = $2", + [sourceName, filePath], + ); + for (const chunk of chunks) { + await client.query(INSERT_CHUNK_SQL, chunkInsertParams(chunk)); + } + await client.query("COMMIT"); + } catch (err) { + // Swallow a ROLLBACK rejection (e.g. dead connection) so it can't mask the + // ORIGINAL error — that error is the real cause and must reach the caller. + await client.query("ROLLBACK").catch(() => {}); + throw err; + } finally { + client.release(); + } +} + +/** + * Batch upsert chunks. Uses ON CONFLICT to update existing rows matched by + * (source_name, file_path, chunk_index). + */ +export async function upsertChunks(chunks: Chunk[]): Promise { + if (chunks.length === 0) return; + + const pool = getPool(); + const client = await pool.connect(); + + try { + await client.query("BEGIN"); + for (const chunk of chunks) { - await client.query(sql, [ - chunk.source_name, - chunk.source_url ?? null, - chunk.title ?? null, - chunk.content, - pgvector.toSql(chunk.embedding), - chunk.repo_url, - chunk.file_path, - chunk.start_line ?? null, - chunk.end_line ?? null, - chunk.language ?? null, - chunk.chunk_index, - JSON.stringify(chunk.metadata ?? {}), - chunk.commit_sha ?? 
null, - chunk.version ?? null, - ]); + await client.query(INSERT_CHUNK_SQL, chunkInsertParams(chunk)); } await client.query("COMMIT"); } catch (err) { - await client.query("ROLLBACK"); + // Swallow a ROLLBACK rejection so it can't mask the original error. + await client.query("ROLLBACK").catch(() => {}); throw err; } finally { client.release(); @@ -542,8 +623,16 @@ export async function getWebhookDeliveryStats(): Promise<{ const byDecision: Record = {}; let total = 0; for (const row of countsResult.rows) { - byDecision[row.decision as string] = row.count as number; - total += row.count as number; + // Coerce through toFiniteNumber: node-postgres deserializes count(*)::int as + // a STRING, and by_decision is declared Record and + // serialized into the /health endpoint, so storing the raw string emits + // {"accept":"5"} — a user-facing type violation. Mirrors the toFiniteNumber + // discipline used by every sibling count site (getIndexStats, analytics.ts). + byDecision[row.decision as string] = toFiniteNumber(row.count); + // The total likewise coerces before accumulating: a driver returning the + // count as a string would make `0 + "5"` evaluate to "05" (string concat), + // corrupting the total. + total += toFiniteNumber(row.count); } const lastRow = lastResult.rows[0]; @@ -638,7 +727,11 @@ export async function getAllChunksForLlms(): Promise< /** * Fetch FAQ chunks filtered by source name and minimum confidence. * Confidence is stored in chunk metadata JSONB; this query extracts and filters it. - * Results are ordered by source_name, then indexed_at DESC (most recent first). + * Results are ordered by indexed_at DESC, then id DESC — i.e. global recency + * across all queried sources. source_name deliberately does NOT lead the + * ordering: a leading source_name would let a global LIMIT be consumed entirely + * by the alphabetically-first source, starving more-recent rows from later + * sources. 
*/ export async function getFaqChunks( sourceNames: string[], @@ -653,6 +746,14 @@ export async function getFaqChunks( const placeholders = sourceNames.map((_, i) => `$${i + 1}`).join(", "); const confidenceParam = sourceNames.length + 1; + // Guard BOTH confidence casts with jsonb_typeof so a row whose `confidence` + // KEY exists but holds non-numeric text (e.g. "high") degrades to 0.0 instead + // of raising `invalid input syntax for type double precision` and crashing the + // whole browse listing. `metadata ? 'confidence'` only checks key presence — + // it does NOT guarantee the value is a number — so the raw `::float` cast in + // the projection AND the WHERE comparison could each crash on a single bad + // row. Mirrors the CASE guard in getFaqChunksByIds. A degraded 0.0 row is + // correctly excluded by any positive minConfidence threshold. let sql = ` SELECT id, @@ -667,12 +768,20 @@ export async function getFaqChunks( language, 0.0 AS similarity, metadata, - COALESCE((metadata->>'confidence')::float, 0.0) AS confidence + CASE + WHEN jsonb_typeof(metadata->'confidence') = 'number' + THEN (metadata->>'confidence')::float + ELSE 0.0 + END AS confidence FROM chunks WHERE source_name IN (${placeholders}) AND metadata ? 'confidence' - AND (metadata->>'confidence')::float >= $${confidenceParam} - ORDER BY source_name, indexed_at DESC + AND CASE + WHEN jsonb_typeof(metadata->'confidence') = 'number' + THEN (metadata->>'confidence')::float + ELSE 0.0 + END >= $${confidenceParam} + ORDER BY indexed_at DESC, id DESC `; const params: unknown[] = [...sourceNames, minConfidence]; @@ -694,9 +803,87 @@ export async function getFaqChunks( start_line: (r.start_line as number) ?? null, end_line: (r.end_line as number) ?? null, language: (r.language as string) ?? null, - similarity: parseFloat(r.similarity as string), + // Coerce to a finite number: a non-numeric similarity string would Number() + // to NaN and corrupt sort order / top_score. Same guard as confidence below. 
+ similarity: toFiniteNumber(r.similarity), + metadata: (r.metadata as Record) ?? {}, + // Coerce to a finite number for the same reason as similarity above: + // toFiniteNumber maps a non-numeric confidence string (Number(...)=NaN) back + // to 0, while null/'' already Number() to 0 — either way confidence stays a + // finite number so threshold comparisons / sort order are not corrupted. + confidence: toFiniteNumber(r.confidence), + })); +} + +/** + * Fetch FAQ metadata (including extracted confidence) for an EXACT set of chunk + * ids. Unlike getFaqChunks, this does NOT order by indexed_at or apply a top-N + * window — the caller has already ranked the ids (e.g. by vector similarity) and + * needs the FAQ confidence/metadata for precisely those rows. + * + * This exists because cross-referencing similarity hits against an + * indexed_at-DESC top-N window silently drops a relevant hit whose id falls + * outside that recency window. Looking up by id keeps every ranked hit. + * + * Returns rows in arbitrary order; the caller re-associates them by id and + * applies its own confidence threshold. Empty input → no query, empty result. + */ +export async function getFaqChunksByIds( + ids: number[], +): Promise { + if (ids.length === 0) return []; + + const pool = getPool(); + // Guard the confidence cast: getFaqChunksByIds looks up an EXACT id set with + // no `metadata ? 'confidence'` WHERE filter (unlike getFaqChunks), so a + // single row whose confidence is non-numeric text (e.g. "high") would raise + // `invalid input syntax for type double precision` and reject the WHOLE + // knowledge lookup. The jsonb_typeof check casts only genuine JSON numbers + // and degrades any malformed/missing value to 0.0 so one bad row can't crash + // the search. 
+ const sql = ` + SELECT + id, + source_name, + source_url, + title, + content, + repo_url, + file_path, + start_line, + end_line, + language, + 0.0 AS similarity, + metadata, + CASE + WHEN jsonb_typeof(metadata->'confidence') = 'number' + THEN (metadata->>'confidence')::float + ELSE 0.0 + END AS confidence + FROM chunks + WHERE id = ANY($1) + `; + + const { rows } = await pool.query(sql, [ids]); + return rows.map((r: Record) => ({ + id: r.id as number, + source_name: r.source_name as string, + source_url: (r.source_url as string) ?? null, + title: (r.title as string) ?? null, + content: r.content as string, + repo_url: (r.repo_url as string) ?? null, + file_path: r.file_path as string, + start_line: (r.start_line as number) ?? null, + end_line: (r.end_line as number) ?? null, + language: (r.language as string) ?? null, + // Coerce to a finite number: a non-numeric similarity would Number() to NaN + // and corrupt sort order / top_score. Same guard as confidence below. + similarity: toFiniteNumber(r.similarity), metadata: (r.metadata as Record) ?? {}, - confidence: parseFloat(r.confidence as string) || 0.0, + // Coerce to a finite number for the same reason as similarity above: a + // null/non-numeric confidence would otherwise yield NaN and corrupt + // threshold comparisons / sort order. + confidence: toFiniteNumber(r.confidence), })); } @@ -719,13 +906,18 @@ export async function getIndexStats(): Promise { ), ]); + // Coerce the counts through toFiniteNumber: although each is `count(*)::int` + // in SQL, node-postgres deserializes integer/numeric columns as STRINGS, so + // `as number` / `?? 0` would let a string leak into totalChunks/indexedRepos + // (and `?? 0` only catches null/undefined, not a "0" string). Mirrors the + // same discipline in getAnalyticsSummary. return { - totalChunks: totalCount.rows[0]?.count ?? 
0, + totalChunks: toFiniteNumber(totalCount.rows[0]?.count), bySource: bySource.rows.map((r: Record) => ({ source_name: r.source_name as string, - count: r.count as number, + count: toFiniteNumber(r.count), })), - indexedRepos: repoCount.rows[0]?.count ?? 0, + indexedRepos: toFiniteNumber(repoCount.rows[0]?.count), indexStates: states.rows.map((r: Record) => ({ source_type: r.source_type as string, source_key: r.source_key as string, diff --git a/src/db/schema.ts b/src/db/schema.ts index 2127232..8442507 100644 --- a/src/db/schema.ts +++ b/src/db/schema.ts @@ -65,13 +65,19 @@ DROP TABLE IF EXISTS code_chunks CASCADE; } /** - * Generate post-schema migration SQL for columns added after initial release. + * Generate post-schema migration SQL for objects added after initial release. * Safe to run repeatedly — uses IF NOT EXISTS / ADD COLUMN IF NOT EXISTS. * - * Includes tsvector support for hybrid search (v1.8.0): - * - Core DDL (column + populate + GIN index) works on both PostgreSQL and PGlite - * - Trigger DDL is appended but applied separately via try-catch in initializeSchema - * because PGlite does not support PL/pgSQL triggers + * Returns ONLY core DDL that works on both PostgreSQL and PGlite: + * - tsvector support for hybrid search (v1.8.0): the `tsv` column, a one-time + * populate of existing rows, and the GIN index. + * - The analytics `query_log` table (+ its indexes and the idempotent + * `request_source` ADD COLUMN for back-compat). + * - The `webhook_deliveries` table (+ its indexes). + * + * The tsvector TRIGGER is NOT included here — it is returned separately by + * {@link generateTsvTriggerDdl} and applied in its own try-catch by + * initializeSchema, because PGlite does not support PL/pgSQL triggers. 
*/ export function generatePostSchemaMigration(): string { const coreSql = ` @@ -88,6 +94,12 @@ UPDATE chunks SET tsv = to_tsvector('english', content) WHERE tsv IS NULL; CREATE INDEX IF NOT EXISTS idx_chunks_tsv ON chunks USING GIN (tsv); -- Analytics: query_log table for tracking tool usage +-- +-- request_source tags the ORIGIN of the request (user|synthetic|analysis), +-- derived from the X-Pathfinder-Source header on the MCP init request. It is +-- distinct from source_name, which is the DATA source the tool queried (e.g. +-- "docs"). Nullable so historical rows written before this column existed read +-- back as NULL; analytics treats NULL as a real user (see db/analytics.ts). CREATE TABLE IF NOT EXISTS query_log ( id SERIAL PRIMARY KEY, tool_name TEXT NOT NULL, @@ -97,12 +109,19 @@ CREATE TABLE IF NOT EXISTS query_log ( latency_ms INTEGER NOT NULL, source_name TEXT, session_id TEXT, + request_source TEXT, created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() ); CREATE INDEX IF NOT EXISTS idx_query_log_created_at ON query_log (created_at); CREATE INDEX IF NOT EXISTS idx_query_log_tool_name ON query_log (tool_name); +-- request_source added after query_log shipped — ADD COLUMN IF NOT EXISTS keeps +-- the migration idempotent and back-compatible for installs whose query_log +-- predates the column. The CREATE TABLE above carries it for fresh installs. 
+ALTER TABLE query_log ADD COLUMN IF NOT EXISTS request_source TEXT; +CREATE INDEX IF NOT EXISTS idx_query_log_request_source ON query_log (request_source); + -- Webhook delivery tracking CREATE TABLE IF NOT EXISTS webhook_deliveries ( id SERIAL PRIMARY KEY, diff --git a/src/indexing/chunking/index.ts b/src/indexing/chunking/index.ts index ec56801..4615208 100644 --- a/src/indexing/chunking/index.ts +++ b/src/indexing/chunking/index.ts @@ -6,6 +6,7 @@ type ChunkerFn = ( content: string, filePath: string, config: SourceConfig, + absoluteFilePath?: string, ) => ChunkOutput[]; const registry = new Map(); diff --git a/src/indexing/chunking/markdown.ts b/src/indexing/chunking/markdown.ts index 64a75be..7598ed2 100644 --- a/src/indexing/chunking/markdown.ts +++ b/src/indexing/chunking/markdown.ts @@ -1,17 +1,140 @@ // Recursive markdown/MDX splitter import { type ChunkOutput, type SourceConfig } from "../../types.js"; - -export interface MarkdownChunk { - content: string; - title: string; - headingPath: string[]; - chunkIndex: number; -} +import { inlineSnippetImports } from "./snippets.js"; const DEFAULT_TARGET_TOKENS = 600; const DEFAULT_OVERLAP_TOKENS = 50; +// ── Shared CommonMark predicates ─────────────────────────────────────────── +// +// The chunker has several detection paths — extractFirstHeading, +// getHeadingPathAtPosition, splitOnHeading, segmentCodeBlocks — that historically +// each carried its OWN regex and disagreed on CommonMark edge cases (indent, +// separator, closing-`#` sequences, fence length/indent). The constants and +// helpers below are the SINGLE source of truth they all build on, so they cannot +// drift apart. + +// ATX heading indent: 0–3 leading SPACES only. A leading tab counts as 4 columns +// in CommonMark, so a tab-indented `#` line is an indented code line, NOT a +// heading. (This is why the fragment is ` {0,3}`, never `[ \t]{0,3}`.) 
+const HEADING_INDENT = " {0,3}"; +// Separator between the `#`-run and the heading text: one-or-more space OR tab +// (CommonMark). Using `[ \t]+` (not `\s+`) keeps the separator on the same line: +// a bare `#`/`##` line with no inline text must NOT skip the newline and adopt +// the following line as its text. +const HEADING_SEP = "[ \\t]+"; + +// Single-line form of the shared heading predicate (no `g`/`m`; the input is one +// line with no embedded newline). Capture group 1 is the `#`-run, group 2 is the +// raw heading text (still carrying any trailing closing-`#` sequence / `{#anchor}`, +// which stripHeadingText removes). `.*` is safe because a single line has no `\n`. +const HEADING_LINE_RE = new RegExp( + `^${HEADING_INDENT}(#{1,6})${HEADING_SEP}(\\S.*)$`, +); + +/** + * The ONE shared heading predicate. Given a SINGLE line (no embedded newline), + * return its CommonMark ATX heading level + normalized text, or null. All + * text-extracting callers (extractFirstHeading, getHeadingPathAtPosition's + * leading-line check, and its line-by-line scan) funnel through this, so they + * cannot disagree on indent / separator / closing-`#` / `{#anchor}`. splitOnHeading + * does not extract text, so it shares the same HEADING_INDENT + HEADING_SEP + * fragments via a level-specific boundary lookahead (see splitOnHeading). + */ +function matchHeadingLine( + line: string, +): { level: number; text: string } | null { + const m = HEADING_LINE_RE.exec(line); + if (!m) return null; + const level = m[1].length; + const text = stripHeadingText(m[2]); + if (!text) return null; + return { level, text }; +} + +/** + * Return the heading the slice OPENS with (its first line), or null. Used to + * extend the heading-path scan to include a chunk's own leading heading without + * matching a heading that appears later in the slice. 
+ */ +function matchLeadingHeading( + slice: string, +): { level: number; text: string } | null { + const nl = slice.indexOf("\n"); + const firstLine = nl === -1 ? slice : slice.slice(0, nl); + return matchHeadingLine(firstLine); +} + +/** + * Normalize captured ATX heading text to its CommonMark display form: + * - strip an optional trailing closing `#`-sequence (`## Heading ##` → "Heading"; + * `# Title #` → "Title"). The closing run must be preceded by whitespace (or be + * the whole text), so `foo###` keeps its hashes per CommonMark. + * - strip a trailing docs anchor `{#some-id}` (Docusaurus/Nextra), so the + * embedded title/headingPath does not carry anchor noise. + * + * title and headingPath are embedded into the retrieval vector, so all three + * heading detectors route their captured text through this one function and thus + * agree on the final text. + * + * ORDER IS LOAD-BEARING: the closing-`#` sequence is stripped FIRST, then the + * `{#anchor}`. On `## X {#a} ##` this yields "X"; reversing the two passes + * (anchor-first) would leave "X {#a}". The test oracle's normalizeHeadingText + * MUST mirror this exact order or the two disagree and the structural-invariant + * soundness check falsely fails. + */ +function stripHeadingText(raw: string): string { + let text = raw.trim(); + // Trailing closing `#`-sequence (preceded by whitespace, or the whole text). + text = text.replace(/(^|[ \t])#+[ \t]*$/, "$1").trimEnd(); + // Trailing docs anchor `{#anchor}` (optionally followed by spaces/tabs). + text = text.replace(/[ \t]*\{#[^}]*\}[ \t]*$/, "").trimEnd(); + return text; +} + +interface FenceMarker { + char: string; // "`" or "~" + len: number; // run length of the opening fence +} + +/** + * Shared fence-open predicate. An OPENING code fence is 0–3 leading spaces, then + * a run of ≥3 backticks or ≥3 tildes (CommonMark allows the same 0–3-space indent + * as a heading; column 0 is NOT required). Returns the fence char + run length, + * or null. 
+ * + * CommonMark §4.5: a BACKTICK fence's info string may NOT contain a backtick + * (otherwise the line is an inline code span, e.g. `` ```js``` inline ``, not a + * fence opener); a TILDE fence's info string may contain backticks but not + * tildes. Rejecting such lines here is what stops a col-0 inline triple-backtick + * span from opening a phantom fence that runs to EOF and masks every following + * heading as "code". + */ +function matchFenceOpen(line: string): FenceMarker | null { + const m = line.match(/^ {0,3}(`{3,}|~{3,})([^\n]*)$/); + if (!m) return null; + const char = m[1][0]; + // The info string (rest of the line after the opening run) must not contain + // the fence character: backticks are forbidden in a backtick fence's info + // string, tildes in a tilde fence's. A real opener's info string is just a + // language tag (no fence char), so this never rejects a genuine fence. + if (m[2].includes(char)) return null; + return { char, len: m[1].length }; +} + +/** + * Shared fence-close predicate. A CLOSING fence uses the SAME fence char, 0–3 + * leading spaces, a run of length GREATER-OR-EQUAL to the opener (CommonMark + * permits a longer closing fence), and only trailing spaces/tabs after the run + * (a closing fence carries no info string). + */ +function isFenceClose(line: string, open: FenceMarker): boolean { + const fenceChar = open.char === "`" ? "\\`" : "~"; + const re = new RegExp(`^ {0,3}(${fenceChar}{${open.len},})[ \\t]*$`); + return re.test(line); +} + /** * Parse YAML frontmatter from markdown content. * Returns the title (if found) and the content with frontmatter stripped. @@ -26,50 +149,550 @@ function parseFrontmatter(content: string): { const frontmatter = match[1]; const body = content.slice(match[0].length); - const titleMatch = frontmatter.match(/^title:\s*["']?(.+?)["']?\s*$/m); + // Capture the raw title value (everything after `title:` on its line), then + // strip ONLY a BALANCED pair of surrounding quotes. 
The prior regex used two + // INDEPENDENT `["']?` around a `(.+?)` that required ≥1 char, so `title: ""` + // captured a stray `"` (and `title: ''` a stray `'`), which is truthy and + // defeats the `fmTitle || extractFirstHeading || filename` fallback — embedding + // a lone quote as the title. The `(["'])([\s\S]*)\1` backreference strips a + // quote pair only when BOTH ends match; the GREEDY `[\s\S]*` makes the + // backreference bind to the LAST same-quote char, so only the OUTERMOST + // balanced pair is removed (e.g. `"a "b" c"` → `a "b" c`). An unbalanced or + // absent quote leaves the value literal (so `5'6" tall` keeps its internal + // quotes). An empty or whitespace-only result is treated as ABSENT (null) so + // the heading/filename fallback applies. + const titleMatch = frontmatter.match(/^title:[ \t]*(.*?)[ \t]*$/m); + let title: string | null = null; + if (titleMatch) { + const raw = titleMatch[1]; + const quoted = raw.match(/^(["'])([\s\S]*)\1$/); + const value = (quoted ? quoted[2] : raw).trim(); + title = value === "" ? null : value; + } return { - title: titleMatch ? titleMatch[1].trim() : null, + title, body, }; } /** * Extract the first heading from content to use as fallback title. + * + * Fenced code is masked first (via segmentCodeBlocks): a `#`-prefixed line inside + * a ``` or ~~~ fence is example/documentation text, not a heading, so a doc that + * OPENS with a fenced block whose first line is `# something` must not adopt that + * line as its title. The heading match uses the SHARED predicate (matchHeadingLine + * → stripHeadingText), so the indent rule (0–3 spaces, never a tab), the + * space-or-tab separator, and the closing-`#`/`{#anchor}` stripping are identical + * to getHeadingPathAtPosition. The first heading found scanning non-code segments + * in order wins. */ function extractFirstHeading(content: string): string | null { - const match = content.match(/^#{1,6}\s+(.+)$/m); - return match ? 
match[1].trim() : null; + for (const segment of segmentCodeBlocks(content)) { + if (segment.isCodeBlock) continue; + for (const line of segment.text.split("\n")) { + const heading = matchHeadingLine(line); + if (heading) return heading.text; + } + } + return null; } /** * Strip MDX-specific syntax: import statements and JSX component tags. * Preserves text content inside JSX tags. + * + * The strip passes are FENCE- and CODE-SPAN-AWARE: they run ONLY over non-code + * segments (fenced ``` / ~~~ blocks pass through VERBATIM via segmentCodeBlocks), + * and within a non-code segment the inline code spans (single/multi backtick + * `` `...` ``) are masked so JSX/import-looking text inside them survives too. + * Fenced code (e.g. a ```tsx block with `import {X} from 'y'` and ``) + * is the highest-value retrieval content Pathfinder serves, so it must never be + * gutted by the MDX strip. */ function stripMdx(content: string): string { - // Strip import statements (single and multi-line) - let result = content.replace( - /^import\s+[\s\S]*?from\s+['"][^'"]+['"];?\s*$/gm, - "", - ); + const out = segmentCodeBlocks(content) + .map((segment) => + segment.isCodeBlock ? segment.text : stripNonCodeMdx(segment.text), + ) + .join(""); + return trimBlankEdges(out); +} + +/** + * Trim leading/trailing BLANK LINES (and trailing whitespace) while PRESERVING + * the first content line's intra-line leading indentation. + * + * A plain `.trim()` here strips the leading whitespace of the document's FIRST + * content line before the spaces-only ATX-heading detectors run, so a doc whose + * first non-blank line is `\t# X` or ` # X` (4+ spaces / tab = CommonMark + * INDENTED CODE, NOT a heading) would have its indent removed and be wrongly + * promoted to a heading/title — violating the file's `HEADING_INDENT = " {0,3}"` + * invariant. 
Removing only WHOLE leading blank lines (`^(?:[ \t]*\n)+`) keeps + * the prior "leading blank lines are ignored" behavior while letting the 0–3 + * vs 4+ space rule govern the first line too; trailing whitespace is stripped as + * before, so the verbatim-substring binding downstream is unaffected. + */ +function trimBlankEdges(text: string): string { + // Remove leading lines that are entirely blank (spaces/tabs then a newline), + // but NOT the intra-line indentation of the first line that has content. + const noLeadingBlankLines = text.replace(/^(?:[ \t]*\n)+/, ""); + // Strip trailing whitespace (incl. trailing newlines), matching .trim()'s end. + return noLeadingBlankLines.replace(/\s+$/, ""); +} - // Strip self-closing JSX tags: - result = result.replace(/<[A-Z][A-Za-z0-9]*(?:\s+[^>]*)?\s*\/>/g, ""); +/** + * Trim a produced chunk's edges for the SERVED/indexed content. Like `.trim()` + * (the prior behavior) it removes leading blank lines + leading whitespace and + * all trailing whitespace, EXCEPT it preserves a 4+-space (CommonMark + * INDENTED-CODE) indent on the first content line. + * + * A plain `.trim()` strips the leading indent of a chunk whose first content line + * is 4-space-indented — e.g. a doc that OPENS with ` ```lang` (CommonMark + * indented code, NOT a fence) — turning it into a COLUMN-0 ` ```lang ` whose + * still-indented closing ` ``` ` no longer closes it, leaving a HALF-OPEN + * fence in the served chunk text. A 0–3-space (cosmetic) leading indent is still + * stripped, so an indented HEADING like ` ## Section` continues to be served at + * column 0 (preserving the existing stored-content contract); only a 4+-space + * (semantic, indented-code) leading run is kept so the 0–3 vs 4+ space rule keeps + * governing the served content. Trailing whitespace is always stripped, matching + * `.trim()`'s end. + */ +function trimChunkEdges(text: string): string { + // Drop whole leading blank lines first (spaces/tabs then newline). 
+ const noLeadingBlankLines = text.replace(/^(?:[ \t]*\n)+/, ""); + // The first content line is CommonMark INDENTED CODE when it starts with 4+ + // spaces OR a tab (a tab counts as 4 columns). Preserve that semantic indent; + // otherwise strip leading whitespace as .trim() did (so a 0–3-space cosmetic + // indent on a heading is still served at column 0). + const isIndentedCode = /^(?: {4}|\t)/.test(noLeadingBlankLines); + const body = isIndentedCode + ? noLeadingBlankLines + : noLeadingBlankLines.replace(/^\s+/, ""); + return body.replace(/\s+$/, ""); +} + +interface JsxTag { + kind: "self" | "open" | "close"; + name: string; + // [start, end) byte range of the whole tag in the source string. + start: number; + end: number; +} - // Strip JSX component open/close tags but keep inner content - // Handles nested tags by repeatedly stripping innermost pairs - let prev = ""; - while (prev !== result) { - prev = result; - result = result.replace( - /<([A-Z][A-Za-z0-9]*)(?:\s+[^>]*)?>([^]*?)<\/\1>/g, - "$2", - ); +/** + * Try to parse a JSX tag whose `<` is at `text[start]`. Returns the parsed tag + * (self-closing, opening, or closing) and its end offset, or null when the run + * starting at `<` is not a well-formed JSX component tag (so the caller emits + * the `<` as literal text). + * + * This is the LINEAR-time replacement for the two backtracking strip regexes. 
+ * It walks the tag exactly once: a component name (`[A-Za-z_$][A-Za-z0-9_$.]*`, + * covering member expressions like `Tabs.Tab` / `motion.div` and `_`/`$`), then + * an attribute region whose `>`/`/>` terminator is found by tracking nesting + * state instead of an ambiguous alternation: + * - inside a `"…"` or `'…'` string, a `>` (or `/`) is literal — this is what + * lets `` keep the quoted `>`; + * - inside a `{…}` JSX expression (brace-depth > 0, with nested braces and + * quoted strings honored), a `>` is literal — this strips ` c}/>`, + * which the old `[^>]*` truncated at the inner `>` and left behind; + * - at the top level, the FIRST `>` ends the tag, and a `/` immediately before + * it marks a self-closing tag. + * Because every character is consumed at most once with O(1) state, total work + * is linear in the tag length regardless of attribute count — no backtracking. + */ +function parseJsxTagAt(text: string, start: number): JsxTag | null { + const n = text.length; + let i = start + 1; // past "<" + const isClose = text[i] === "/"; + if (isClose) i++; + + // Component name. JSX components start uppercase or `_`/`$`, but the strip has + // always matched a leading lowercase too via the class below; keep that class + // so member expressions and the existing accepted names parse identically. + if (i >= n || !/[A-Za-z_$]/.test(text[i])) return null; + const nameStart = i; + i++; + while (i < n && /[A-Za-z0-9_$.]/.test(text[i])) i++; + const name = text.slice(nameStart, i); + + if (isClose) { + // `` — only whitespace allowed before the closing `>`. + while (i < n && /\s/.test(text[i])) i++; + if (i < n && text[i] === ">") { + return { kind: "close", name, start, end: i + 1 }; + } + return null; + } + + // Opening or self-closing tag: scan the attribute region for the terminating + // `>` / `/>`, tracking quote and `{…}` expression nesting so an inner `>` is + // not mistaken for the tag end. 
+ let braceDepth = 0; + let quote: '"' | "'" | "`" | null = null; + while (i < n) { + const ch = text[i]; + if (quote) { + if (ch === quote) quote = null; + i++; + continue; + } + if (ch === '"' || ch === "'" || ch === "`") { + quote = ch; + i++; + continue; + } + if (ch === "{") { + braceDepth++; + i++; + continue; + } + if (ch === "}") { + if (braceDepth > 0) braceDepth--; + i++; + continue; + } + if (braceDepth > 0) { + // Inside a JSX expression: `>` and `/` are literal. + i++; + continue; + } + if (ch === ">") { + const selfClosing = i > start && text[i - 1] === "/"; + return { kind: selfClosing ? "self" : "open", name, start, end: i + 1 }; + } + i++; + } + // Reached EOF without a terminating `>`: not a complete tag. + return null; +} + +/** + * Strip JSX component tags from a (heading- and inline-code-masked) text in a + * SINGLE linear pass: self-closing `` tags are removed entirely, and a + * matched `inner` pair is reduced to `inner` (innermost-out, so + * nested pairs are all unwrapped). An OPENING tag with no matching same-name + * closer, or a CLOSING tag with no opener, is left VERBATIM — preserving the old + * regexes' behavior (the paired regex required a `\1` close; the self-closing + * regex required `/>`), so a lone `` with no `` is not eaten. + * + * Linear time: parseJsxTagAt consumes each tag's characters once with O(1) + * state, and the matching uses a stack of open-tag positions, so there is no + * catastrophic backtracking on any attribute count (the bug the old + * `(?:"[^"]*"|'[^']*'|[^>])*` alternation introduced). + */ +function stripJsxTags(text: string): string { + const n = text.length; + // First, locate every well-formed tag (linear scan). Characters NOT part of a + // tag are literal text. A `<` that does not parse as a tag is literal too. 
+ const tags: JsxTag[] = []; + let i = 0; + while (i < n) { + if (text[i] === "<") { + const tag = parseJsxTagAt(text, i); + if (tag) { + tags.push(tag); + i = tag.end; + continue; + } + } + i++; } + if (tags.length === 0) return text; - // Clean up excessive blank lines left by stripping + // Decide which tag ranges to DROP (their markup is removed; any text between a + // matched open/close pair is kept). Self-closing tags always drop. For paired + // tags, match each close to the NEAREST preceding unmatched open of the same + // name (mirrors the lazy `…?` regex), via a stack. + const drop = new Set(); // indices into `tags` whose markup is removed + const openStack: number[] = []; // indices of currently-open `open` tags + for (let t = 0; t < tags.length; t++) { + const tag = tags[t]; + if (tag.kind === "self") { + drop.add(t); + } else if (tag.kind === "open") { + openStack.push(t); + } else { + // close: find the nearest open of the same name on the stack. + let k = openStack.length - 1; + while (k >= 0 && tags[openStack[k]].name !== tag.name) k--; + if (k >= 0) { + drop.add(openStack[k]); // matched open + drop.add(t); // this close + openStack.length = k; // unmatched opens above it stay (verbatim) + } + // An unmatched close stays verbatim (not added to `drop`). + } + } + + // Rebuild: emit literal text, skip dropped tag ranges, and emit the verbatim + // source of any tag we did not drop (unmatched open/close). + let out = ""; + let cursor = 0; + for (let t = 0; t < tags.length; t++) { + const tag = tags[t]; + out += text.slice(cursor, tag.start); // literal text before this tag + if (!drop.has(t)) { + out += text.slice(tag.start, tag.end); // keep an unmatched tag verbatim + } + cursor = tag.end; + } + out += text.slice(cursor); + return out; +} + +/** + * Apply the MDX strip passes to a single NON-code segment. 
ATX heading lines and + * inline code spans are masked with placeholders first so their (possibly + * JSX/import-looking) content is preserved verbatim, then restored after the + * passes. + * + * HEADING-AWARENESS (load-bearing): the JSX/import strip regexes are global and + * NOT line-anchored, so without protection a `` / `..` + * that NAMES a component INSIDE an ATX heading (`## The Provider`, + * `## `) would be stripped — corrupting the heading in title, headingPath + * AND served content (all derive from the post-strip body). CopilotKit docs + * routinely name components in headings, so heading lines are masked (like inline + * code and fenced code) and pass through VERBATIM. Heading lines are masked FIRST + * (on the original text) so any inline code inside a heading is hidden as part of + * the opaque heading unit; the remaining prose is then inline-masked and stripped + * normally, so JSX on PROSE lines is still removed. + */ +function stripNonCodeMdx(text: string): string { + const { masked: headingMasked, restore: restoreHeadings } = + maskHeadingLines(text); + const { masked, restore } = maskInlineCode(headingMasked); + let result = masked; + + // Strip side-effect imports first: `import "./x.css";` (no `from`). Handled + // separately from `from`-imports so the lazy body of a `from`-import can never + // jump across a side-effect import. The inter-token whitespace is `[ \t]+` + // (NOT `\s+`): `\s` matches a newline, so the old `\s+` let `import\n\n"./x"` + // match as ONE statement across a blank line and deleted the whole span + // (destroying any prose masked between). `[ \t]+` keeps the statement on a + // single logical line, so a newline (let alone a blank line) can never be + // spanned — mirroring the `from`-import fix below. 
+ result = result.replace(/^import[ \t]+['"][^'"]+['"];?[ \t]*$/gm, ""); + + // Strip a single `from`-import statement, including the optional TypeScript + // `type` modifier (`import type { Config } from "x"`, `import type Default from + // "m"`, `import type * as NS from "m"`). The import CLAUSE between `import` + // (and the optional `type`) and `from` is constrained to a real binding + // grammar — `{ named }`, `* as NS`, or a default identifier optionally followed + // by `, { named }`. The inter-token whitespace is `[ \t]+` (NOT `\s+`): `\s` + // matches a newline, so the old `\s+` around `from` let `import Config\n\nfrom + // "y";` match as ONE statement and deleted the whole span across the blank + // line. Using `[ \t]+` keeps the statement on a single logical line. + // + // BRACE CONTENT IS LINE-BOUNDED: each named-import brace is `\{[^}\n]*\}`, NOT + // `\{[^}]*\}`. `[^}]` ALSO matches a newline, so a dangling `import {` whose + // closing `}` is lines away greedily consumed the intervening ATX headings and + // prose and DELETED them silently (no warning) — `import {\n## Heading\nprose\n} + // from "x";` ate the heading + prose. Excluding `\n` from the brace content + // means an unclosed `{` cannot swallow subsequent lines: the `{named}` + // alternative only matches when the closing `}` is on the SAME line as the + // opening `{`. CONSEQUENCE (intentional, minimal): a WELL-FORMED multi-line + // brace import (`import Foo, {\n X,\n} from "y";`) is no longer stripped — its + // identifiers are on separate lines, so `[^}\n]*` does not span them. That is + // an accepted trade: the hard requirement is that an `import {` must NEVER + // delete markdown headings/prose, and the existing import tests only require + // SINGLE-LINE brace imports to strip (which still do). A left-in multi-line + // import is cosmetically present in served text but causes no CONTENT LOSS. 
+ // + // This (a) cannot span a blank line or jump to a LATER import's `from` (each + // alternative is bounded: braces stop at the first `}` OR the line end, a + // default is a single identifier), and (b) does not match prose like + // `import a value from "x"` (two bare words is not a valid import clause), so + // ordinary sentences are not deleted. + result = result.replace( + /^import[ \t]+(?:type[ \t]+)?(?:\{[^}\n]*\}|\*[ \t]+as[ \t]+[A-Za-z_$][\w$]*|[A-Za-z_$][\w$]*(?:[ \t]*,[ \t]*\{[^}\n]*\})?)[ \t]+from[ \t]+['"][^'"]+['"];?[ \t]*$/gm, + "", + ); + + // Strip JSX component tags (self-closing `` removed entirely; + // paired `inner` reduced to its inner content, innermost-out) + // via a SINGLE LINEAR scan — see stripJsxTags. This replaces the prior pair of + // regexes whose attribute run `(?:"[^"]*"|'[^']*'|[^>])*` was ambiguous (a + // quoted attr matched BOTH alternatives), giving exponential backtracking that + // hung on a prop-heavy paired tag or a no-close opener with ~18+ attributes — + // a blow-up INSIDE one `.replace()` that no outer pass cap could bound. The + // scanner is linear in the input length on any attribute count, and (unlike + // the old `[^>]*`) tracks `{…}` JSX-expression depth so a `>` inside an + // unquoted expression (` c} />`) no longer truncates the tag. + result = stripJsxTags(result); + + // Clean up excessive blank lines left by stripping (within this segment only, + // so a fenced block's internal blank lines in a NEIGHBOURING code segment are + // never collapsed). result = result.replace(/\n{3,}/g, "\n\n"); - return result.trim(); + // Restore inline code spans first, then heading lines (headings were masked + // first, on the original text, so they must be restored last to re-emit the + // verbatim heading — including any inline code it contained). + return restoreHeadings(restore(result)); +} + +// Private-Use-Area sentinels delimiting a masked ATX HEADING line. 
Distinct from +// the inline-code sentinels (U+E000/U+E001) so the two maskers never collide. +const HEADING_OPEN = String.fromCharCode(0xe002); +const HEADING_CLOSE = String.fromCharCode(0xe003); + +/** + * Mask whole ATX heading lines with sentinel-delimited placeholders so the MDX + * strip passes leave them untouched, preserving a component tag that NAMES a + * component inside a heading (`## The Provider`, `## `). + * Returns the masked text and a restore() that re-inserts the original heading + * lines verbatim. + * + * A line is masked when it is a CommonMark ATX heading per the SHARED predicate + * (matchHeadingLine → HEADING_LINE_RE): 0–3 leading spaces, 1–6 `#`, a space/tab + * separator, then non-space text. Only the line's CONTENT is replaced; the + * surrounding newlines are preserved, so block structure (and the later + * `\n{3,}` collapse) is unaffected. Non-heading (prose) lines pass through so + * JSX/imports in prose are still stripped. + */ +function maskHeadingLines(text: string): { + masked: string; + restore: (s: string) => string; +} { + const headings: string[] = []; + const masked = text + .split("\n") + .map((line) => { + if (!matchHeadingLine(line)) return line; + const token = `${HEADING_OPEN}${headings.length}${HEADING_CLOSE}`; + headings.push(line); + return token; + }) + .join("\n"); + + const restoreRe = new RegExp(`${HEADING_OPEN}(\\d+)${HEADING_CLOSE}`, "g"); + // Pass the original match through unchanged when the captured index is out of + // range (defensive: a sentinel-shaped sequence we did not emit must never map + // to `undefined`). chunkMarkdown strips literal PUA sentinels from input up + // front, so this only guards against an internal invariant break. + const restore = (s: string): string => + s.replace(restoreRe, (_m, idx) => headings[Number(idx)] ?? _m); + return { masked, restore }; +} + +// Private-Use-Area sentinels delimiting an inline-code-span placeholder. 
PUA +// code points (U+E000, U+E001) cannot appear in real markdown source, so a +// placeholder like `3` can never collide with document text. Built +// via fromCharCode so the source file stays plain ASCII. +const CODESPAN_OPEN = String.fromCharCode(0xe000); +const CODESPAN_CLOSE = String.fromCharCode(0xe001); + +/** + * Mask inline code spans with sentinel-delimited placeholders so the MDX strip + * passes leave their content untouched. Returns the masked text and a restore() + * that re-inserts the original spans verbatim. + * + * CommonMark inline-code rule (§6.1): a code span opens with a backtick RUN of + * length N and closes with a backtick run of EXACTLY length N — a longer run is + * NOT a valid closer, and the opening/closing runs must not be flanked by more + * backticks (the run-length is delimited by a non-backtick on the outer side). + * A single `/(`+)([\s\S]*?)\1/` regex does NOT enforce the exact-length rule + * (`\1` only requires the same TEXT, and greedy `(`+)` + backtracking mispairs + * delimiters around a stray/odd backtick), leaving a real span unmasked so the + * JSX/import strip then guts it. This scanner pairs runs by exact length so a + * stray backtick between two genuine spans can no longer mis-delimit them. + * + * Inline parsing also never crosses a BLANK LINE or an ATX HEADING line: + * CommonMark runs block parsing first, so a blank line splits text into separate + * paragraphs and a heading is its own block — a code span cannot span either + * boundary. Without this, a stray backtick on one line would wrongly pair with a + * backtick beyond the boundary, exposing the intervening content (e.g. ``) + * to the JSX strip. maskHeadingLines runs FIRST, so heading lines are already + * opaque HEADING_OPEN…HEADING_CLOSE tokens here; the closer-search therefore + * stops at a paragraph break AND at a heading-sentinel boundary. 
+ */ +function maskInlineCode(text: string): { + masked: string; + restore: (s: string) => string; +} { + const spans: string[] = []; + let out = ""; + let i = 0; + const n = text.length; + + // True when a blank line (paragraph break) begins at the newline `text[p]`: + // the run of whitespace starting at p contains a SECOND newline before any + // non-whitespace char. A code span cannot cross such a boundary. + const blankLineAt = (p: number): boolean => { + let q = p + 1; // skip the first newline + while (q < n && (text[q] === " " || text[q] === "\t")) q++; + return q < n && text[q] === "\n"; + }; + + while (i < n) { + if (text[i] !== "`") { + out += text[i]; + i++; + continue; + } + // Measure the opening backtick run [i, openEnd). + let openEnd = i; + while (openEnd < n && text[openEnd] === "`") openEnd++; + const runLen = openEnd - i; + + // Find the next backtick run of EXACTLY runLen (not part of a longer run), + // without crossing a blank line (paragraph break) OR an ATX heading line. + let close = -1; + let scan = openEnd; + while (scan < n) { + if (text[scan] === "\n" && blankLineAt(scan)) { + // Paragraph break before any valid closer: this opening run cannot close. + break; + } + if (text[scan] === HEADING_OPEN) { + // ATX heading block boundary before any valid closer. maskHeadingLines + // ran first, so a heading line is now an opaque HEADING_OPEN… + // HEADING_CLOSE token; reaching its HEADING_OPEN means a closer would lie + // on the far side of a heading block. CommonMark parses blocks before + // inlines, so an inline code span cannot cross a heading — this opening + // run therefore cannot close (mirrors the blank-line guard above). 
+ break; + } + if (text[scan] !== "`") { + scan++; + continue; + } + let candEnd = scan; + while (candEnd < n && text[candEnd] === "`") candEnd++; + if (candEnd - scan === runLen) { + close = scan; + break; + } + // A run of a different length cannot close this span; skip past it whole + // (a longer/shorter run is not a valid closer and its backticks are + // consumed so they cannot be re-read as a closer of the wrong length). + scan = candEnd; + } + + if (close === -1) { + // No valid closer: this opening run is literal text, not a span opener. + // Emit the run verbatim and continue scanning AFTER it. + out += text.slice(i, openEnd); + i = openEnd; + continue; + } + + // [i, close+runLen) is a complete code span (opening run + content + closer). + const full = text.slice(i, close + runLen); + const token = `${CODESPAN_OPEN}${spans.length}${CODESPAN_CLOSE}`; + spans.push(full); + out += token; + i = close + runLen; + } + + const restoreRe = new RegExp(`${CODESPAN_OPEN}(\\d+)${CODESPAN_CLOSE}`, "g"); + // Pass the original match through unchanged when the captured index is out of + // range (defensive — see maskHeadingLines): a sentinel-shaped sequence we did + // not emit must never map to `undefined`. + const restore = (s: string): string => + s.replace(restoreRe, (_m, idx) => spans[Number(idx)] ?? _m); + return { masked: out, restore }; } /** @@ -81,25 +704,78 @@ interface ContentSegment { isCodeBlock: boolean; } +/** + * Partition `content` into alternating non-code and fenced-code segments. 
Uses + * the SHARED fence predicates (matchFenceOpen / isFenceClose) line-by-line — + * NOT a single backreference regex — so it correctly handles every CommonMark + * fence shape the heading detectors must agree with: + * - opening fences indented 0–3 spaces (column 0 is not required); + * - a closing fence whose run is LONGER than the opener (close len ≥ open len); + * - an UNCLOSED opening fence, which runs to END OF INPUT (the remainder is all + * code, so its `#` lines are never injected as fake headings). + * + * Segment boundaries are byte-exact: a code segment spans from the opening + * fence's first column through the closing fence line's last NON-newline char, + * so the newline AFTER the closing fence stays in the following non-code segment + * (a `^`-anchored heading scan therefore still sees a heading that directly + * follows the closing fence). Concatenating all segment texts reproduces + * `content` verbatim. + */ function segmentCodeBlocks(content: string): ContentSegment[] { const segments: ContentSegment[] = []; - const codeBlockRegex = /^(`{3,})[^\n]*\n(?:[\s\S]*?\n)?\1\s*$/gm; + let lastIndex = 0; // start of the not-yet-emitted region + let pos = 0; // running offset at the start of the current line + let open: FenceMarker | null = null; + let blockStart = 0; // offset of the opening fence (when inside a fence) + + // Walk one line at a time (a "line" excludes its trailing "\n"). `pos <= + // content.length` lets a trailing newline still yield one final empty line. + while (pos <= content.length) { + const nl = content.indexOf("\n", pos); + const lineEnd = nl === -1 ? content.length : nl; // excludes the "\n" + const line = content.slice(pos, lineEnd); + + if (open) { + if (isFenceClose(line, open)) { + // Emit the preceding non-code text (if any), then the code block up to + // the END of this closing-fence line (the trailing "\n" stays outside). 
+ if (blockStart > lastIndex) { + segments.push({ + text: content.slice(lastIndex, blockStart), + isCodeBlock: false, + }); + } + segments.push({ + text: content.slice(blockStart, lineEnd), + isCodeBlock: true, + }); + lastIndex = lineEnd; + open = null; + } + } else { + const m = matchFenceOpen(line); + if (m) { + open = m; + blockStart = pos; + } + } - let lastIndex = 0; - let match: RegExpExecArray | null; + if (nl === -1) break; + pos = nl + 1; + } - while ((match = codeBlockRegex.exec(content)) !== null) { - if (match.index > lastIndex) { + // An UNCLOSED opening fence runs to END OF INPUT: emit any preceding text and + // the remainder as one code segment. + if (open) { + if (blockStart > lastIndex) { segments.push({ - text: content.slice(lastIndex, match.index), + text: content.slice(lastIndex, blockStart), isCodeBlock: false, }); } - segments.push({ text: match[0], isCodeBlock: true }); - lastIndex = match.index + match[0].length; - } - - if (lastIndex < content.length) { + segments.push({ text: content.slice(blockStart), isCodeBlock: true }); + lastIndex = content.length; + } else if (lastIndex < content.length) { segments.push({ text: content.slice(lastIndex), isCodeBlock: false }); } @@ -121,10 +797,10 @@ function splitPreservingCodeBlocks( if (segment.isCodeBlock) { current += segment.text; } else { - const subParts = - typeof delimiter === "string" - ? segment.text.split(delimiter) - : segment.text.split(delimiter); + // String.prototype.split accepts both a string and a RegExp separator, so + // a single call handles both delimiter forms (the prior string/RegExp + // ternary had two identical branches). 
+ const subParts = segment.text.split(delimiter); if (subParts.length === 1) { current += subParts[0]; @@ -133,7 +809,10 @@ function splitPreservingCodeBlocks( current += subParts[0]; for (let i = 1; i < subParts.length; i++) { parts.push(current); - // Re-attach the delimiter for heading-based splits + // Re-attach a string heading delimiter (defensive: the sole caller + // passes the paragraph RegExp `/\n\n+/`, so this branch is not + // exercised today; it preserves correctness if a `#`-prefixed string + // delimiter is ever passed). if (typeof delimiter === "string" && delimiter.startsWith("#")) { current = delimiter + subParts[i]; } else { @@ -157,28 +836,75 @@ interface HeadingInfo { /** * Track heading hierarchy up to a given position in the original content. + * + * Headings are detected only outside fenced code blocks (segmentCodeBlocks masks + * fenced regions): a `#`-prefixed line inside a ``` fence is documentation/example + * text, not a real heading, and must never enter a chunk's headingPath (the path + * is embedded into the retrieval vector). The heading match uses the SHARED + * single-line predicate (matchHeadingLine), so indent (0–3 spaces), separator + * (space OR tab), and closing-`#`/`{#anchor}` stripping are identical to + * extractFirstHeading and splitOnHeading — they cannot disagree. + * + * The scan also includes the heading that the chunk at `position` opens with: + * when a chunk begins with its own heading, that heading belongs in its + * headingPath even though it sits at `position` rather than strictly before it. */ function getHeadingPathAtPosition( fullContent: string, position: number, ): string[] { - const contentBefore = fullContent.slice(0, position); - const headingRegex = /^(#{1,6})\s+(.+)$/gm; + // Include the chunk's own leading heading line: if the content at `position` + // OPENS with a heading, extend the scanned region to the end of that line so + // the chunk's opening heading is captured (not just its ancestors). 
The + // shared leading-line predicate only matches a heading on the slice's FIRST + // line — a heading later in the chunk is found by the normal scan once a + // subsequent chunk starts there. + let scanEnd = position; + if (matchLeadingHeading(fullContent.slice(position))) { + const eol = fullContent.indexOf("\n", position); + scanEnd = eol === -1 ? fullContent.length : eol; + } + const headings: HeadingInfo[] = []; - let match: RegExpExecArray | null; - while ((match = headingRegex.exec(contentBefore)) !== null) { - const level = match[1].length; - const text = match[2].trim(); + // Segment the FULL content ONCE (fences intact) and collect only heading + // matches whose ABSOLUTE offset in fullContent is < scanEnd. Slicing to + // scanEnd BEFORE segmenting could sever a fenced block: the truncated slice + // would end in an unclosed fence that segmentCodeBlocks then runs to EOF, + // misreading the in-fence region and injecting a `#`-line inside it as a fake + // heading. Segmenting the full content keeps every fence intact; the + // absolute-offset filter then bounds the scan to the chunk's position. + // + // The scan walks each non-code segment line-by-line through the SHARED + // matchHeadingLine predicate (tracking each line's absolute offset), so it + // agrees byte-for-byte with the title extractor and the split boundary. + let segmentStart = 0; + for (const segment of segmentCodeBlocks(fullContent)) { + const thisSegmentStart = segmentStart; + segmentStart += segment.text.length; - // Remove headings at same or deeper level (new section at this level) - while ( - headings.length > 0 && - headings[headings.length - 1].level >= level - ) { - headings.pop(); + if (segment.isCodeBlock) continue; + // Skip whole segments that start at or beyond scanEnd entirely. 
+ if (thisSegmentStart >= scanEnd) continue; + + let lineOffset = 0; // offset of the current line WITHIN this segment + for (const line of segment.text.split("\n")) { + const absoluteIndex = thisSegmentStart + lineOffset; + lineOffset += line.length + 1; // +1 for the "\n" split removed + if (absoluteIndex >= scanEnd) break; + + const heading = matchHeadingLine(line); + if (!heading) continue; + + // Remove headings at same or deeper level (new section at this level) + while ( + headings.length > 0 && + headings[headings.length - 1].level >= heading.level + ) { + headings.pop(); + } + headings.push({ level: heading.level, text: heading.text }); } - headings.push({ level, text }); } return headings.map((h) => h.text); @@ -187,10 +913,22 @@ function getHeadingPathAtPosition( /** * Split text on heading boundaries at a specific level. * Re-attaches the heading marker to each section. + * + * The boundary is the SHARED heading predicate fixed at `level` hashes, so it + * agrees with extractFirstHeading and getHeadingPathAtPosition on indent (0–3 + * spaces) and separator (space OR tab). Two historical disagreements are closed + * here: (a) the boundary was anchored at column 0 only, so a 1–3-space-indented + * heading fed the headingPath but was NOT a section boundary; (b) it required a + * literal SPACE after the hashes, so a TAB-separated heading (`##\tHeading`) was + * not a boundary. Because the separator `[ \t]+` must follow EXACTLY `level` + * hashes, a deeper heading (e.g. `###` for a level-2 split) does NOT match — its + * third `#` is not a space/tab — so it is split at its own level instead. 
*/ function splitOnHeading(content: string, level: number): string[] { - const prefix = "#".repeat(level) + " "; - const regex = new RegExp(`(?=^${prefix.replace(/ $/, " ")})`, "gm"); + const regex = new RegExp( + `(?=^${HEADING_INDENT}#{${level}}${HEADING_SEP})`, + "gm", + ); const segments = segmentCodeBlocks(content); const parts: string[] = []; @@ -233,7 +971,6 @@ function recursiveSplit( let parts: string[]; if (depth === 0) { - // Split on ## headings parts = splitOnHeading(content, 2); if (parts.length > 1) { return parts.flatMap((p) => recursiveSplit(p, targetChars, 1)); @@ -241,7 +978,6 @@ function recursiveSplit( } if (depth <= 1) { - // Split on ### headings parts = splitOnHeading(content, 3); if (parts.length > 1) { return parts.flatMap((p) => recursiveSplit(p, targetChars, 2)); @@ -249,7 +985,6 @@ function recursiveSplit( } if (depth <= 2) { - // Split on paragraph boundaries parts = splitPreservingCodeBlocks(content, /\n\n+/); if (parts.length > 1) { return mergeSmallParts(parts, targetChars).flatMap((p) => @@ -258,25 +993,102 @@ function recursiveSplit( } } - // Split on line boundaries - const lines = content.split("\n"); - if (lines.length > 1) { - return mergeSmallParts(lines, targetChars); + // Fence-aware line-split fallback. A raw `content.split("\n")` here would + // shred an oversized fenced code block across chunk boundaries (severing its + // open/close fences) and collapse its internal blank lines — both break the + // verbatim-substring fidelity that lets chunkMarkdown's `indexOf(rawText)` + // bind the heading path, degrading it to []. So the split keeps each fenced + // code block ATOMIC: a code block is emitted as its own unit (whole, even when + // it alone exceeds targetChars), and only non-code text is line-split. + // + // Each fenced segment maps to ONE unit; each non-code segment maps to its + // individual lines (split on `\n`). 
Because segmentCodeBlocks leaves the + // newline that joins a code block to its neighbours attached to the + // surrounding non-code segment, `.join("\n")` of these units reproduces the + // source verbatim; mergeSmallParts (called with "\n") re-inserts exactly one + // newline between consecutive units — skipping the separator whenever a unit + // already ends in a newline — so the reassembled chunk text stays a verbatim + // substring of the source. Newline FIDELITY is therefore handled downstream by + // mergeSmallParts, not by any per-unit "never-break" bookkeeping here. + // + // INLINE-SPAN ATOMICITY: fenced blocks are kept whole above, but an INLINE + // code span (`` `…` ``) can also straddle a soft line break, and a per-line + // split would land its two halves in adjacent units. When mergeSmallParts' + // boundary then falls between them, one chunk carries an unbalanced backtick — + // it opens an inline span it never closes in served text. So consecutive + // non-code lines whose JOIN sits inside an OPEN inline-code run (the backticks + // seen since the current unit began are odd) are kept in the SAME unit. A + // grouped unit is exactly its lines `\n`-joined — identical to what merging + // those lines with "\n" would yield — so the verbatim-substring invariant is + // unchanged. A blank line (paragraph break) always ends a unit: CommonMark + // inline parsing never crosses it, so an odd count there is an unclosed source + // span, not a span we may keep swallowing lines to balance. 
+ const segments = segmentCodeBlocks(content); + const units: string[] = []; + for (const segment of segments) { + if (segment.isCodeBlock) { + units.push(segment.text); + continue; + } + const lines = segment.text.split("\n"); + let group: string[] = []; + const flush = () => { + if (group.length > 0) { + units.push(group.join("\n")); + group = []; + } + }; + for (const line of lines) { + // A blank line is a paragraph boundary: CommonMark inline parsing never + // crosses it, so it ALWAYS ends the current unit (an inline span still open + // here is an unclosed source span, not one we may keep swallowing lines for). + if (line.trim() === "") { + flush(); + units.push(line); + continue; + } + group.push(line); + // Close the unit only at a balance point (no inline span left open). The + // test is FENCE-AWARE + EXACT-RUN-LENGTH (inlineCodeOpenAtEnd), not a raw + // backtick parity: a parity test flushed mid-span on a double-backtick + // `` `` … `` `` span (2 backticks on the opening line ⇒ even ⇒ flush), + // landing the opener and closer in adjacent units; and it miscounted a + // ```` ``` ```` fence delimiter (3 backticks, odd) as an open inline span. + // If a span is still open, keep the next line in this same unit so it stays + // whole. + if (!inlineCodeOpenAtEnd(group.join("\n"))) flush(); + } + flush(); // trailing group with an unclosed source span emitted as-is + } + + if (units.length > 1) { + return mergeSmallParts(units, targetChars, "\n"); } - // Content is a single very long line; return as-is + // Content is a single very long line (or one indivisible code block); return + // as-is. return [content]; } /** * Merge adjacent small parts until they approach the target size. + * + * @param joinSeparator - String inserted between two merged parts. Defaults to + * a blank line ("\n\n") for paragraph-level callers, which rejoin paragraphs. 
+ * The line-split fallback passes "\n" so single-newline structure (and thus + * verbatim-substring fidelity with the source) is preserved. A part that + * already ends in a newline is joined with no extra separator regardless. */ -function mergeSmallParts(parts: string[], targetSize: number): string[] { +function mergeSmallParts( + parts: string[], + targetSize: number, + joinSeparator: string = "\n\n", +): string[] { const merged: string[] = []; let current = ""; for (const part of parts) { - const separator = current && !current.endsWith("\n") ? "\n\n" : ""; + const separator = current && !current.endsWith("\n") ? joinSeparator : ""; if ( current && current.length + separator.length + part.length > targetSize @@ -294,8 +1106,186 @@ function mergeSmallParts(parts: string[], targetSize: number): string[] { return merged; } +/** + * Is an inline code span still OPEN at the END of `text`? Used by the + * inline-code-balance guards (overlap rebalance, single-line tail rebalance, and + * the line-split fallback grouping) to decide whether a unit/overlap window would + * leave an unbalanced inline-code delimiter in served/embedded chunk text. + * + * This replaces the old `backtickCount(...) % 2` parity test, which was wrong on + * two counts and produced two load-bearing bugs: + * + * - FENCE-UNAWARE: a parity count includes a code-FENCE delimiter (```` ``` ```` + * = 3 backticks, ODD). When an overlap window's retained lines contain a + * ```` ``` ```` line that is INTERIOR to a balanced fence (e.g. a `~~~`-wrapped + * ```` ``` ```` block — the real-Markdown way to DISPLAY a fence delimiter), + * the parity guard misclassified it as an open inline span and dropped up to + * and including it, severing the surrounding fence's OPENING delimiter and + * leaving its CLOSING delimiter as a phantom opener → a HALF-OPEN fence. 
+ * - PARITY-ONLY (not exact-run-length): a double-backtick `` `` … `` `` span + * (CommonMark §6.1, used when inline code itself contains a backtick) puts 2 + * backticks on its opening line → parity EVEN immediately → the span was + * flushed/severed mid-way, landing its opener and closer in adjacent chunks. + * + * So this helper is BOTH fence-aware (inline backticks are only counted OUTSIDE + * fenced regions, via the SHARED matchFenceOpen / isFenceClose predicates) AND + * exact-run-length (a run of N backticks opens a span closed only by a later run + * of EXACTLY N — mirroring maskInlineCode). It returns true iff some inline run + * is left open when the text ends. The test oracle re-derives this same + * CommonMark rule INDEPENDENTLY (it must not import production, so a production + * bug cannot hide), but all three PRODUCTION guard sites funnel through this one + * helper so they cannot drift from each other. + */ +function inlineCodeOpenAtEnd(text: string): boolean { + // Mask out fenced regions first: their backticks are fence delimiters or + // verbatim in-fence code, NOT inline-code spans. Re-derive fence state + // line-by-line via the shared predicates so this agrees with segmentCodeBlocks. + let fence: FenceMarker | null = null; + const scanLines: string[] = []; + for (const line of text.split("\n")) { + if (fence) { + if (isFenceClose(line, fence)) fence = null; + scanLines.push(""); // in-fence content masked out + continue; + } + const m = matchFenceOpen(line); + if (m) { + fence = m; + scanLines.push(""); // fence-delimiter line masked out + continue; + } + scanLines.push(line); + } + // A line-split unit can END inside an unclosed fence (the closing delimiter is + // in a later unit); that is a FENCE imbalance, handled by the fence guards, not + // an inline-code one. Treat the in-fence remainder as masked (already "" above). + const s = scanLines.join("\n"); + + // Pair inline backtick runs by EXACT length over the masked text. 
A run of N + // opens a span that only a later run of EXACTLY N closes; a run of a different + // length inside an open span is literal content. If a run is left open when the + // text ends, an inline span is unclosed. + let i = 0; + const n = s.length; + while (i < n) { + if (s[i] !== "`") { + i++; + continue; + } + let runEnd = i; + while (runEnd < n && s[runEnd] === "`") runEnd++; + const runLen = runEnd - i; + let scan = runEnd; + let close = -1; + while (scan < n) { + if (s[scan] !== "`") { + scan++; + continue; + } + let candEnd = scan; + while (candEnd < n && s[candEnd] === "`") candEnd++; + if (candEnd - scan === runLen) { + close = scan; + break; + } + scan = candEnd; + } + if (close === -1) return true; // opener with no exact-length closer: span open + i = close + runLen; + } + return false; +} + +/** + * Take a word-boundary-snapped tail of a SINGLE-LINE chunk for overlap, honoring + * `maxChars` without exceeding it and without beginning mid-word. + * + * The previous chunk for the dominant markdown shape (a prose paragraph) is ONE + * physical line, so there is no newline boundary to cut on. The old code dropped + * the overlap entirely in that case, making overlap a no-op for the most common + * content. Pass the FULL `line` (the whole previous chunk): we take its last + * `maxChars` as the candidate window, and ONLY when that slice cut THROUGH a word + * — i.e. the character immediately BEFORE the window is non-whitespace, so the + * window's first character is the tail of a partial word — do we snap the START + * FORWARD past the partial leading word to the next word boundary (never "phabeta" + * from a cut through "alphabeta"). 
When the slice happens to land exactly on a + * word boundary (the preceding char is whitespace, or the whole line is ≤ + * maxChars), the window already begins on a whole word and is kept intact, so a + * complete leading word is never dropped by the word-boundary snap + * (`wordBoundaryTail("aa bb cc", 5)` ⇒ "bb cc", not "cc"). If the snapped tail + * has an unbalanced backtick count (it begins/ends inside an inline code span), + * rebalance by dropping everything up to and including the first backtick run so + * the prepended overlap can never open an unclosed inline span — guard (b). This + * is a best-effort rebalance: it may discard a COMPLETE leading word that + * precedes the first backtick (not merely a partial code fragment). Returns "" + * when no safe word-boundary tail remains. + */ +function wordBoundaryTail(line: string, maxChars: number): string { + if (maxChars <= 0) return ""; + const window = line.slice(-maxChars); + // A genuine mid-word cut means the char IMMEDIATELY BEFORE the retained window + // (`line[line.length - maxChars - 1]`) is non-whitespace: the slice severed a + // word, so `window` starts on a partial word. Only then advance past the + // partial leading word (to the next whitespace, then past the whitespace) so + // the tail starts on a whole word. When the preceding char is whitespace (or + // the line is no longer than the window, so there is no preceding char), the + // window already begins on a word boundary and is kept verbatim. + let start = 0; + const cutMidWord = /\S/.test(line[line.length - maxChars - 1] ?? " "); + if (cutMidWord) { + while (start < window.length && /\S/.test(window[start])) start++; + while (start < window.length && /\s/.test(window[start])) start++; + } + let tail = window.slice(start).trim(); + if (!tail) return ""; + + // Guard (b): never prepend a tail that leaves an inline-code span open. 
The + // balance test is EXACT-RUN-LENGTH (inlineCodeOpenAtEnd), not a backtick parity: + // a parity test treated a double-backtick `` `` … `` `` fragment as balanced + // (2 backticks ⇒ even) even when only its opener or only its closer is present. + // If the tail leaves a span open, drop everything up to and including the first + // backtick run to rebalance. This best-effort rebalance may discard a COMPLETE + // leading word before that first run, not just a partial code fragment. If a + // span is still open after the drop, bail. (wordBoundaryTail operates on a + // SINGLE physical line, so there is no fenced region here — the fence-awareness + // of inlineCodeOpenAtEnd is inert on this path and only the exact-run-length + // pairing matters.) + if (inlineCodeOpenAtEnd(tail)) { + const firstTick = tail.indexOf("`"); + if (firstTick >= 0) { + let after = firstTick; + while (after < tail.length && tail[after] === "`") after++; + tail = tail.slice(after).trim(); + } + if (inlineCodeOpenAtEnd(tail)) return ""; + } + return tail; +} + /** * Apply overlap between consecutive chunks. + * + * The overlap is always joined to the next chunk with a BLANK-LINE separator + * (`\n\n`) so the next chunk's content stays at line-start — this is what keeps a + * leading heading on its own line and prevents "Word## Heading" fusion even when + * the overlap is a partial line. Two shapes are handled: + * + * - the window contains a newline: snap the START to the FIRST newline in the + * window and keep all WHOLE lines after it, so the overlap is ≈ overlapChars + * (not just the final line). If those lines contain a half-open fence, fall + * back to the last line only (the prior behavior) to preserve fence integrity. + * - the window has NO newline (single-line prose, the dominant shape): take a + * word-boundary-snapped tail (see wordBoundaryTail) so overlap is actually + * applied instead of dropped. 
+ * + * GUARDS: (a) a fence-delimiter overlap line is dropped (matchFenceOpen) so + * overlap never opens a fence; (b) an unbalanced inline-code tail is rebalanced + * or dropped — in BOTH the single-line path (wordBoundaryTail) and the + * multi-line path (a dedicated inline-backtick rebalance, since a retained + * multi-line window can begin inside a span; the fence-balance fallback does NOT + * cover inline backticks); the heading-path binding uses the PRE-overlap + * rawChunks[i] in chunkMarkdown, so it is unaffected by what is prepended here + * (guard c). */ function applyOverlap(chunks: string[], overlapChars: number): string[] { if (chunks.length <= 1 || overlapChars <= 0) return chunks; @@ -305,33 +1295,159 @@ function applyOverlap(chunks: string[], overlapChars: number): string[] { const prevChunk = chunks[i - 1]; const overlapText = prevChunk.slice(-overlapChars); - // Find a clean break point (newline or space) in the overlap - const breakPoint = overlapText.lastIndexOf("\n"); - const cleanOverlap = - breakPoint > 0 ? overlapText.slice(breakPoint) : overlapText; + let cleanOverlap: string; + const firstNl = overlapText.indexOf("\n"); + if (firstNl >= 0) { + // Retain all WHOLE lines from the first newline boundary in the window so + // the overlap approximates overlapChars rather than only the final line. + cleanOverlap = overlapText.slice(firstNl + 1).trimEnd(); + // Guard (d): if retaining multiple lines introduces a half-open fence + // (the window started inside a fenced block, or ends on a lone opener), + // fall back to the LAST line only — the conservative prior behavior. 
+ if (!fenceBalancedLines(cleanOverlap)) { + const lastNl = overlapText.lastIndexOf("\n"); + cleanOverlap = overlapText.slice(lastNl + 1).trimEnd(); + } + // Guard (b), multi-line: the retained window can BEGIN inside an INLINE + // code span (the slice from the first newline started mid-span), so the + // retained lines may leave an inline span open — prepending them would open + // an inline span the next chunk never closes in served text. The balance + // test is FENCE-AWARE + EXACT-RUN-LENGTH (inlineCodeOpenAtEnd), NOT a raw + // backtick parity. This is load-bearing here: a parity count includes a + // ```` ``` ```` fence delimiter that is INTERIOR to a balanced fence (e.g. a + // `~~~`-wrapped ```` ``` ```` block) — odd parity — so the old guard + // misclassified it as an open inline span and dropped up to and including + // it, SEVERING the surrounding fence's opening delimiter and leaving its + // closing delimiter as a phantom opener (a HALF-OPEN fence). Because + // inlineCodeOpenAtEnd masks fenced regions, that interior delimiter is no + // longer mistaken for inline, and parity-even multi-backtick spans are + // caught by exact-run-length. When a span IS genuinely open: drop everything + // up to and including the first backtick run to rebalance; if still open, + // fall back to the LAST line only; if THAT still leaves a span open, drop + // the overlap entirely. 
+ if (inlineCodeOpenAtEnd(cleanOverlap)) { + const firstTick = cleanOverlap.indexOf("`"); + if (firstTick >= 0) { + let after = firstTick; + while (after < cleanOverlap.length && cleanOverlap[after] === "`") + after++; + cleanOverlap = cleanOverlap.slice(after).trimEnd(); + } + if (inlineCodeOpenAtEnd(cleanOverlap)) { + const lastNl = overlapText.lastIndexOf("\n"); + cleanOverlap = overlapText.slice(lastNl + 1).trimEnd(); + } + if (inlineCodeOpenAtEnd(cleanOverlap)) { + cleanOverlap = ""; + } + } + // Guard (b) belt-and-suspenders: ANY inline-backtick rebalance drop above + // could, in a pathological interleaving, have removed a fence delimiter and + // thereby UNbalanced the retained fences (the exact A1 failure mode if the + // fence-aware count ever under-counts). Re-check fence balance after the + // drop; if it broke, fall back to the LAST line only, and if even that is + // fence-unbalanced, drop the overlap entirely. A guard-(b) drop can then + // never emit a half-open fence. + if (!fenceBalancedLines(cleanOverlap)) { + const lastNl = overlapText.lastIndexOf("\n"); + cleanOverlap = overlapText.slice(lastNl + 1).trimEnd(); + if (!fenceBalancedLines(cleanOverlap)) { + cleanOverlap = ""; + } + } + } else { + // Single physical line: take a word-boundary tail so overlap is applied. + // Pass the FULL previous chunk (not the pre-sliced window): wordBoundaryTail + // slices its own last-overlapChars window AND inspects the char just before + // it to detect a real mid-word cut. Passing the already-sliced overlapText + // would make that look-back read the window's own first char, so the snap + // could never fire and the overlap would begin mid-word. 
+ cleanOverlap = wordBoundaryTail(prevChunk, overlapChars); + } + + // Guard (a): when the (last) overlap line is itself a code-fence delimiter — + // the previous chunk ENDS with an opening/closing fence (CommonMark: 0–3 + // leading spaces then a run of ≥3 backticks or ≥3 tildes) — prepending it + // would OPEN a fence in the next chunk that never closes, corrupting the + // embedded/served chunk text. Checks the LAST line so a multi-line overlap + // whose final line is a lone fence opener is also caught. For conservative + // simplicity this drops the ENTIRE prepended overlap (not just the offending + // final line), so in the multi-line branch several preceding prose lines are + // discarded along with the fence delimiter. + const lastLine = cleanOverlap.slice(cleanOverlap.lastIndexOf("\n") + 1); + if (matchFenceOpen(lastLine)) { + cleanOverlap = ""; + } - result.push(cleanOverlap + chunks[i]); + result.push(cleanOverlap ? `${cleanOverlap}\n\n${chunks[i]}` : chunks[i]); } return result; } +/** + * Is the fence state balanced (every opened fence closed) at the END of `text`? + * Used by applyOverlap's multi-line retention guard so a retained overlap window + * that began inside a fenced block (or ends on a lone opener) is not prepended + * with a half-open fence. Re-derives state line-by-line via the SHARED fence + * predicates (matchFenceOpen / isFenceClose), so it agrees with segmentCodeBlocks. + */ +function fenceBalancedLines(text: string): boolean { + let open: FenceMarker | null = null; + for (const line of text.split("\n")) { + if (open) { + if (isFenceClose(line, open)) open = null; + } else { + const m = matchFenceOpen(line); + if (m) open = m; + } + } + return open === null; +} + /** * Split markdown/MDX content into embedding-friendly chunks. 
* * @param content - The full markdown/MDX file content * @param filePath - Path to the source file (used for metadata) - * @returns Array of MarkdownChunk objects + * @param config - Source configuration (chunk sizing, etc.) + * @param absoluteFilePath - Absolute filesystem path of the source file, when + * available. Used to resolve and inline MDX `@/snippets/*` imports before + * stripping. Falls back to `filePath` when that is itself absolute. When no + * absolute path is available, snippet inlining is skipped. + * @returns Array of ChunkOutput objects */ export function chunkMarkdown( content: string, filePath: string, config: SourceConfig, + absoluteFilePath?: string, ): ChunkOutput[] { if (!content || !content.trim()) { return []; } + // Normalize line endings to LF ONCE, before any parsing/stripping/detection. + // The single-line heading/fence predicates use `$` and `.` (which do not match + // `\r`, a JS line terminator) and the chunker splits lines on `\n` only, so a + // CRLF (Windows / core.autocrlf) document would otherwise leave a trailing + // `\r` on every line: HEADING_LINE_RE fails on "## H\r", isFenceClose fails on + // "```\r" (fence runs to EOF), and every chunk degrades to title=filename / + // headingPath=[]. chunkMarkdown is the registered chunker for BOTH "markdown" + // and "notion" sources with no upstream normalization, so it must do this. All + // downstream `indexOf` then operates on this normalized content, keeping the + // verbatim-substring invariant consistent. + content = content.replace(/\r\n?/g, "\n"); + + // Strip the 4 Private-Use-Area sentinels (U+E000–U+E003) that mask inline-code + // spans and heading lines during the strip passes. They cannot appear in real + // markdown, but a hostile/exotic source containing the literal code points + // would otherwise collide with our placeholders — a masked span could restore + // to the wrong text, or a literal sentinel-shaped sequence could survive into a + // served chunk. 
Remove them ONCE here, before any masking, so the placeholder + // namespace is exclusively ours downstream. + content = content.replace(/[\u{E000}-\u{E003}]/gu, ""); + const targetChars = (config.chunk?.target_tokens ?? DEFAULT_TARGET_TOKENS) * 4; const overlapChars = @@ -340,8 +1456,31 @@ export function chunkMarkdown( // Parse frontmatter const { title: fmTitle, body } = parseFrontmatter(content); + // Inline MDX snippet imports (@/snippets/*) before stripping, so + // snippet-composed pages index with their real content instead of empty. + // Prefer an explicit absolute path; fall back to filePath when it is already + // absolute. inlineSnippetImports safely no-ops on non-absolute paths. + // + // Re-apply BOTH host normalizations (CRLF→LF and the PUA-sentinel strip) to + // the inlined body. inlineSnippetImports reads each snippet file RAW from disk + // (fs.readFileSync, no line-ending normalization, no sentinel strip) and + // injects those bytes AFTER the line-1310/1319 passes already ran on the host + // content, and stripMdx below does not touch `\r` or the sentinels. Without + // re-normalizing here, a CRLF- or PUA-authored snippet would bypass both host + // passes: a trailing `\r` makes the single-line heading/fence predicates (`$` + // and `.` do not match `\r`) fail on the inlined snippet lines — degrading + // that snippet's headingPath to [] and the title to the filename, and leaking + // `\r` into served/embedded content — while a literal U+E000–U+E003 sentinel + // would survive into the masking passes and break the "placeholder namespace + // is exclusively ours downstream" guarantee. Normalizing the inlined body the + // same way the host content was keeps the whole post-inline body uniform. + const snippetBasePath = absoluteFilePath ?? 
filePath; + const inlinedBody = inlineSnippetImports(body, snippetBasePath) + .replace(/\r\n?/g, "\n") + .replace(/[\u{E000}-\u{E003}]/gu, ""); + // Strip MDX syntax - const cleanBody = stripMdx(body); + const cleanBody = stripMdx(inlinedBody); if (!cleanBody.trim()) { return []; @@ -365,17 +1504,52 @@ export function chunkMarkdown( let searchFrom = 0; for (let i = 0; i < overlappedChunks.length; i++) { - const chunkText = overlappedChunks[i].trim(); + // Trim chunk edges WITHOUT promoting a 4-space-indented doc-start fence to a + // column-0 fence. A plain .trim() strips the leading indentation of a chunk + // whose first content line is 4-space-indented (CommonMark INDENTED CODE) — + // e.g. a doc that OPENS with ` ```lang` — turning it into a COLUMN-0 + // ` ```lang ` whose still-indented closing ` ``` ` no longer closes it, + // leaving a half-open fence in the SERVED chunk text. trimChunkEdges strips a + // 0–3-space (cosmetic) leading indent as before but PRESERVES a 4+-space + // (semantic, indented-code) one, so the 0–3 vs 4+ space rule keeps governing. + const chunkText = trimChunkEdges(overlappedChunks[i]); if (!chunkText) continue; // Find the position of this chunk's primary content in the clean body - // Use the raw (non-overlapped) chunk to find position - const rawText = rawChunks[i]?.trim() || chunkText; + // Use the raw (non-overlapped) chunk to find position. Trim the same way as + // chunkText so the verbatim-substring indexOf binding stays consistent (both + // remain contiguous substrings of cleanBody). The `|| chunkText` fallback + // guards an all-blank rawChunks[i] (trimChunkEdges → "") so rawText is never + // the empty string (which indexOf would "find" at searchFrom and mis-bind). + const rawText = (rawChunks[i] && trimChunkEdges(rawChunks[i])) || chunkText; const pos = cleanBody.indexOf(rawText, searchFrom); const headingPath = pos >= 0 ? 
getHeadingPathAtPosition(cleanBody, pos) : []; if (pos >= 0) { - searchFrom = pos; + // Advance past the matched chunk. Using `pos` alone leaves the cursor at + // the start of this match, so when a later chunk has byte-identical text + // (repeated boilerplate / duplicate sections) the next indexOf re-finds + // THIS position and the later chunk inherits the wrong heading path. + searchFrom = pos + rawText.length; + } else { + // The chunk text was expected to be a verbatim substring of cleanBody + // (that invariant is what lets indexOf bind the heading path). When it is + // not, headingPath silently degrades to [] and the embedded retrieval + // anchor is lost — warn loudly so a future break of the invariant is + // visible rather than quietly degrading search quality. The index reported + // is the RAW chunk loop index `i` (chunks.length skips empty chunks via + // the `continue` above, so it is NOT the raw position of this chunk). + console.warn( + `[chunker] heading-path lookup failed for ${filePath} chunk ${i}: ` + + `chunk text is not a verbatim substring of the cleaned body; headingPath degraded to []`, + ); + // Still advance the cursor past this chunk's length. Leaving searchFrom + // unmoved on a miss lets a LATER chunk with byte-identical text re-bind an + // EARLIER occurrence's heading path (the duplicate-text cascade the hit + // branch above guards). Advancing by rawText.length keeps the cursor + // monotonically ahead so a subsequent duplicate is matched at its own + // (later) position rather than re-finding a stale earlier one. + searchFrom += rawText.length; } chunks.push({ diff --git a/src/indexing/chunking/snippets.ts b/src/indexing/chunking/snippets.ts new file mode 100644 index 0000000..83dac18 --- /dev/null +++ b/src/indexing/chunking/snippets.ts @@ -0,0 +1,258 @@ +// Inline MDX snippet imports before chunking. 
+// +// CopilotKit-style docs compose pages from shared snippets, e.g.: +// +// import MigrateToV2 from "@/snippets/shared/troubleshooting/migrate-to-v2.mdx"; +// +// +// The chunker's stripMdx() removes both the `import` line and the `` +// JSX, so snippet-composed pages index as nearly empty. This module resolves +// those imports against the docs source tree and inlines the snippet body into +// the host page *before* stripping, so the real content gets chunked and +// indexed. Snippets may themselves import snippets, so resolution recurses with +// a bounded depth and a cycle guard. + +import fs from "node:fs"; +import path from "node:path"; + +/** How many levels of snippet-importing-snippet to follow. */ +const DEFAULT_MAX_DEPTH = 3; +/** How far up the tree to look for the `@/` alias root (the `snippets/` parent). */ +const MAX_ALIAS_LOOKUP_DEPTH = 12; + +export interface InlineSnippetOptions { + /** Maximum recursion depth for nested snippet imports. */ + maxDepth?: number; +} + +interface ImportDecl { + /** Local name the snippet is bound to (the JSX component name). */ + name: string; + /** Raw module specifier, e.g. "@/snippets/foo.mdx" or "./foo.mdx". */ + spec: string; + /** The full matched import statement text (for removal). */ + raw: string; +} + +/** Matches `import Name from "spec";` (single-line ESM default import). */ +const IMPORT_RE = + /^import\s+([A-Za-z_$][A-Za-z0-9_$]*)\s+from\s+['"]([^'"]+)['"];?\s*$/gm; + +/** + * Strip leading YAML frontmatter from an MDX snippet body. Snippets normally + * have no frontmatter, but stripping defensively keeps inlined output clean. + */ +function stripFrontmatter(content: string): string { + const match = content.match(/^---\r?\n[\s\S]*?\r?\n---\r?\n?/); + return match ? content.slice(match[0].length) : content; +} + +/** + * Resolve the directory that the `@/` alias maps to for a given host file. 
+ * + * In CopilotKit's docs the alias is configured (tsconfig `paths`) as + * `@/* -> ./*` relative to the docs project root, which is the directory that + * contains `snippets/`. Rather than hard-code the repo layout, walk up from the + * host file until we find an ancestor that contains a `snippets/` directory. + * Returns null if none is found within a bounded number of levels. + */ +function findAliasRoot(hostDir: string): string | null { + let dir = hostDir; + for (let i = 0; i < MAX_ALIAS_LOOKUP_DEPTH; i++) { + try { + const candidate = path.join(dir, "snippets"); + if (fs.existsSync(candidate) && fs.statSync(candidate).isDirectory()) { + return dir; + } + } catch { + // Ignore stat errors and keep walking up. + } + const parent = path.dirname(dir); + if (parent === dir) break; // reached filesystem root + dir = parent; + } + return null; +} + +/** + * Resolve a module specifier to an absolute file path on disk, or null if it + * does not point at a snippet we can inline. + */ +function resolveSpec( + spec: string, + hostDir: string, + aliasRoot: string | null, +): string | null { + // Only inline MDX/markdown snippet imports; ignore component/code imports. + if (!/\.mdx?$/.test(spec)) return null; + + let abs: string; + if (spec.startsWith("@/")) { + if (!aliasRoot) return null; + abs = path.join(aliasRoot, spec.slice(2)); + } else if (spec.startsWith("./") || spec.startsWith("../")) { + abs = path.resolve(hostDir, spec); + } else { + // Bare package import (e.g. a real npm module) — not a local snippet. + return null; + } + + try { + if (fs.existsSync(abs) && fs.statSync(abs).isFile()) return abs; + } catch { + return null; + } + return null; +} + +/** + * Parse single-line default-import declarations from MDX content. + * + * De-duplicates by the raw matched line: an identical import line appearing + * more than once yields a single decl, so the snippet is resolved + inlined + * once. 
(Removal of the line itself is global — see removeAll — so every copy + * is stripped regardless of how many decls were parsed.) + */ +function parseImports(content: string): ImportDecl[] { + const decls: ImportDecl[] = []; + const seenRaw = new Set(); + let match: RegExpExecArray | null; + IMPORT_RE.lastIndex = 0; + while ((match = IMPORT_RE.exec(content)) !== null) { + const raw = match[0]; + if (seenRaw.has(raw)) continue; + seenRaw.add(raw); + decls.push({ name: match[1], spec: match[2], raw }); + } + return decls; +} + +/** Escape a string for safe use inside a RegExp. */ +function escapeRegExp(s: string): string { + return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); +} + +/** + * Remove EVERY occurrence of a literal import line from `content`. + * + * String#replace(string, …) only removes the first match, so a duplicated + * identical import line would leave a dangling import behind. Build a global + * regex from the escaped literal so all copies are stripped. + */ +function removeAll(content: string, literal: string): string { + return content.replace(new RegExp(escapeRegExp(literal), "g"), ""); +} + +/** + * Replace JSX usages of `name` (self-closing `` and paired + * `...`) with `replacement`. + */ +function replaceUsages( + content: string, + name: string, + replacement: string, +): string { + const n = escapeRegExp(name); + let result = content; + + // Self-closing: — replaced FIRST so a self-closing tag can never + // be consumed as the opening tag of the paired match below (which would + // otherwise swallow everything up to the next , deleting any content + // in between two uses of the same snippet). + const selfRe = new RegExp(`<${n}(?:\\s+[^>]*)?\\s*/>`, "g"); + result = result.replace(selfRe, () => replacement); + + // Paired: ... (snippet bodies normally render via the + // self-closing form, but handle the wrapping form too). Inner content is + // discarded in favor of the inlined snippet body. 
The opening tag's attribute + // run must not end in `/` (`[^>]*[^/>]`), so the pattern cannot match a + // self-closing `` form even if the self-closing pass above left + // one behind. + const pairedRe = new RegExp( + `<${n}(?:\\s+[^>]*[^/>])?>[\\s\\S]*?<\\/${n}>`, + "g", + ); + result = result.replace(pairedRe, () => replacement); + + return result; +} + +/** + * Inline MDX snippet imports into `content`. + * + * @param content - The MDX/markdown source of the host file. + * @param hostAbsPath - Absolute filesystem path of the host file. Used to + * resolve the `@/` alias and relative snippet specifiers. If not absolute (or + * the alias root can't be located), snippet inlining is skipped and the + * original content is returned unchanged. + * @param opts - Optional behavior overrides. + * @returns The content with resolvable snippet imports inlined. + */ +export function inlineSnippetImports( + content: string, + hostAbsPath: string | undefined, + opts: InlineSnippetOptions = {}, +): string { + if (!content || !hostAbsPath || !path.isAbsolute(hostAbsPath)) { + return content; + } + const maxDepth = opts.maxDepth ?? DEFAULT_MAX_DEPTH; + return inlineRecursive(content, hostAbsPath, maxDepth, new Set()); +} + +function inlineRecursive( + content: string, + hostAbsPath: string, + depthRemaining: number, + visited: Set, +): string { + if (depthRemaining <= 0) return content; + + const decls = parseImports(content); + if (decls.length === 0) return content; + + const hostDir = path.dirname(hostAbsPath); + const aliasRoot = findAliasRoot(hostDir); + + let result = content; + + for (const decl of decls) { + const snippetAbs = resolveSpec(decl.spec, hostDir, aliasRoot); + if (!snippetAbs) continue; // not a local snippet (or missing) — leave as-is + + // Cycle guard: never inline a file already on the current resolution path. 
+ if (visited.has(snippetAbs)) { + // Drop the import + usage so cyclic refs don't leave dangling JSX, but + // do not recurse again into the cycle. + result = replaceUsages(result, decl.name, ""); + result = removeAll(result, decl.raw); + continue; + } + + let snippetBody: string; + try { + snippetBody = fs.readFileSync(snippetAbs, "utf-8"); + } catch { + continue; // unreadable — leave the original import/usage untouched + } + + snippetBody = stripFrontmatter(snippetBody); + + // Recurse into the snippet so nested snippets get inlined too. + const nextVisited = new Set(visited); + nextVisited.add(snippetAbs); + const resolvedBody = inlineRecursive( + snippetBody, + snippetAbs, + depthRemaining - 1, + nextVisited, + ); + + // Inline the (recursively resolved) body wherever the component is used, + // then remove the now-unused import line (all copies, not just the first). + result = replaceUsages(result, decl.name, `\n\n${resolvedBody.trim()}\n\n`); + result = removeAll(result, decl.raw); + } + + return result; +} diff --git a/src/indexing/orchestrator.ts b/src/indexing/orchestrator.ts index 06116e9..ca737e8 100644 --- a/src/indexing/orchestrator.ts +++ b/src/indexing/orchestrator.ts @@ -768,11 +768,46 @@ export class IndexingOrchestrator { result = await provider.fullAcquire(); } + // Collect per-item failures from both the remove and index passes. A + // failed item/delete must NOT advance the state token past it: the + // pipeline swallows the individual error to keep the batch resilient, + // but if we then persist the new token the failed item falls behind it + // and is never re-processed (permanent silent loss). When anything + // failed we leave the prior token in place and mark the run errored so + // the next incremental run reprocesses the failed items. 
+ const failedIds: string[] = []; if (result.removedIds.length > 0) { - await pipeline.removeItems(result.removedIds); + const { failedIds: removeFailed } = await pipeline.removeItems( + result.removedIds, + ); + failedIds.push(...removeFailed); } if (result.items.length > 0) { - await pipeline.indexItems(result.items, result.stateToken); + const { failedIds: indexFailed } = await pipeline.indexItems( + result.items, + result.stateToken, + ); + failedIds.push(...indexFailed); + } + + if (failedIds.length > 0) { + // Do NOT advance last_commit_sha — setIndexStatus preserves the + // prior token, so the next incremental run re-diffs from where we + // were and reprocesses the items that failed this run. Return false + // so the caller treats this source as NOT successfully reindexed: + // it is excluded from affectedSourceNames, so onReindexComplete and + // the Atlas cache invalidation only fire for sources that fully + // succeeded. + console.error( + `[orchestrator] Indexing for ${sourceConfig.name} had ${failedIds.length} failed item(s); holding state token for retry: ${failedIds.slice(0, 10).join(", ")}${failedIds.length > 10 ? 
" …" : ""}`, + ); + await this.setIndexStatus( + sourceConfig.type, + sourceConfig.name, + "error", + `${failedIds.length} item(s) failed to index/remove; state token held for retry`, + ); + return false; } await upsertIndexState({ diff --git a/src/indexing/pipeline.ts b/src/indexing/pipeline.ts index 2f7a05f..b98eab8 100644 --- a/src/indexing/pipeline.ts +++ b/src/indexing/pipeline.ts @@ -3,7 +3,7 @@ import { getChunker } from "./chunking/index.js"; import { deriveUrl } from "./url-derivation.js"; import type { EmbeddingProvider } from "./embeddings.js"; -import { upsertChunks, deleteChunksByFile } from "../db/queries.js"; +import { replaceChunksForFile, deleteChunksByFile } from "../db/queries.js"; import { isFileSourceConfig } from "../types.js"; import type { Chunk, SourceConfig } from "../types.js"; import type { ContentItem } from "./providers/types.js"; @@ -23,25 +23,55 @@ export class IndexingPipeline { } /** - * Index a batch of content items: chunk → embed → upsert. - * Each item's existing chunks are deleted first to handle shrinkage. + * Index a batch of content items: chunk → embed → upsert. Each item is + * replaced atomically via {@link replaceChunksForFile} (delete + insert in a + * single transaction), so a failed insert never leaves an item's pre-existing + * chunks deleted-but-not-replaced. + * + * A single item's failure must not abort the batch (the remaining items still + * index), but it MUST be surfaced: the returned `failedIds` lists every item + * whose `indexItem` threw. The caller uses this to avoid advancing the index + * state token past items that did not actually index — otherwise a failed + * item falls behind the advanced token and is never re-processed (permanent + * silent data loss). 
*/ - async indexItems(items: ContentItem[], stateToken: string): Promise { + async indexItems( + items: ContentItem[], + stateToken: string, + ): Promise<{ failedIds: string[] }> { + const failedIds: string[] = []; for (const item of items) { try { await this.indexItem(item, stateToken); } catch (err) { - const msg = err instanceof Error ? err.message : String(err); - console.error(`${this.logPrefix} Failed to index ${item.id}: ${msg}`); + // Log the full error (not just err.message) so the stack + any + // pg-level metadata survives for diagnosis; collect the id so the + // caller can hold the state token back. + console.error(`${this.logPrefix} Failed to index ${item.id}:`, err); + failedIds.push(item.id); } } + return { failedIds }; } - /** Remove items from the index by ID. */ - async removeItems(ids: string[]): Promise { + /** + * Remove items from the index by ID. Mirrors {@link indexItems}: a single + * failing delete must not abort the batch, so the remaining ids are still + * processed — but the failed ids are RETURNED so the caller does not advance + * the index state token over items whose stale chunks are still in the index. + */ + async removeItems(ids: string[]): Promise<{ failedIds: string[] }> { + const failedIds: string[] = []; for (const id of ids) { - await deleteChunksByFile(this.sourceConfig.name, id); + try { + await deleteChunksByFile(this.sourceConfig.name, id); + } catch (err) { + // Log the full error (not just err.message) so the stack survives. 
+ console.error(`${this.logPrefix} Failed to remove ${id}:`, err); + failedIds.push(id); + } } + return { failedIds }; } private async indexItem( @@ -49,14 +79,39 @@ export class IndexingPipeline { stateToken: string, ): Promise { const chunker = getChunker(this.sourceConfig.type); - const chunkOutputs = chunker(item.content, item.id, this.sourceConfig); + const chunkOutputs = chunker( + item.content, + item.id, + this.sourceConfig, + item.absolutePath, + ); if (chunkOutputs.length === 0) { + // The item produced zero chunks. If it previously had chunks (and is + // routed through `items` rather than `removedIds`), early-returning here + // would leave those stale chunks in the index forever. Clear them via the + // delete-only path of replaceChunksForFile (empty array → DELETE, no + // INSERT). Harmless when the file never had chunks (the DELETE matches + // nothing). No embedding round-trip is needed since there's nothing to + // embed. + await replaceChunksForFile(this.sourceConfig.name, item.id, []); return; } - const texts = chunkOutputs.map((c) => c.content); + // Embed the chunk's title + heading path alongside its content so that + // precise symbol/prop/heading queries retain their strongest anchor. + // Code chunks (which may lack a title/heading) fall back to content only. + const texts = chunkOutputs.map((c) => + [c.title, c.headingPath?.join(" > "), c.content] + .filter(Boolean) + .join("\n"), + ); const embeddings = await this.embeddingProvider.embedBatch(texts); + if (embeddings.length !== texts.length) { + throw new Error( + `Embedding count mismatch for item ${item.id}: expected ${texts.length}, got ${embeddings.length}`, + ); + } const sourceUrl = item.sourceUrl ?? (isFileSourceConfig(this.sourceConfig) @@ -77,15 +132,26 @@ export class IndexingPipeline { end_line: chunk.endLine ?? null, language: chunk.language ?? 
null, chunk_index: chunk.chunkIndex, + // Spread item.metadata FIRST so the chunk-derived headingPath always + // wins: it is embedded into the vector above and is load-bearing for + // retrieval, so a provider's metadata.headingPath must not clobber it. + // Guard on `.length` (not mere truthiness): an empty array `[]` is truthy + // and would otherwise clobber a provider's headingPath with nothing. metadata: { - ...(chunk.headingPath ? { headingPath: chunk.headingPath } : {}), ...(item.metadata ?? {}), + ...(chunk.headingPath?.length + ? { headingPath: chunk.headingPath } + : {}), }, commit_sha: stateToken, version: this.sourceConfig.version ?? null, })); - await deleteChunksByFile(this.sourceConfig.name, item.id); - await upsertChunks(chunks); + // Atomic delete+insert: replaceChunksForFile runs the DELETE and the INSERTs + // on a single client inside one transaction. If any insert fails the whole + // operation rolls back, so the item's PRE-EXISTING chunks are never left + // deleted-but-not-replaced (indexItems catches the error and reports the id + // in failedIds, so the caller holds its state token and retries the item). + await replaceChunksForFile(this.sourceConfig.name, item.id, chunks); } } diff --git a/src/indexing/providers/file.ts b/src/indexing/providers/file.ts index 88ed55e..f4d9a3d 100644 --- a/src/indexing/providers/file.ts +++ b/src/indexing/providers/file.ts @@ -36,6 +36,29 @@ function authenticatedUrl(repoUrl: string, githubToken?: string): string { return repoUrl; } +/** + * Build the per-file contribution to a local source's state token. + * + * The token folds path + mtime + size together so that change detection + * triggers a re-index when *either* mtime *or* size changes. Including size + * (in addition to mtime) catches content edits that preserve mtime — e.g. 
+ * + * Remaining limitation: a content change that preserves *both* mtime *and* + * size (an in-place edit of equal length) is still undetected by this token. + * Hashing file content would close that gap but would require reading every + * file on each scan; size is the minimal correct improvement that avoids that + * cost. + */ +export function localFileHashInput( + relPath: string, + mtimeMs: number, + size: number, +): string { + return `${relPath}:${mtimeMs}:${size}\n`; +} + export class FileDataProvider implements DataProvider { private config: FileSourceConfig; private options: ProviderOptions; @@ -131,7 +154,7 @@ export class FileDataProvider implements DataProvider { this.config.type, ); if (hasLowSemanticValue(content)) continue; - items.push({ id: relPath, content, metadata }); + items.push({ id: relPath, absolutePath: absPath, content, metadata }); } catch (err) { const msg = err instanceof Error ? err.message : String(err); console.error(`${this.logPrefix} Failed to read ${relPath}: ${msg}`); @@ -207,9 +230,14 @@ export class FileDataProvider implements DataProvider { .split("\n") .map((f) => f.trim()) .filter((f) => f.length > 0); - const pathPrefix = this.config.path - ? this.config.path.replace(/\/$/, "") + "/" - : ""; + // Treat "." (and "") as "no prefix": a repo-root source walks from the + // repo root, and git-diff paths are repo-root-relative with NO leading + // "./". Deriving a "./" prefix from the truthy "." would filter out EVERY + // changed/deleted/renamed path, silently indexing nothing while advancing + // the state token. Mirrors the `path !== "."` guard in reindex-audit.ts. + const normPath = + this.config.path && this.config.path !== "." ? this.config.path : ""; + const pathPrefix = normPath ? normPath.replace(/\/$/, "") + "/" : ""; const scopedChanged = pathPrefix ? 
changedFiles.filter((f) => f.startsWith(pathPrefix)) : changedFiles; @@ -217,16 +245,13 @@ export class FileDataProvider implements DataProvider { .filter((f) => !f.split("/").some((seg) => this.skipDirs.has(seg))) .filter((f) => matchesPatterns(f, this.config)); - if (matchingChanged.length === 0) { - console.log(`${this.logPrefix} No matching changes detected`); - return { items: [], removedIds: [], stateToken: headSha }; - } - - console.log( - `${this.logPrefix} Incremental acquire: ${matchingChanged.length} changed files`, - ); - - // Find deleted files + // Find deleted/renamed files. This MUST run even when matchingChanged is + // empty: a commit whose only matching-relevant change is a rename of a + // MATCHED file to a NON-matched extension (e.g. `docs/a.md` → `docs/b.txt`) + // produces an empty `--name-only` match set (only the non-matching new path + // is listed) yet still removes `docs/a.md` from the index. Detecting + // removals before the no-matching-changes short-circuit upholds the "never + // silently advance the state token past a removal" guarantee below. let removedFiles: string[] = []; try { const diffStatusOutput = await git.diff([ @@ -252,25 +277,66 @@ export class FileDataProvider implements DataProvider { .filter((f) => matchesPatterns(f, this.config)); removedFiles = [...deletedFiles, ...renamedOldPaths]; } catch (err) { - console.warn( - `${this.logPrefix} git diff --name-status failed, skipping deletion detection:`, - err instanceof Error ? err.message : err, + // Deletion detection failed. Do NOT swallow it as `removedFiles = []`: + // the changed-files diff already succeeded, so the caller would advance + // the state token while silently leaving stale/deleted docs in the index + // forever (a transient git error masquerading as "no deletions"). Throw + // so the orchestrator marks the run errored and holds the prior token — + // the next incremental run re-diffs from the same point and re-detects + // the deletions. 
(The changed-files diff failing earlier legitimately + // falls back to fullAcquire, which does its own DB-vs-disk deletion + // detection; this branch is specifically the case where we KNOW there + // were changes but can't tell which were deletions.) + const msg = err instanceof Error ? err.message : String(err); + console.error( + `${this.logPrefix} git diff --name-status (deletion detection) failed:`, + err, ); + throw new Error(`${this.logPrefix} deletion detection failed: ${msg}`); + } + + if (matchingChanged.length === 0) { + // Genuine no-op only when there is also nothing to remove. When a rename + // out of the matched set leaves removals (removedFiles.length > 0), we + // must still process those deletions while advancing the token, rather + // than short-circuiting with removedIds: [] (which would strand the + // renamed-away file's chunks in the index forever). + if (removedFiles.length === 0) { + console.log(`${this.logPrefix} No matching changes detected`); + return { items: [], removedIds: [], stateToken: headSha }; + } + return { items: [], removedIds: removedFiles, stateToken: headSha }; } + console.log( + `${this.logPrefix} Incremental acquire: ${matchingChanged.length} changed files`, + ); + // Read changed (non-deleted) files const filesToRead = matchingChanged.filter( (f) => !removedFiles.includes(f), ); const items: ContentItem[] = []; - const skippedFiles: string[] = []; + // Files that should legitimately leave the index: size-exceeded and + // low-semantic-value content no longer belongs in the index, so it is safe + // (and correct) to fold these into removedIds. + const removedForContent: string[] = []; + // Read/extraction FAILURES on files that still exist on disk. These must NOT + // be deleted and must NOT let the state token advance over them — a + // transient EACCES/EIO/ENOMEM or an extractor parse error is not an + // intentional removal. 
Mirror the deletion-detection precedent above: throw + // after the loop so the orchestrator marks the run errored and holds the + // prior token, and the next incremental run re-diffs and retries the file. + // (Asymmetric-by-design with fullAcquire, which computes stale files from + // disk presence rather than post-extraction items for the same reason.) + const readFailures: string[] = []; for (const relPath of filesToRead) { const absPath = path.join(repoDir, relPath); if (!fs.existsSync(absPath)) continue; try { const stat = await fs.promises.stat(absPath); if (stat.size > this.maxFileSize) { - skippedFiles.push(relPath); + removedForContent.push(relPath); continue; } const { content, metadata } = await extractContent( @@ -278,20 +344,31 @@ export class FileDataProvider implements DataProvider { this.config.type, ); if (hasLowSemanticValue(content)) { - skippedFiles.push(relPath); + removedForContent.push(relPath); continue; } - items.push({ id: relPath, content, metadata }); + items.push({ id: relPath, absolutePath: absPath, content, metadata }); } catch (err) { const msg = err instanceof Error ? err.message : String(err); console.error(`${this.logPrefix} Failed to read ${relPath}: ${msg}`); - skippedFiles.push(relPath); + readFailures.push(relPath); } } + if (readFailures.length > 0) { + // Do NOT delete these files' chunks and do NOT advance the token: the + // files still exist on disk and only failed to read/extract this run. A + // later incremental diff won't re-list an unchanged file, so deleting now + // would permanently lose their chunks. Throw to hold the prior token for + // retry (matches the deletion-detection branch above). 
+ throw new Error( + `${this.logPrefix} read/extraction failed for ${readFailures.length} changed file(s); holding state token for retry: ${readFailures.join(", ")}`, + ); + } + return { items, - removedIds: [...removedFiles, ...skippedFiles], + removedIds: [...removedFiles, ...removedForContent], stateToken: headSha, }; } @@ -425,9 +502,31 @@ export class FileDataProvider implements DataProvider { for (const f of files.sort()) { try { const stat = await fs.promises.stat(f); - hash.update(`${path.relative(walkRoot, f)}:${stat.mtimeMs}\n`); - } catch { - // File may have been deleted between walk and hash; skip it + // Fold path + mtime + size so a re-index triggers when either mtime + // or size changes. Including size catches mtime-preserving edits + // (cp -p, git checkout/restore, rsync --times); a same-mtime, + // same-size edit is the remaining undetected case. See + // localFileHashInput for the full rationale. + hash.update( + localFileHashInput( + path.relative(walkRoot, f), + stat.mtimeMs, + stat.size, + ), + ); + } catch (err) { + // ENOENT is the documented delete-after-walk race (the file vanished + // between walkFiles and this stat) — skip it silently. Any other error + // (EACCES, EIO, …) is a systemic stat failure that would silently skew + // the change-detection hash, so surface it via console.warn rather than + // swallowing it blind. + const code = (err as NodeJS.ErrnoException)?.code; + if (code !== "ENOENT") { + console.warn( + `${this.logPrefix} Unable to stat ${f} while hashing:`, + err instanceof Error ? 
err.message : err, + ); + } } } return `local-${hash.digest("hex").slice(0, 12)}`; diff --git a/src/indexing/providers/types.ts b/src/indexing/providers/types.ts index 3fd233e..f577af9 100644 --- a/src/indexing/providers/types.ts +++ b/src/indexing/providers/types.ts @@ -6,6 +6,12 @@ import type { SourceConfig } from "../../types.js"; export interface ContentItem { /** Unique identifier within the source (file path, thread ID, page ID, etc.) */ id: string; + /** + * Absolute filesystem path of the item's source file, when it has one. + * File-backed providers set this so chunkers can resolve sibling files + * (e.g. inlining MDX `@/snippets/*` imports). Non-file sources omit it. + */ + absolutePath?: string; /** Raw content to be chunked */ content: string; /** Human-readable title (optional — chunker may derive one) */ diff --git a/src/mcp/server.ts b/src/mcp/server.ts index 6c8e916..b556108 100644 --- a/src/mcp/server.ts +++ b/src/mcp/server.ts @@ -23,6 +23,12 @@ export function createMcpServer( telemetry?: BashTelemetry, workspace?: WorkspaceManager, hooks?: { onToolCall?: () => void }, + // Accessor for the per-session request-origin tag (user|synthetic|analysis) + // captured from the X-Pathfinder-Source header on the MCP init request. + // Threaded into the RAG tool handlers so each query_log row records who + // originated the traffic. Optional so existing callers/tests keep compiling; + // when absent the writer defaults the column to 'user'. 
+ getRequestSource?: () => string | undefined, ): McpServer { const cfg = getConfig(); const serverCfg = getServerConfig(); @@ -55,6 +61,8 @@ export function createMcpServer( case "search": registerSearchTool(server, getEmbeddingProvider(), tool, { onToolCall: hooks?.onToolCall, + getSessionId, + getRequestSource, }); break; case "bash": { @@ -92,6 +100,8 @@ export function createMcpServer( case "knowledge": registerKnowledgeTool(server, getEmbeddingProvider(), tool, { onToolCall: hooks?.onToolCall, + getSessionId, + getRequestSource, }); break; default: { diff --git a/src/mcp/tools/knowledge.ts b/src/mcp/tools/knowledge.ts index eb01aa2..f049483 100644 --- a/src/mcp/tools/knowledge.ts +++ b/src/mcp/tools/knowledge.ts @@ -6,7 +6,11 @@ import type { FaqChunkResult, ChunkResult, } from "../../types.js"; -import { getFaqChunks, searchChunks } from "../../db/queries.js"; +import { + getFaqChunks, + getFaqChunksByIds, + searchChunks, +} from "../../db/queries.js"; import { logQuery } from "../../db/analytics.js"; import { getAnalyticsConfig } from "../../config.js"; @@ -21,7 +25,7 @@ export function formatFaqResults(results: FaqChunkResult[]): string { [ `Q&A ${i + 1}`, `QUESTION: ${r.title || "(untitled)"}`, - `ANSWER: ${extractAnswer(r.content)}`, + `ANSWER: ${extractAnswer(r.content, r.source_url || r.file_path)}`, `SOURCE: ${r.source_url || r.file_path}`, `CONFIDENCE: ${r.confidence.toFixed(2)}`, ].join("\n"), @@ -30,12 +34,25 @@ export function formatFaqResults(results: FaqChunkResult[]): string { } /** - * Extract the answer portion from Q&A content format "Q: ...\n\nA: ..." + * Extract the answer portion from Q&A content format "Q: ...\n\nA: ...". + * + * Also handles content that begins directly with the answer delimiter + * ("A: ..." with no preceding Q line / newline). 
When no delimiter is present + * at all, falls back to returning the full content (which may include the raw + * "Q:" text) and emits a `console.warn` so the leak is visible at the default + * log level (debug is suppressed in production). `chunkId` (a file_path or + * source_url) is included in the warning so the offending row is locatable. */ -function extractAnswer(content: string): string { - const match = content.match(/\nA:\s*([\s\S]*)/); +function extractAnswer(content: string, chunkId?: string): string { + // Prefer a delimiter on its own line ("...\nA: ..."), but also accept a + // leading "A:" at the very start of the content (no preceding newline). + const match = content.match(/(?:^|\n)A:\s*([\s\S]*)/); if (match) return match[1].trim(); - // Fallback: return full content + // Fallback: no answer delimiter found — return the full blob (leaks the Q: + // text + delimiters) and warn so we can spot malformed Q&A content. + console.warn( + `[knowledge] extractAnswer: no "A:" delimiter found${chunkId ? ` for ${chunkId}` : ""}; returning full content (len=${content.length})`, + ); return content; } @@ -47,7 +64,14 @@ export function registerKnowledgeTool( server: McpServer, embeddingClient: EmbeddingProvider, toolConfig: KnowledgeToolConfig, - options?: { onToolCall?: () => void }, + options?: { + onToolCall?: () => void; + // Per-session accessors resolved at call time — see registerSearchTool for + // the rationale. getSessionId persists session_id; getRequestSource + // persists the X-Pathfinder-Source origin tag on each query_log row. 
+ getSessionId?: () => string | undefined; + getRequestSource?: () => string | undefined; + }, ): void { const inputSchema = { query: z @@ -84,7 +108,10 @@ export function registerKnowledgeTool( try { if (!query || query.trim() === "") { - // Browse mode: return all FAQ entries above confidence + // Browse mode: return the most-recent N FAQ entries above confidence + // (effectiveLimit caps the listing — getFaqChunks orders by + // indexed_at DESC and applies the LIMIT, so this is NOT "all" above + // confidence when more than `limit` qualify). const chunks = await getFaqChunks( toolConfig.sources, effectiveConfidence, @@ -101,7 +128,8 @@ export function registerKnowledgeTool( top_score: null, latency_ms: Date.now() - startMs, source_name: toolConfig.sources.join(","), - session_id: null, + session_id: options?.getSessionId?.() ?? null, + request_source: options?.getRequestSource?.() ?? null, }, analyticsConfig?.log_queries ?? true, ).catch((err) => { @@ -119,41 +147,57 @@ export function registerKnowledgeTool( // Search mode: embed query, search each source, merge, filter by confidence const embedding = await embeddingClient.embed(query); + // Over-fetch candidates per source so the confidence filter has a + // backfill pool. Fetching only `effectiveLimit` and slicing the top-N + // BEFORE filtering by confidence dropped below-confidence top-N hits + // with nothing to replace them, returning fewer than `limit` results + // even when more-confident hits existed just past the window. Pulling + // `effectiveLimit * 2` (mirrors candidateLimit = limit*2 in + // hybridSearchChunks), filtering, THEN slicing to `effectiveLimit` + // reaches `limit` whenever enough qualifying FAQ entries exist. 
+ const candidateLimit = effectiveLimit * 2; + // Search each source independently and merge const allResults: ChunkResult[] = []; for (const sourceName of toolConfig.sources) { const results = await searchChunks( embedding, - effectiveLimit, + candidateLimit, sourceName, ); allResults.push(...results); } - // Sort by similarity descending, take top N + // Sort the full candidate pool by similarity descending. allResults.sort((a, b) => b.similarity - a.similarity); - const topResults = allResults.slice(0, effectiveLimit); - // Now get FAQ chunks (with confidence) for the same sources to cross-reference - // Use a very low confidence threshold (0) to get all, then filter - const faqChunks = await getFaqChunks( - toolConfig.sources, - 0, - effectiveLimit * 5, - ); + // Fetch FAQ metadata (with confidence) for EXACTLY the candidate ids. + // Looking up by id (vs an indexed_at-DESC top-N window) keeps every + // ranked hit so a relevant high-similarity hit is never dropped just + // because it falls outside a recency window. Skip the round-trip when + // there are no candidates to look up. + const faqChunks = + allResults.length > 0 + ? await getFaqChunksByIds(allResults.map((r) => r.id)) + : []; const faqById = new Map(faqChunks.map((c) => [c.id, c])); - // Merge: keep search results that have FAQ metadata and meet confidence threshold - const mergedResults: FaqChunkResult[] = []; - for (const result of topResults) { + // Merge: keep candidates that have FAQ metadata and meet the + // confidence threshold (preserving similarity order), THEN slice to + // `effectiveLimit`. Filtering before the slice is what lets a + // below-confidence top-N hit be backfilled by a more-confident + // deeper hit instead of leaving the result set short. 
+ const qualifying: FaqChunkResult[] = []; + for (const result of allResults) { const faqChunk = faqById.get(result.id); if (faqChunk && faqChunk.confidence >= effectiveConfidence) { - mergedResults.push({ + qualifying.push({ ...faqChunk, similarity: result.similarity, }); } } + const mergedResults = qualifying.slice(0, effectiveLimit); // Fire-and-forget analytics logging const analyticsConfig = getAnalyticsConfig(); @@ -169,7 +213,8 @@ export function registerKnowledgeTool( top_score: topScore, latency_ms: Date.now() - startMs, source_name: toolConfig.sources.join(","), - session_id: null, + session_id: options?.getSessionId?.() ?? null, + request_source: options?.getRequestSource?.() ?? null, }, analyticsConfig?.log_queries ?? true, ).catch((err) => { diff --git a/src/mcp/tools/search.ts b/src/mcp/tools/search.ts index 338562e..2e2a4bc 100644 --- a/src/mcp/tools/search.ts +++ b/src/mcp/tools/search.ts @@ -69,7 +69,16 @@ export function registerSearchTool( server: McpServer, embeddingClient: EmbeddingProvider, toolConfig: SearchToolConfig, - options?: { onToolCall?: () => void }, + options?: { + onToolCall?: () => void; + // Per-session accessors resolved at call time (the MCP session id isn't + // known until the transport connects). getSessionId persists a real + // session_id on each query_log row; getRequestSource persists the + // X-Pathfinder-Source origin tag. Both optional so older callers/tests + // keep working — the analytics writer defaults a missing source to 'user'. + getSessionId?: () => string | undefined; + getRequestSource?: () => string | undefined; + }, ): void { const inputSchema = { query: z.string().describe("The search query"), @@ -166,7 +175,8 @@ export function registerSearchTool( top_score: topScore, latency_ms: latencyMs, source_name: toolConfig.source, - session_id: null, + session_id: options?.getSessionId?.() ?? null, + request_source: options?.getRequestSource?.() ?? 
null, }, logQueries, ).catch((err) => { diff --git a/src/server.ts b/src/server.ts index 5c3149b..70a39ed 100644 --- a/src/server.ts +++ b/src/server.ts @@ -76,8 +76,11 @@ import { getTopQueries, getEmptyQueries, getToolCounts, + normalizeRequestSource, + REQUEST_SOURCE_HEADER, + REQUEST_SOURCE_VALUES, } from "./db/analytics.js"; -import type { AnalyticsFilter } from "./db/analytics.js"; +import type { AnalyticsFilter, RequestSource } from "./db/analytics.js"; import { approveAtlasSeedEntry, listPendingAtlasSeedCandidates, @@ -1023,23 +1026,17 @@ export function write429RateLimited( /** * Post-accept handler for the /mcp `onsessioninitialized` callback. Extracted - * from the callback so we can (a) wrap the workspaceManager.ensureSession - * call in a try/catch that ROLLS BACK the ipLimiter counter and tears down - * transport state if ensureSession throws, and (b) test that rollback in - * isolation without driving a full SDK lifecycle. + * from the callback so the "new session accepted" side effects (the connect + * log line + the pathfinder.session.created telemetry emit) can be unit-tested + * in isolation without driving a full SDK lifecycle. * - * On ensureSession failure (ENOSPC, EACCES, corrupted workspace, DB error): - * - Log the failure with sid-prefix + ip so operators can diagnose. - * - Call ipLimiter.remove(sid) so the pre-increment doesn't leak and - * permanently count against this IP until TTL reap. - * - Delete transports[sid] / sessionLastActivity[sid] inline (same - * mid-init teardown reasoning as handleSessionInitRaceFallback). - * - Fire-and-forget transport.close() with Promise-wrapped error handling - * so async rejections land in console.error rather than - * unhandledRejection. - * - Do NOT emit a JSON-RPC error frame — the MCP SDK lifecycle constraint - * documented on handleSessionInitRaceFallback applies identically here - * (`transport.send()` would throw "Not connected"). 
+ * Workspaces are allocated LAZILY — this handler does NOT call + * `workspaceManager.ensureSession`; bash tool handlers allocate per-operation + * instead. There is therefore no ensureSession failure path and no rollback: + * the function unconditionally returns `true` (the caller treats `false` as + * "rejected", which this handler never produces). The transport/ipLimiter + * teardown for a stranded session lives on the caller's `transport.onclose` + * wiring and {@link rollbackSessionAfterConnectFailure}, not here. * * Exported for tests; production callers wire this up via the POST /mcp * onsessioninitialized callback. @@ -1051,25 +1048,26 @@ export function handleSessionInitAccept(opts: { transports: Record; sessionLastActivity: Record; ipLimiter?: { remove: (sid: string) => void }; + /** + * Accepted for call-site compatibility but no longer used by this handler: + * workspaces are allocated lazily by the bash tool handlers, not eagerly + * here. Retained so the production caller's options object (which wires the + * module-scope managers) doesn't need to change. + */ workspaceManager?: { ensureSession: (sid: string) => void }; /** - * Session state manager (bash tool per-session shell state). Optional so - * existing test fixtures keep working; the production caller always passes - * the module-scope instance. Rollback clears it alongside the ipLimiter - * counter so a subsequent ensureSession throw-path doesn't leak shell - * state if the accept handler ever starts registering state before the - * ensureSession call. + * Accepted for call-site compatibility but no longer used by this handler. + * It was previously cleared on the ensureSession rollback path, which no + * longer exists (lazy allocation). Per-session shell-state teardown happens + * on the caller's transport.onclose. Retained so existing call sites keep + * compiling. */ sessionStateManager?: { cleanup: (sid: string) => void }; /** - * Express response for the original /mcp init request. 
When provided AND - * headers haven't been sent yet, rollback writes a structured 503 body so - * the client sees a diagnostic error instead of a silent transport - * teardown. When res.headersSent is already true we can't write (the SDK - * may have begun streaming before the ensureSession throw — unlikely given - * the order — so we fall back to just closing the transport). Optional to - * preserve existing unit test call sites that don't care about the - * response shape. + * Accepted for call-site compatibility but no longer used by this handler. + * It previously carried the rollback 503 body; with no ensureSession + * failure path there is nothing to write here. Retained so existing call + * sites keep compiling. */ res?: { headersSent: boolean; @@ -1104,11 +1102,10 @@ export function handleSessionInitAccept(opts: { `[mcp] New session ${sid.slice(0, 8)} (${Object.keys(tMap).length} active) [${ip}]`, ); - // Telemetry — fire only after ensureSession succeeded so we don't - // emit on rolled-back sessions. Mirrors the SSE handler's after-accept - // gate. The client's own no-op handles disabled/unconfigured cases; - // isEnabled() avoids constructing the property bag on every connect - // when telemetry is off. + // Telemetry — emitted on every accepted session. Mirrors the SSE + // handler's after-accept gate. The client's own no-op handles + // disabled/unconfigured cases; isEnabled() avoids constructing the + // property bag on every connect when telemetry is off. if (p2pTelemetry?.isEnabled()) { p2pTelemetry.emit("pathfinder.session.created", { client_ip: ip, @@ -1491,6 +1488,26 @@ export async function handleExistingSessionRequest(opts: { opts.sessionLastActivity[opts.sid] = now(); } +/** + * Read and normalize the request-origin tag from the X-Pathfinder-Source + * header. 
Captured ONCE at MCP-session init and closed over for the lifetime + * of the session (each session gets its own server + transport), so every + * tool call within that session records the origin its client declared. + * + * Express collapses duplicate headers to a comma-joined string and lower-cases + * the name; we hand whatever's present to normalizeRequestSource, which maps + * absent/unknown values to the default ('user'). For an array-shaped value (only + * possible for set-cookie under Express) the first element is used defensively. + * + * Exported for tests so the header→source mapping is verified without spinning + * up the full Express app. + */ +export function requestSourceFromHeaders(req: Request): RequestSource { + const raw = req.headers[REQUEST_SOURCE_HEADER]; + const value = Array.isArray(raw) ? raw[0] : raw; + return normalizeRequestSource(value); +} + app.post("/mcp", bearerMiddleware, async (req: Request, res: Response) => { try { const sessionId = req.headers["mcp-session-id"] as string | undefined; @@ -1795,6 +1812,11 @@ app.post("/mcp", bearerMiddleware, async (req: Request, res: Response) => { : ` (${total} active)`; console.log(`[mcp] Session ${sid.slice(0, 8)} closed${capInfo}`); }; + // Capture the request-origin tag from the init request. Each session + // gets its own server instance, so closing over this constant tags every + // subsequent tool call in the session with the origin the client + // declared on initialize. + const requestSource = requestSourceFromHeaders(req); const server = createMcpServer( bashInstances, sessionStateManager, @@ -1807,6 +1829,7 @@ app.post("/mcp", bearerMiddleware, async (req: Request, res: Response) => { if (sid) sessionHasBeenUsed[sid] = true; }, }, + () => requestSource, ); // Z-1: server.connect(transport) can throw AFTER handleSessionInitAccept // committed maps + ipLimiter counter + ensureSession + onclose wiring. 
@@ -1963,8 +1986,14 @@ const sseHandlers = createSseHandlers({ getTotalSessionCount: () => getTotalSessionCount(transports, sseTransports), getMaxSessions: () => MAX_SESSIONS, sessionHasBeenUsed, - createMcpServer: () => { + createMcpServer: (req?: Request) => { let transportRef: SSEServerTransport | undefined; + // Capture the request-origin tag from the /sse init request (when the + // handler provides it) so every tool call in this SSE session records the + // declared origin. Defaults to 'user' when the header/req is absent. + const requestSource = req + ? requestSourceFromHeaders(req) + : normalizeRequestSource(undefined); // The handler creates the transport first, then calls createMcpServer() // and connect(transport). We need the sessionId late-bound so bash tools // can discover it via getSessionId(). @@ -1980,6 +2009,7 @@ const sseHandlers = createSseHandlers({ if (sid) sessionHasBeenUsed[sid] = true; }, }, + () => requestSource, ); // Intercept connect() so we can capture the transport reference for // the getSessionId closure above. @@ -2689,7 +2719,15 @@ export function parseAnalyticsFilter(req: Request): AnalyticsFilterParseResult { } return undefined; }; - for (const name of ["from", "to", "tool_type", "source", "days", "limit"]) { + for (const name of [ + "from", + "to", + "tool_type", + "source", + "request_source", + "days", + "limit", + ]) { const err = rejectArray(name); if (err) return err; } @@ -2728,6 +2766,27 @@ export function parseAnalyticsFilter(req: Request): AnalyticsFilterParseResult { filter.source = req.query.source; } + // request_source selects the analytics audience. Omitting it keeps the + // default (real users only — see AnalyticsFilter). Accepted values are the + // canonical origins plus the literal "all" for the unfiltered view. An + // unrecognized value is a client bug; reject with 400 rather than silently + // falling back so a typo doesn't quietly skew which audience the dashboard + // shows. 
+ if (typeof req.query.request_source === "string") { + const allowed = [...REQUEST_SOURCE_VALUES, "all"]; + if (!allowed.includes(req.query.request_source)) { + return { + ok: false, + status: 400, + body: { + error: "invalid_request", + error_description: `request_source must be one of ${allowed.join(", ")}`, + }, + }; + } + filter.request_source = req.query.request_source as RequestSource | "all"; + } + const fromRaw = typeof req.query.from === "string" ? req.query.from : undefined; const toRaw = typeof req.query.to === "string" ? req.query.to : undefined; @@ -2853,10 +2912,14 @@ export function parsePositiveIntParam( return n; } -// Upper bound for the `days` query parameter. Kept at 100000 so the UI's -// "All time" preset (which sends days=ALL_TIME_DAYS=99999 — see -// docs/analytics.html) is comfortably under the cap. If you lower MAX_DAYS, -// make sure it stays >= 99999 or the "All time" preset will 400. +// Upper bound for the `days` query parameter. Kept above the all-time +// sentinel so the UI's "All time" preset (which sends days=ALL_TIME_DAYS — +// the canonical value, 99999, lives in db/analytics.ts and is mirrored by +// docs/analytics.html ALL_TIME_DAYS) is admitted, not rejected. A request at +// or above ALL_TIME_DAYS is treated by the DB layer as "no lower time bound" +// (see buildDateWindow), so the totals span every row. If you lower MAX_DAYS, +// keep it strictly above ALL_TIME_DAYS (99999) or the "All time" preset will +// 400. // // Exported so tests can reference the constant directly instead of // hardcoding the numeric literal (keeps one source of truth). diff --git a/src/sse-handlers.ts b/src/sse-handlers.ts index 305cf31..4fa2536 100644 --- a/src/sse-handlers.ts +++ b/src/sse-handlers.ts @@ -66,7 +66,13 @@ export interface SseHandlerDeps { | IpSessionLimiter | undefined | (() => IpSessionLimiter | undefined); - createMcpServer: () => McpServer; + /** + * Factory for a per-session MCP server. 
Receives the originating GET /sse + * request so the factory can read per-session request context (e.g. the + * X-Pathfinder-Source origin tag) off `req.headers`. The param is optional + * to stay backward-compatible with callers/tests that ignore the request. + */ + createMcpServer: (req?: Request) => McpServer; /** * Workspace manager. Accepts either a direct instance or a getter to support * late binding in server.ts. Uses the structural `WorkspaceManagerLike` @@ -346,8 +352,9 @@ export function createSseHandlers(deps: SseHandlerDeps): { // Attach a per-session MCP server. createMcpServer().connect() calls // transport.start() internally which writes SSE headers + the - // "endpoint" event to the response stream. - const server = createMcpServer(); + // "endpoint" event to the response stream. Pass `req` so the factory can + // read the X-Pathfinder-Source origin tag off the init request headers. + const server = createMcpServer(req); await server.connect(transport); console.log(