diff --git a/services/platform/convex/_generated/api.d.ts b/services/platform/convex/_generated/api.d.ts index 9b19d10d03..055fbb2d24 100644 --- a/services/platform/convex/_generated/api.d.ts +++ b/services/platform/convex/_generated/api.d.ts @@ -98,6 +98,7 @@ import type * as agent_tools_types from "../agent_tools/types.js"; import type * as agent_tools_web_helpers_browser_operate from "../agent_tools/web/helpers/browser_operate.js"; import type * as agent_tools_web_helpers_fetch_and_extract from "../agent_tools/web/helpers/fetch_and_extract.js"; import type * as agent_tools_web_helpers_format_web_results from "../agent_tools/web/helpers/format_web_results.js"; +import type * as agent_tools_web_helpers_format_website_summaries from "../agent_tools/web/helpers/format_website_summaries.js"; import type * as agent_tools_web_helpers_get_crawler_service_url from "../agent_tools/web/helpers/get_crawler_service_url.js"; import type * as agent_tools_web_helpers_get_operator_service_url from "../agent_tools/web/helpers/get_operator_service_url.js"; import type * as agent_tools_web_helpers_query_web_context from "../agent_tools/web/helpers/query_web_context.js"; @@ -1044,6 +1045,7 @@ declare const fullApi: ApiFromModules<{ "agent_tools/web/helpers/browser_operate": typeof agent_tools_web_helpers_browser_operate; "agent_tools/web/helpers/fetch_and_extract": typeof agent_tools_web_helpers_fetch_and_extract; "agent_tools/web/helpers/format_web_results": typeof agent_tools_web_helpers_format_web_results; + "agent_tools/web/helpers/format_website_summaries": typeof agent_tools_web_helpers_format_website_summaries; "agent_tools/web/helpers/get_crawler_service_url": typeof agent_tools_web_helpers_get_crawler_service_url; "agent_tools/web/helpers/get_operator_service_url": typeof agent_tools_web_helpers_get_operator_service_url; "agent_tools/web/helpers/query_web_context": typeof agent_tools_web_helpers_query_web_context; diff --git 
a/services/platform/convex/agent_tools/rag/query_rag_context.ts b/services/platform/convex/agent_tools/rag/query_rag_context.ts index 55c6ed9e5e..3e185504c7 100644 --- a/services/platform/convex/agent_tools/rag/query_rag_context.ts +++ b/services/platform/convex/agent_tools/rag/query_rag_context.ts @@ -21,7 +21,7 @@ import { const debugLog = createDebugLog('DEBUG_RAG_QUERY', '[RAGQuery]'); const DEFAULT_TOP_K = 10; -const DEFAULT_SIMILARITY_THRESHOLD = 0.4; +const DEFAULT_SIMILARITY_THRESHOLD = 0.51; const RAG_REQUEST_TIMEOUT_MS = 10000; // 10 seconds // Query expansion constants @@ -138,7 +138,7 @@ export interface RagContextOptions { * * @param userMessage - The user's message to search for relevant context * @param topK - Number of results to return (default: 5) - * @param similarityThreshold - Minimum similarity score (default: 0.3) + * @param similarityThreshold - Minimum similarity score (default: 0.51) * @param signal - Optional AbortSignal for timeout control * @param recentMessages - Optional recent conversation messages for context expansion * @param options - Optional multi-tenant options (userId, datasets) diff --git a/services/platform/convex/agent_tools/rag/rag_search_tool.ts b/services/platform/convex/agent_tools/rag/rag_search_tool.ts index 958a96f7c9..32d324a599 100644 --- a/services/platform/convex/agent_tools/rag/rag_search_tool.ts +++ b/services/platform/convex/agent_tools/rag/rag_search_tool.ts @@ -41,7 +41,7 @@ export interface AgentKnowledgeCtx extends ToolCtx { const debugLog = createDebugLog('DEBUG_AGENT_TOOLS', '[AgentTools]'); const DEFAULT_TOP_K = 10; -const DEFAULT_SIMILARITY_THRESHOLD = 0.4; +const DEFAULT_SIMILARITY_THRESHOLD = 0.51; export async function resolveFileIds( ctx: ToolCtx, diff --git a/services/platform/convex/agent_tools/web/helpers/format_website_summaries.ts b/services/platform/convex/agent_tools/web/helpers/format_website_summaries.ts new file mode 100644 index 0000000000..a433379c68 --- /dev/null +++ 
b/services/platform/convex/agent_tools/web/helpers/format_website_summaries.ts
@@ -0,0 +1,55 @@
+/**
+ * Format available website summaries for display in no-results messages.
+ *
+ * Queries the organization's indexed websites and returns a formatted
+ * bullet list, or undefined if no websites are configured.
+ */
+
+import type { ToolCtx } from '@convex-dev/agent';
+
+import { internal } from '../../../_generated/api';
+
+const MAX_LISTED_WEBSITES = 15;
+
+/**
+ * Query and format website summaries for the given organization.
+ *
+ * Each listed website becomes one bullet: the domain, then its title (or the
+ * description when the title is missing or empty), then its page count, e.g.
+ *   - docs.convex.dev — Convex documentation (245 pages)
+ *   - example.com (18 pages)
+ *
+ * At most MAX_LISTED_WEBSITES entries are listed; any remainder is collapsed
+ * into a trailing "... and N more" bullet.
+ *
+ * @param ctx - Tool context used to run the internal summaries query
+ * @param organizationId - Organization whose websites are listed
+ * @returns The bullet list, or undefined if no websites are configured
+ */
+export async function formatWebsiteSummaries(
+  ctx: ToolCtx,
+  organizationId: string,
+): Promise<string | undefined> {
+  const websites = await ctx.runQuery(
+    internal.websites.internal_queries.listWebsiteSummaries,
+    { organizationId },
+  );
+
+  if (!websites || websites.length === 0) return undefined;
+
+  const lines = websites.slice(0, MAX_LISTED_WEBSITES).map((w) => {
+    // `||` (not `??`): an empty-string title must fall through to the
+    // description instead of rendering a dangling " — ".
+    const label = w.title || w.description;
+    const labelPart = label ? ` — ${label}` : '';
+    const countPart = w.pageCount != null ? ` (${w.pageCount} pages)` : '';
+    return `- ${w.domain}${labelPart}${countPart}`;
+  });
+
+  const hiddenCount = websites.length - MAX_LISTED_WEBSITES;
+  if (hiddenCount > 0) {
+    lines.push(`- ... and ${hiddenCount} more`);
+  }
+
+  return lines.join('\n');
+}
diff --git a/services/platform/convex/agent_tools/web/helpers/query_web_context.ts b/services/platform/convex/agent_tools/web/helpers/query_web_context.ts
index 14dba36dd4..85aaff34f5 100644
--- a/services/platform/convex/agent_tools/web/helpers/query_web_context.ts
+++ b/services/platform/convex/agent_tools/web/helpers/query_web_context.ts
@@ -16,7 +16,7 @@ import { getCrawlerServiceUrl } from './get_crawler_service_url';
 const debugLog = createDebugLog('DEBUG_WEB_CONTEXT', '[WebContext]');
 
 const DEFAULT_LIMIT = 10;
-const DEFAULT_SIMILARITY_THRESHOLD = 0.4;
+const DEFAULT_SIMILARITY_THRESHOLD = 0.51;
 const WEB_CONTEXT_TIMEOUT_MS = 10_000;
 
 interface SearchResult {
diff --git a/services/platform/convex/agent_tools/web/helpers/search_pages.ts b/services/platform/convex/agent_tools/web/helpers/search_pages.ts
index 7de539d5a1..085b52801e 100644
--- a/services/platform/convex/agent_tools/web/helpers/search_pages.ts
+++ b/services/platform/convex/agent_tools/web/helpers/search_pages.ts
@@ -10,12 +10,13 @@ import type { ToolCtx } from '@convex-dev/agent';
 import { internal } from '../../../_generated/api';
 import { createDebugLog } from '../../../lib/debug_log';
 import { formatWebResults } from './format_web_results';
+import { formatWebsiteSummaries } from './format_website_summaries';
 import { getCrawlerServiceUrl } from './get_crawler_service_url';
 
 const debugLog = createDebugLog('DEBUG_AGENT_TOOLS', '[AgentTools]');
 
 const DEFAULT_LIMIT = 10;
-const DEFAULT_SIMILARITY_THRESHOLD = 0.4;
+const DEFAULT_SIMILARITY_THRESHOLD = 0.51;
 
 const DOMAIN_PATTERN = /^[a-zA-Z0-9]([a-zA-Z0-9-]*\.)*[a-zA-Z0-9-]+(:\d+)?$/;
 
@@ -128,8 +129,20 @@ export async function searchPages(
 
   if (!results || results.length === 0) {
     debugLog('web:search_pages no results', { query: args.query });
+
+    const summaryText = ctx.organizationId
+      ?
await formatWebsiteSummaries(ctx, ctx.organizationId) + : undefined; + + if (summaryText) { + return { + text: `No matching pages found for your query.\n\nThe search only covers websites added to your knowledge base. Currently indexed websites:\n${summaryText}\n\nYou can try rephrasing your query, specifying a domain filter, or use fetch mode with a specific URL to access any public webpage directly.`, + citations: [], + }; + } + return { - text: 'No matching website pages found for your query. Try rephrasing, or suggest the user add the relevant website to their knowledge base.', + text: 'No matching pages found. There are no websites currently in the knowledge base. To search website content, websites need to be added via the knowledge base settings first. Alternatively, use fetch mode with a specific URL to access any public webpage directly.', citations: [], }; } @@ -171,8 +184,14 @@ export async function searchPages( })); if (domainFallback) { + const summaryText = ctx.organizationId + ? await formatWebsiteSummaries(ctx, ctx.organizationId) + : undefined; + const availableNote = summaryText + ? `\n\nAvailable websites in the knowledge base:\n${summaryText}` + : ''; return { - text: `No results found on ${validDomain}. Showing results from all indexed websites:\n\n${output}`, + text: `No results found on ${validDomain}.${availableNote}\n\nShowing results from all indexed websites:\n\n${output}`, citations, }; } diff --git a/services/platform/convex/agent_tools/web/web_tool.ts b/services/platform/convex/agent_tools/web/web_tool.ts index 23ac6b26b3..4bc378fd74 100644 --- a/services/platform/convex/agent_tools/web/web_tool.ts +++ b/services/platform/convex/agent_tools/web/web_tool.ts @@ -1,11 +1,12 @@ /** * Convex Tool: Web * - * Two modes: - * 1. **URL fetch**: when the user provides a specific URL, fetch and extract + * Two modes (discriminated union on `mode`): + * 1. 
**fetch**: when the user provides a specific URL, fetch and extract
  *    its content directly (web pages, PDFs, images, DOCX, PPTX, etc.).
- * 2. **Semantic search**: when the user asks a question without a URL,
- *    search crawled website pages via vector embeddings.
+ *    Works with any public URL.
+ * 2. **search**: search crawled website pages via semantic similarity.
+ *    Only covers websites added to the organization's knowledge base.
  */
 
 import { createTool, type ToolCtx } from '@convex-dev/agent';
@@ -15,14 +16,8 @@ import type { ToolDefinition } from '../types';
 import { fetchAndExtract } from './helpers/fetch_and_extract';
 import { searchPages } from './helpers/search_pages';
 
-const URL_REGEX = /https?:\/\/[^\s"'<>]+/i;
 const FILE_EXTENSIONS = /\.(pdf|docx|pptx|png|jpe?g|gif|webp|bmp|tiff?|svg)$/i;
 
-function extractUrl(text: string): string | null {
-  const match = text.match(URL_REGEX);
-  return match ? match[0] : null;
-}
-
 function isFileUrl(url: string): boolean {
   try {
     const path = new URL(url).pathname;
@@ -32,54 +27,64 @@ function isFileUrl(url: string): boolean {
   }
 }
 
-const webToolArgs = z.object({
-  query: z
-    .string()
-    .describe(
-      'The user request or question. Used as extraction instruction when fetching a URL, or as a semantic search query over crawled pages.',
-    ),
-  url: z
-    .string()
-    .optional()
-    .describe(
-      'Explicit URL to fetch and extract content from. When provided, the tool fetches and extracts the URL content directly instead of searching.',
-    ),
-  domain: z
-    .string()
-    .optional()
-    .describe(
-      'Optional domain to restrict search to (e.g., "docs.convex.dev"). Only applies in search mode, ignored when fetching a URL.',
-    ),
-});
+// `fetch` variant: a required URL plus an optional extraction instruction.
+const fetchModeArgs = z.object({
+  mode: z.literal('fetch').describe(
+    'Fetch and extract content from a specific URL. Works with any public URL.',
+  ),
+  url: z.string().describe(
+    'The URL to fetch (web page, PDF, DOCX, PPTX, or image such as PNG, JPG, GIF, WebP, etc.)',
+  ),
+  query: z.string().optional().describe(
+    'Optional extraction instruction to guide what content to focus on.',
+  ),
+});
+
+// `search` variant: a required query plus an optional domain restriction.
+const searchModeArgs = z.object({
+  mode: z.literal('search').describe(
+    'Search through websites added to the knowledge base using semantic similarity.',
+  ),
+  query: z.string().describe('The search query.'),
+  domain: z.string().optional().describe(
+    'Optional domain to restrict search to (e.g., "docs.convex.dev").',
+  ),
+});
+
+/**
+ * Input schema for the web tool.
+ *
+ * `mode` is the discriminator of the union, so each call is validated
+ * against exactly one of the two variant schemas.
+ */
+const webToolArgs = z.discriminatedUnion('mode', [
+  fetchModeArgs,
+  searchModeArgs,
+]);
 
 export const webTool: ToolDefinition = {
   name: 'web',
   tool: createTool({
-    description: `Search crawled website pages or fetch content from a specific URL.
+    description: `Access web content in two modes:
 
-**Mode 1 — Fetch URL**: When the user provides a specific URL (via the \`url\` parameter, or a URL detected in \`query\`), fetch and extract its content directly. Supports web pages, PDFs, DOCX, PPTX, and images (PNG, JPG, GIF, WebP, etc.). The \`query\` is used as the extraction instruction to guide what content to focus on.
+**fetch**: Fetch and extract content from any public URL. Supports web pages, PDFs, DOCX, PPTX, and images (PNG, JPG, GIF, WebP, etc.). Use the \`query\` parameter as an extraction instruction to guide what content to focus on.
 
-**Mode 2 — Search**: When no URL is provided, search through previously crawled and indexed website content using semantic similarity. Returns ranked results with page URL, title, and relevant content excerpts.
+**search**: Search through websites that have been added to the organization's knowledge base. Only content from indexed knowledge base websites is searchable — this does NOT search the open internet. If the website you need isn't indexed, use fetch mode with a direct URL instead, or suggest the user add the website to their knowledge base.
IMPORTANT: Always cite the source URL for every piece of information you present from the results. EXAMPLES: -- { url: "https://example.com/report.pdf", query: "Summarize the key findings" } -- { url: "https://example.com/pricing", query: "Extract all pricing tiers" } -- { query: "https://example.com/page" } — URL detected in query, fetches directly -- { query: "shipping policy" } — no URL, searches crawled pages -- { query: "product pricing details" } -- { query: "workflow patterns", domain: "docs.convex.dev" } — searches only docs.convex.dev`, +- { mode: "fetch", url: "https://example.com/report.pdf", query: "Summarize the key findings" } +- { mode: "fetch", url: "https://example.com/pricing" } +- { mode: "search", query: "shipping policy" } +- { mode: "search", query: "workflow patterns", domain: "docs.convex.dev" }`, inputSchema: webToolArgs, execute: async (ctx: ToolCtx, args) => { - const targetUrl = args.url || extractUrl(args.query); - - if (targetUrl) { - const instruction = - args.url && isFileUrl(targetUrl) ? args.query : undefined; + if (args.mode === 'fetch') { + const instruction = isFileUrl(args.url) ? args.query : undefined; const result = await fetchAndExtract(ctx, { - url: targetUrl, + url: args.url, instruction, }); @@ -101,11 +106,23 @@ EXAMPLES: .filter(Boolean) .join(' | '); - const responseText = `${meta}\n\n${result.content}`; + const citationHeader = `[1] (Relevance: 100.0%) [Source: ${result.title ?? result.url}] [URL: ${result.url}]`; + const responseText = `${citationHeader}\n${meta}\n\n${result.content}`; + + const citations = [ + { + index: 1, + type: 'web' as const, + source: result.title ?? 
result.url, + url: result.url, + relevance: 1, + }, + ]; return { success: true, response: responseText, + citations, ...(result.usage && { usage: { inputTokens: result.usage.input_tokens, @@ -117,6 +134,7 @@ EXAMPLES: }; } + // mode === 'search' const { text: searchResult, citations } = await searchPages(ctx, { query: args.query, domain: args.domain, diff --git a/services/platform/convex/lib/agent_response/generate_response.ts b/services/platform/convex/lib/agent_response/generate_response.ts index 5c5606be8d..3f2f483fba 100644 --- a/services/platform/convex/lib/agent_response/generate_response.ts +++ b/services/platform/convex/lib/agent_response/generate_response.ts @@ -550,6 +550,7 @@ export async function generateAgentResponse( maxHistoryTokens: effectiveMaxHistoryTokens, ragContext: knowledgeContextResult?.text ?? hookData?.ragContext, webContext: webContextResult?.text, + promptMessageId, }); const contextBuildMs = Date.now() - contextBuildStart; @@ -981,6 +982,7 @@ export async function generateAgentResponse( parentThreadId, maxHistoryTokens: effectiveMaxHistoryTokens, ragContext: hookData?.ragContext, + promptMessageId, }); const continueAgent = createAgent(agentOptions); @@ -1211,6 +1213,7 @@ export async function generateAgentResponse( parentThreadId, maxHistoryTokens: effectiveMaxHistoryTokens, ragContext: hookData?.ragContext, + promptMessageId, }); const recoveryAgent = createAgent(agentOptions); diff --git a/services/platform/convex/lib/context_management/structured_context_builder.ts b/services/platform/convex/lib/context_management/structured_context_builder.ts index b5f4ed61d9..1519f48722 100644 --- a/services/platform/convex/lib/context_management/structured_context_builder.ts +++ b/services/platform/convex/lib/context_management/structured_context_builder.ts @@ -109,6 +109,12 @@ export interface BuildStructuredContextParams { additionalContext?: Record; /** Parent thread ID (for sub-agent mode, indicates this is a delegated task) */ 
parentThreadId?: string; + /** ID of the message being sent as the `prompt` parameter to the LLM. + * When set, only this message is excluded from history (to avoid duplication). + * Without it the builder falls back to skipping the last user message, which + * can drop context when the prompt is actually a system message (e.g. location + * response). */ + promptMessageId?: string; } /** @@ -131,6 +137,7 @@ export async function buildStructuredContext( maxHistoryTokens = DEFAULT_MAX_HISTORY_TOKENS, additionalContext, parentThreadId, + promptMessageId, } = params; // 1. Load message history and approvals in parallel (independent queries) @@ -169,6 +176,7 @@ export async function buildStructuredContext( messages, approvals ?? [], toolMessageAges, + promptMessageId, ); if (historyMessages.length > 0) { contextParts.push(fmt.formatHistorySection(historyMessages.join('\n\n'))); @@ -343,6 +351,7 @@ function formatMessagesWithApprovals( messages: MessageDoc[], approvals: ApprovalItem[], toolMessageAges?: Map, + promptMessageId?: string, ): FormattedMessagesResult { const result: string[] = []; @@ -365,12 +374,19 @@ function formatMessagesWithApprovals( return a.stepOrder - b.stepOrder; }); - // Find the last user message (current request, not history) + // Determine which user message to skip (it's passed via `prompt` parameter, + // not in context). When `promptMessageId` is provided we skip only the exact + // message being used as the prompt — this avoids dropping the original user + // question when the prompt is actually a system message (e.g. location + // response). Without an explicit ID we fall back to the last user message. 
+ const skipMessageId: string | undefined = promptMessageId; let lastUserMsgIndex = -1; - for (let i = sortedMessages.length - 1; i >= 0; i--) { - if (sortedMessages[i].message?.role === 'user') { - lastUserMsgIndex = i; - break; + if (!skipMessageId) { + for (let i = sortedMessages.length - 1; i >= 0; i--) { + if (sortedMessages[i].message?.role === 'user') { + lastUserMsgIndex = i; + break; + } } } @@ -381,11 +397,17 @@ function formatMessagesWithApprovals( if (!message) continue; + // Skip the message being sent as `prompt` (any role) to avoid duplication + if (skipMessageId && msg._id === skipMessageId) continue; + if (message.role === 'user') { const content = extractTextContent(message.content); if (content) { - // Skip the last user message - it's passed via `prompt` parameter, not in context - if (i !== lastUserMsgIndex) { + // Fallback: when no explicit promptMessageId, skip the last user message + // (assumed to be passed via `prompt` parameter) + if (!skipMessageId && i === lastUserMsgIndex) { + // skip — already sent as prompt + } else { result.push(fmt.formatUserMessage(content, timestamp)); } } diff --git a/services/platform/convex/websites/internal_queries.ts b/services/platform/convex/websites/internal_queries.ts index b5f9b87aa5..e28b57fd96 100644 --- a/services/platform/convex/websites/internal_queries.ts +++ b/services/platform/convex/websites/internal_queries.ts @@ -74,3 +74,36 @@ export const getWebsiteByDomain = internalQuery({ return await WebsitesHelpers.getWebsiteByDomain(ctx, args); }, }); + +/** + * Lightweight website summaries for an organization. + * Used by the web tool to list available websites in no-results messages. 
+ */
+export const listWebsiteSummaries = internalQuery({
+  args: { organizationId: v.string() },
+  handler: async (ctx, { organizationId }) => {
+    // Websites whose status is 'deleting' or 'error' are left out of the list.
+    const hiddenStatuses = new Set(['deleting', 'error']);
+    // Iterate the organization's rows through the by_organizationId index.
+    const orgWebsites = ctx.db
+      .query('websites')
+      .withIndex('by_organizationId', (q) =>
+        q.eq('organizationId', organizationId),
+      );
+
+    const summaries: Array<{
+      domain: string;
+      title?: string;
+      description?: string;
+      pageCount?: number;
+    }> = [];
+    for await (const site of orgWebsites) {
+      // A missing status never matches the hidden set, so such rows are kept.
+      if (!site.status || !hiddenStatuses.has(site.status)) {
+        const { domain, title, description, pageCount } = site;
+        summaries.push({ domain, title, description, pageCount });
+      }
+    }
+    return summaries;
+  },
+});