Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions services/platform/convex/_generated/api.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ import type * as agent_tools_types from "../agent_tools/types.js";
import type * as agent_tools_web_helpers_browser_operate from "../agent_tools/web/helpers/browser_operate.js";
import type * as agent_tools_web_helpers_fetch_and_extract from "../agent_tools/web/helpers/fetch_and_extract.js";
import type * as agent_tools_web_helpers_format_web_results from "../agent_tools/web/helpers/format_web_results.js";
import type * as agent_tools_web_helpers_format_website_summaries from "../agent_tools/web/helpers/format_website_summaries.js";
import type * as agent_tools_web_helpers_get_crawler_service_url from "../agent_tools/web/helpers/get_crawler_service_url.js";
import type * as agent_tools_web_helpers_get_operator_service_url from "../agent_tools/web/helpers/get_operator_service_url.js";
import type * as agent_tools_web_helpers_query_web_context from "../agent_tools/web/helpers/query_web_context.js";
Expand Down Expand Up @@ -1044,6 +1045,7 @@ declare const fullApi: ApiFromModules<{
"agent_tools/web/helpers/browser_operate": typeof agent_tools_web_helpers_browser_operate;
"agent_tools/web/helpers/fetch_and_extract": typeof agent_tools_web_helpers_fetch_and_extract;
"agent_tools/web/helpers/format_web_results": typeof agent_tools_web_helpers_format_web_results;
"agent_tools/web/helpers/format_website_summaries": typeof agent_tools_web_helpers_format_website_summaries;
"agent_tools/web/helpers/get_crawler_service_url": typeof agent_tools_web_helpers_get_crawler_service_url;
"agent_tools/web/helpers/get_operator_service_url": typeof agent_tools_web_helpers_get_operator_service_url;
"agent_tools/web/helpers/query_web_context": typeof agent_tools_web_helpers_query_web_context;
Expand Down
4 changes: 2 additions & 2 deletions services/platform/convex/agent_tools/rag/query_rag_context.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ import {

const debugLog = createDebugLog('DEBUG_RAG_QUERY', '[RAGQuery]');
const DEFAULT_TOP_K = 10;
const DEFAULT_SIMILARITY_THRESHOLD = 0.4;
const DEFAULT_SIMILARITY_THRESHOLD = 0.51;
const RAG_REQUEST_TIMEOUT_MS = 10000; // 10 seconds

// Query expansion constants
Expand Down Expand Up @@ -138,7 +138,7 @@ export interface RagContextOptions {
*
* @param userMessage - The user's message to search for relevant context
* @param topK - Number of results to return (default: 5)
* @param similarityThreshold - Minimum similarity score (default: 0.3)
* @param similarityThreshold - Minimum similarity score (default: 0.51)
* @param signal - Optional AbortSignal for timeout control
* @param recentMessages - Optional recent conversation messages for context expansion
* @param options - Optional multi-tenant options (userId, datasets)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ export interface AgentKnowledgeCtx extends ToolCtx {
const debugLog = createDebugLog('DEBUG_AGENT_TOOLS', '[AgentTools]');

const DEFAULT_TOP_K = 10;
const DEFAULT_SIMILARITY_THRESHOLD = 0.4;
const DEFAULT_SIMILARITY_THRESHOLD = 0.51;

export async function resolveFileIds(
ctx: ToolCtx,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/**
* Format available website summaries for display in no-results messages.
*
* Queries the organization's indexed websites and returns a formatted
* bullet list, or undefined if no websites are configured.
*/

import type { ToolCtx } from '@convex-dev/agent';

import { internal } from '../../../_generated/api';

/**
 * Maximum number of websites listed individually. Any websites beyond this
 * are collapsed into a single trailing "... and N more" bullet.
 */
const MAX_LISTED_WEBSITES = 15;

/**
 * Query and format website summaries for the given organization.
 *
 * Each website becomes one bullet made of its domain, an optional label and
 * an optional page count, for example:
 *   - docs.convex.dev — Convex documentation (245 pages)
 *   - example.com (18 pages)
 *   - blog.example.com (1 page)
 *
 * The label is the title, falling back to the description when the title is
 * missing or empty. Only the first MAX_LISTED_WEBSITES entries are listed;
 * the remainder is summarized as "- ... and N more".
 *
 * @param ctx - Tool context; only `ctx.runQuery` is used.
 * @param organizationId - Organization passed to the `listWebsiteSummaries` query.
 * @returns The bullets joined with newlines, or undefined when the query
 *   returns nothing or an empty list.
 */
export async function formatWebsiteSummaries(
  ctx: ToolCtx,
  organizationId: string,
): Promise<string | undefined> {
  const websites = await ctx.runQuery(
    internal.websites.internal_queries.listWebsiteSummaries,
    { organizationId },
  );

  if (!websites || websites.length === 0) return undefined;

  const lines = websites.slice(0, MAX_LISTED_WEBSITES).map((w) => {
    // `||` (not `??`) so an empty-string title falls through to the
    // description instead of rendering a dangling " — " with no label.
    const label = w.title || w.description;
    const labelPart = label ? ` — ${label}` : '';

    // `!= null` keeps a legitimate count of 0 while skipping null/undefined.
    const pagesPart =
      w.pageCount != null
        ? ` (${w.pageCount} ${w.pageCount === 1 ? 'page' : 'pages'})`
        : '';

    return `- ${w.domain}${labelPart}${pagesPart}`;
  });

  const hiddenCount = websites.length - MAX_LISTED_WEBSITES;
  if (hiddenCount > 0) {
    lines.push(`- ... and ${hiddenCount} more`);
  }

  return lines.join('\n');
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ import { getCrawlerServiceUrl } from './get_crawler_service_url';
const debugLog = createDebugLog('DEBUG_WEB_CONTEXT', '[WebContext]');

const DEFAULT_LIMIT = 10;
const DEFAULT_SIMILARITY_THRESHOLD = 0.4;
const DEFAULT_SIMILARITY_THRESHOLD = 0.51;
const WEB_CONTEXT_TIMEOUT_MS = 10_000;

interface SearchResult {
Expand Down
25 changes: 22 additions & 3 deletions services/platform/convex/agent_tools/web/helpers/search_pages.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,13 @@ import type { ToolCtx } from '@convex-dev/agent';
import { internal } from '../../../_generated/api';
import { createDebugLog } from '../../../lib/debug_log';
import { formatWebResults } from './format_web_results';
import { formatWebsiteSummaries } from './format_website_summaries';
import { getCrawlerServiceUrl } from './get_crawler_service_url';

const debugLog = createDebugLog('DEBUG_AGENT_TOOLS', '[AgentTools]');

const DEFAULT_LIMIT = 10;
const DEFAULT_SIMILARITY_THRESHOLD = 0.4;
const DEFAULT_SIMILARITY_THRESHOLD = 0.51;

const DOMAIN_PATTERN = /^[a-zA-Z0-9]([a-zA-Z0-9-]*\.)*[a-zA-Z0-9-]+(:\d+)?$/;

Expand Down Expand Up @@ -128,8 +129,20 @@ export async function searchPages(

if (!results || results.length === 0) {
debugLog('web:search_pages no results', { query: args.query });

const summaryText = ctx.organizationId
? await formatWebsiteSummaries(ctx, ctx.organizationId)
: undefined;

if (summaryText) {
return {
text: `No matching pages found for your query.\n\nThe search only covers websites added to your knowledge base. Currently indexed websites:\n${summaryText}\n\nYou can try rephrasing your query, specifying a domain filter, or use fetch mode with a specific URL to access any public webpage directly.`,
citations: [],
};
}

return {
text: 'No matching website pages found for your query. Try rephrasing, or suggest the user add the relevant website to their knowledge base.',
text: 'No matching pages found. There are no websites currently in the knowledge base. To search website content, websites need to be added via the knowledge base settings first. Alternatively, use fetch mode with a specific URL to access any public webpage directly.',
citations: [],
};
}
Expand Down Expand Up @@ -171,8 +184,14 @@ export async function searchPages(
}));

if (domainFallback) {
const summaryText = ctx.organizationId
? await formatWebsiteSummaries(ctx, ctx.organizationId)
: undefined;
const availableNote = summaryText
? `\n\nAvailable websites in the knowledge base:\n${summaryText}`
: '';
return {
text: `No results found on ${validDomain}. Showing results from all indexed websites:\n\n${output}`,
text: `No results found on ${validDomain}.${availableNote}\n\nShowing results from all indexed websites:\n\n${output}`,
citations,
};
}
Expand Down
108 changes: 63 additions & 45 deletions services/platform/convex/agent_tools/web/web_tool.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
/**
* Convex Tool: Web
*
* Two modes:
* 1. **URL fetch**: when the user provides a specific URL, fetch and extract
* Two modes (discriminated union on `mode`):
* 1. **fetch**: when the user provides a specific URL, fetch and extract
* its content directly (web pages, PDFs, images, DOCX, PPTX, etc.).
* 2. **Semantic search**: when the user asks a question without a URL,
* search crawled website pages via vector embeddings.
* Works with any public URL.
* 2. **search**: search crawled website pages via semantic similarity.
* Only covers websites added to the organization's knowledge base.
*/

import { createTool, type ToolCtx } from '@convex-dev/agent';
Expand All @@ -15,14 +16,8 @@ import type { ToolDefinition } from '../types';
import { fetchAndExtract } from './helpers/fetch_and_extract';
import { searchPages } from './helpers/search_pages';

const URL_REGEX = /https?:\/\/[^\s"'<>]+/i;
const FILE_EXTENSIONS = /\.(pdf|docx|pptx|png|jpe?g|gif|webp|bmp|tiff?|svg)$/i;

function extractUrl(text: string): string | null {
const match = text.match(URL_REGEX);
return match ? match[0] : null;
}

function isFileUrl(url: string): boolean {
try {
const path = new URL(url).pathname;
Expand All @@ -32,54 +27,64 @@ function isFileUrl(url: string): boolean {
}
}

const webToolArgs = z.object({
query: z
.string()
.describe(
'The user request or question. Used as extraction instruction when fetching a URL, or as a semantic search query over crawled pages.',
),
url: z
.string()
.optional()
.describe(
'Explicit URL to fetch and extract content from. When provided, the tool fetches and extracts the URL content directly instead of searching.',
),
domain: z
.string()
.optional()
.describe(
'Optional domain to restrict search to (e.g., "docs.convex.dev"). Only applies in search mode, ignored when fetching a URL.',
),
});
/**
 * Arguments accepted when `mode` is `'fetch'`: a required `url` plus an
 * optional `query` used as an extraction instruction.
 */
const fetchModeArgs = z.object({
  mode: z
    .literal('fetch')
    .describe(
      'Fetch and extract content from a specific URL. Works with any public URL.',
    ),
  url: z
    .string()
    .describe(
      'The URL to fetch (web page, PDF, DOCX, PPTX, or image such as PNG, JPG, GIF, WebP, etc.)',
    ),
  query: z
    .string()
    .optional()
    .describe(
      'Optional extraction instruction to guide what content to focus on.',
    ),
});

/**
 * Arguments accepted when `mode` is `'search'`: a required `query` plus an
 * optional `domain` restriction.
 */
const searchModeArgs = z.object({
  mode: z
    .literal('search')
    .describe(
      'Search through websites added to the knowledge base using semantic similarity.',
    ),
  query: z.string().describe('The search query.'),
  domain: z
    .string()
    .optional()
    .describe(
      'Optional domain to restrict search to (e.g., "docs.convex.dev").',
    ),
});

/**
 * Input schema for the web tool: a union of the fetch and search argument
 * shapes, discriminated by the literal `mode` field.
 */
const webToolArgs = z.discriminatedUnion('mode', [
  fetchModeArgs,
  searchModeArgs,
]);

export const webTool: ToolDefinition = {
name: 'web',
tool: createTool({
description: `Search crawled website pages or fetch content from a specific URL.
description: `Access web content in two modes:

**Mode 1 — Fetch URL**: When the user provides a specific URL (via the \`url\` parameter, or a URL detected in \`query\`), fetch and extract its content directly. Supports web pages, PDFs, DOCX, PPTX, and images (PNG, JPG, GIF, WebP, etc.). The \`query\` is used as the extraction instruction to guide what content to focus on.
**fetch**: Fetch and extract content from any public URL. Supports web pages, PDFs, DOCX, PPTX, and images (PNG, JPG, GIF, WebP, etc.). Use the \`query\` parameter as an extraction instruction to guide what content to focus on.

**Mode 2 — Search**: When no URL is provided, search through previously crawled and indexed website content using semantic similarity. Returns ranked results with page URL, title, and relevant content excerpts.
**search**: Search through websites that have been added to the organization's knowledge base. Only content from indexed knowledge base websites is searchable — this does NOT search the open internet. If the website you need isn't indexed, use fetch mode with a direct URL instead, or suggest the user add the website to their knowledge base.

IMPORTANT: Always cite the source URL for every piece of information you present from the results.

EXAMPLES:
- { url: "https://example.com/report.pdf", query: "Summarize the key findings" }
- { url: "https://example.com/pricing", query: "Extract all pricing tiers" }
- { query: "https://example.com/page" } — URL detected in query, fetches directly
- { query: "shipping policy" } — no URL, searches crawled pages
- { query: "product pricing details" }
- { query: "workflow patterns", domain: "docs.convex.dev" } — searches only docs.convex.dev`,
- { mode: "fetch", url: "https://example.com/report.pdf", query: "Summarize the key findings" }
- { mode: "fetch", url: "https://example.com/pricing" }
- { mode: "search", query: "shipping policy" }
- { mode: "search", query: "workflow patterns", domain: "docs.convex.dev" }`,
inputSchema: webToolArgs,
execute: async (ctx: ToolCtx, args) => {
const targetUrl = args.url || extractUrl(args.query);

if (targetUrl) {
const instruction =
args.url && isFileUrl(targetUrl) ? args.query : undefined;
if (args.mode === 'fetch') {
const instruction = isFileUrl(args.url) ? args.query : undefined;

const result = await fetchAndExtract(ctx, {
url: targetUrl,
url: args.url,
instruction,
});
Comment on lines +83 to 89

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Don’t drop fetch instructions for normal web pages.

query is documented as the extraction instruction for fetch mode, but Line 84 only forwards it when the URL looks like a file. A call like { mode: "fetch", url: "https://example.com/pricing", query: "extract the enterprise limits" } will ignore the instruction and return an unguided extraction. Pass args.query through for all fetches, or narrow the fetch-mode contract to file-only instructions.

🔧 Proposed fix
-      if (args.mode === 'fetch') {
-        const instruction = isFileUrl(args.url) ? args.query : undefined;
+      if (args.mode === 'fetch') {
+        const instruction = args.query;
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@services/platform/convex/agent_tools/web/web_tool.ts` around lines 83 - 89,
The code drops args.query for non-file URLs by only setting instruction when
isFileUrl(args.url) is true; update the fetch branch so fetchAndExtract always
receives the extraction instruction (pass instruction: args.query or undefined)
instead of conditionalizing on isFileUrl, or if you intend file-only behavior,
explicitly enforce/validate that in the fetch-mode contract; change the call
site in the fetch branch that invokes fetchAndExtract to pass args.query
(referencing args.mode, isFileUrl, fetchAndExtract, and args.query).


Expand All @@ -101,11 +106,23 @@ EXAMPLES:
.filter(Boolean)
.join(' | ');

const responseText = `${meta}\n\n${result.content}`;
const citationHeader = `[1] (Relevance: 100.0%) [Source: ${result.title ?? result.url}] [URL: ${result.url}]`;
const responseText = `${citationHeader}\n${meta}\n\n${result.content}`;

const citations = [
{
index: 1,
type: 'web' as const,
source: result.title ?? result.url,
url: result.url,
relevance: 1,
},
];
Comment on lines +112 to +120

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

# First, let's look at the file and understand the context
cat -n services/platform/convex/agent_tools/web/web_tool.ts | head -150

Repository: tale-project/tale

Length of output: 5793


🏁 Script executed:

# Check if there are type definitions for citation objects
rg -A 5 -B 5 "citations" services/platform/convex/agent_tools/web/web_tool.ts

Repository: tale-project/tale

Length of output: 962


🏁 Script executed:

# Search for type definitions related to citations in the codebase
fd -t f "\.ts$" -path "*/agent_tools/*" -o -path "*/types/*" | xargs rg -l "citation" -i 2>/dev/null | head -20

Repository: tale-project/tale

Length of output: 1398


🏁 Script executed:

# Look for ToolDefinition type to understand expected return structure
rg -A 10 "type ToolDefinition" services/platform/convex/agent_tools/

Repository: tale-project/tale

Length of output: 43


🏁 Script executed:

# Check the searchPages function to see what citation type it returns
cat -n services/platform/convex/agent_tools/web/helpers/search_pages.ts | head -80

Repository: tale-project/tale

Length of output: 2565


🏁 Script executed:

# Look for citation type definitions
cat -n services/platform/convex/openai_compat/citations.ts

Repository: tale-project/tale

Length of output: 5149


🏁 Script executed:

# Verify the return type of the tool to ensure explicit annotation will work
rg -B 5 -A 15 "createTool" services/platform/convex/agent_tools/web/web_tool.ts | head -40

Repository: tale-project/tale

Length of output: 1768


🏁 Script executed:

# Check if Citation interface is exported from search_pages.ts
rg "export.*Citation" services/platform/convex/agent_tools/web/helpers/search_pages.ts

Repository: tale-project/tale

Length of output: 43


🏁 Script executed:

# Check the full exports and structure of search_pages.ts
tail -50 services/platform/convex/agent_tools/web/helpers/search_pages.ts

Repository: tale-project/tale

Length of output: 1499


🏁 Script executed:

# Check if Citation is defined as interface in search_pages.ts and whether it's exported
grep -n "interface Citation\|export.*Citation" services/platform/convex/agent_tools/web/helpers/search_pages.ts

Repository: tale-project/tale

Length of output: 84


Use explicit type annotation for the citations array instead of adding as const.

The citations array should be explicitly typed to avoid the type casting. This aligns with the coding guideline to not use type casting (as) in TypeScript code.

♻️ Proposed refactor
-        const citations = [
-          {
-            index: 1,
-            type: 'web' as const,
-            source: result.title ?? result.url,
-            url: result.url,
-            relevance: 1,
-          },
-        ];
+        const citations: Array<{
+          index: number;
+          type: 'web';
+          source: string;
+          url: string;
+          relevance: number;
+        }> = [
+          {
+            index: 1,
+            type: 'web',
+            source: result.title ?? result.url,
+            url: result.url,
+            relevance: 1,
+          },
+        ];
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
const citations = [
{
index: 1,
type: 'web' as const,
source: result.title ?? result.url,
url: result.url,
relevance: 1,
},
];
const citations: Array<{
index: number;
type: 'web';
source: string;
url: string;
relevance: number;
}> = [
{
index: 1,
type: 'web',
source: result.title ?? result.url,
url: result.url,
relevance: 1,
},
];
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@services/platform/convex/agent_tools/web/web_tool.ts` around lines 111 - 119,
Replace the inline type cast "as const" by giving the citations array an
explicit type annotation (e.g. const citations: WebCitation[] or Citation[]
depending on your domain types) and construct the object with matching property
types from result (index, type: 'web', source, url, relevance). Update the
declaration for the variable named citations in web_tool.ts and ensure the
chosen type (WebCitation/Citation) defines type: 'web' as a literal union so no
casting is required; remove the "as const" from the object literal.


return {
success: true,
response: responseText,
citations,
...(result.usage && {
usage: {
inputTokens: result.usage.input_tokens,
Expand All @@ -117,6 +134,7 @@ EXAMPLES:
};
}

// mode === 'search'
const { text: searchResult, citations } = await searchPages(ctx, {
query: args.query,
domain: args.domain,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -550,6 +550,7 @@ export async function generateAgentResponse(
maxHistoryTokens: effectiveMaxHistoryTokens,
ragContext: knowledgeContextResult?.text ?? hookData?.ragContext,
webContext: webContextResult?.text,
promptMessageId,
});
const contextBuildMs = Date.now() - contextBuildStart;

Expand Down Expand Up @@ -981,6 +982,7 @@ export async function generateAgentResponse(
parentThreadId,
maxHistoryTokens: effectiveMaxHistoryTokens,
ragContext: hookData?.ragContext,
promptMessageId,
});

const continueAgent = createAgent(agentOptions);
Expand Down Expand Up @@ -1211,6 +1213,7 @@ export async function generateAgentResponse(
parentThreadId,
maxHistoryTokens: effectiveMaxHistoryTokens,
ragContext: hookData?.ragContext,
promptMessageId,
});

const recoveryAgent = createAgent(agentOptions);
Expand Down
Loading