diff --git a/packages/tale_knowledge/src/tale_knowledge/extraction/__init__.py b/packages/tale_knowledge/src/tale_knowledge/extraction/__init__.py index ca72963dfd..52065d4405 100644 --- a/packages/tale_knowledge/src/tale_knowledge/extraction/__init__.py +++ b/packages/tale_knowledge/src/tale_knowledge/extraction/__init__.py @@ -1,5 +1,5 @@ """File text extraction modules.""" -from .router import extract_text +from .router import ProgressCallback, extract_text -__all__ = ["extract_text"] +__all__ = ["ProgressCallback", "extract_text"] diff --git a/packages/tale_knowledge/src/tale_knowledge/extraction/pdf.py b/packages/tale_knowledge/src/tale_knowledge/extraction/pdf.py index 1bef86ae24..b48da93548 100644 --- a/packages/tale_knowledge/src/tale_knowledge/extraction/pdf.py +++ b/packages/tale_knowledge/src/tale_knowledge/extraction/pdf.py @@ -8,6 +8,7 @@ from __future__ import annotations import asyncio +from collections.abc import Callable from functools import partial from typing import TYPE_CHECKING @@ -19,6 +20,8 @@ if TYPE_CHECKING: from tale_knowledge.vision.client import VisionClient +ProgressCallback = Callable[[int, int], None] # (pages_done, total_pages) + LARGE_IMAGE_RATIO = 0.5 MAX_PAGES = 2000 DEFAULT_PAGE_CONCURRENCY = 8 @@ -139,6 +142,7 @@ async def extract_text_from_pdf_bytes( vision_client: VisionClient | None = None, process_images: bool = True, max_pages: int = MAX_PAGES, + on_progress: ProgressCallback | None = None, ) -> tuple[str, bool]: """Extract text from PDF bytes. @@ -148,6 +152,8 @@ async def extract_text_from_pdf_bytes( vision_client: Optional VisionClient for OCR/image description. process_images: Whether to extract and describe embedded images. max_pages: Maximum number of pages to process. + on_progress: Optional callback ``(pages_done, total_pages)`` invoked + after each page completes. Safe to call from concurrent tasks. Returns: Tuple of (extracted_text, vision_was_used). @@ -182,7 +188,10 @@ async def extract_text_from_pdf_bytes( finally: doc.close() + pages_done = 0 + async def process_page(page_num: int, page_bytes: bytes) -> tuple[int, str, bool]: + nonlocal pages_done async with page_semaphore: content, vis_used = await _extract_page_with_layout( page_bytes, @@ -191,6 +200,9 @@ async def process_page(page_num: int, page_bytes: bytes) -> tuple[int, str, bool vision_client, process_images, ) + pages_done += 1 + if on_progress is not None: + on_progress(pages_done, pages_to_process) return page_num, f"--- Page {page_num + 1} ---\n{content}", vis_used tasks = [process_page(pn, pb) for pn, pb in page_data] diff --git a/packages/tale_knowledge/src/tale_knowledge/extraction/router.py b/packages/tale_knowledge/src/tale_knowledge/extraction/router.py index 5aa11d9e62..e16e2cdcd7 100644 --- a/packages/tale_knowledge/src/tale_knowledge/extraction/router.py +++ b/packages/tale_knowledge/src/tale_knowledge/extraction/router.py @@ -2,6 +2,7 @@ from __future__ import annotations +from collections.abc import Callable from pathlib import Path from typing import TYPE_CHECKING @@ -33,12 +34,16 @@ def is_supported(filename: str) -> bool: return Path(filename).suffix.lower() in ALL_SUPPORTED_EXTENSIONS +ProgressCallback = Callable[[int, int], None] + + async def extract_text( file_bytes: bytes, filename: str, *, vision_client: VisionClient | None = None, process_images: bool = True, + on_progress: ProgressCallback | None = None, ) -> tuple[str, bool]: """Extract text from file bytes, routing to the correct extractor. @@ -47,6 +52,8 @@ async def extract_text( filename: Original filename (used to determine file type). vision_client: Optional VisionClient for OCR/image description. process_images: Whether to extract and describe embedded images. + on_progress: Optional callback ``(done, total)`` for page-level progress + (currently only used by PDF extraction). Returns: Tuple of (extracted_text, vision_was_used). @@ -65,6 +72,7 @@ async def extract_text( filename, vision_client=vision_client, process_images=process_images, + on_progress=on_progress, ) if suffix in DOCX_EXTENSIONS: diff --git a/services/crawler/app/models.py b/services/crawler/app/models.py index 1c81162a2c..9d5a51c31d 100644 --- a/services/crawler/app/models.py +++ b/services/crawler/app/models.py @@ -207,6 +207,18 @@ class HtmlToDocxRequest(BaseModel): # ==================== PPTX Models ==================== +class MarkdownToPptxRequest(BaseModel): + """Request to convert Markdown to PPTX.""" + + content: str = Field(..., description="Markdown content to convert") + + +class HtmlToPptxRequest(BaseModel): + """Request to convert HTML to PPTX.""" + + html: str = Field(..., description="HTML content to convert") + + class TableData(BaseModel): """Table data for PPTX generation.""" diff --git a/services/crawler/app/routers/pptx.py b/services/crawler/app/routers/pptx.py index 9652f85500..9c512a73d0 100644 --- a/services/crawler/app/routers/pptx.py +++ b/services/crawler/app/routers/pptx.py @@ -6,11 +6,14 @@ import json from fastapi import APIRouter, File, Form, HTTPException, UploadFile, status +from fastapi.responses import Response from loguru import logger from app.models import ( FileMetadataResponse, GeneratePptxResponse, + HtmlToPptxRequest, + MarkdownToPptxRequest, ParseFileResponse, ) from app.services.file_parser_service import get_file_parser_service @@ -133,6 +136,88 @@ async def generate_pptx_from_json( ) +_PPTX_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.presentationml.presentation" + + +@router.post("/from-markdown") +async def convert_markdown_to_pptx(request: MarkdownToPptxRequest): + """ + Convert Markdown content to PPTX. + + Parses markdown into HTML, then extracts slide structure (headings become + slide titles, lists become bullet points, etc.) and generates a PowerPoint. + + Args: + request: Markdown content + + Returns: + PPTX file as binary response + """ + try: + from app.services.base_converter import BaseConverterService + from app.services.html_to_pptx_converter import html_to_slides + + converter = BaseConverterService() + html = await converter.markdown_to_html(request.content) + slides_content = html_to_slides(html) + + template_service = get_template_service() + pptx_bytes = await template_service.generate_pptx_from_content( + slides_content=slides_content, + ) + + return Response( + content=pptx_bytes, + media_type=_PPTX_CONTENT_TYPE, + headers={"Content-Disposition": "attachment; filename=presentation.pptx"}, + ) + + except Exception: + logger.exception("Error converting markdown to PPTX") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Failed to convert markdown to PPTX", + ) from None + + +@router.post("/from-html") +async def convert_html_to_pptx(request: HtmlToPptxRequest): + """ + Convert HTML content to PPTX. + + Parses HTML to extract slide structure (h1/h2 headings become slide titles, + lists become bullet points, tables preserved) and generates a PowerPoint. + + Args: + request: HTML content + + Returns: + PPTX file as binary response + """ + try: + from app.services.html_to_pptx_converter import html_to_slides + + slides_content = html_to_slides(request.html) + + template_service = get_template_service() + pptx_bytes = await template_service.generate_pptx_from_content( + slides_content=slides_content, + ) + + return Response( + content=pptx_bytes, + media_type=_PPTX_CONTENT_TYPE, + headers={"Content-Disposition": "attachment; filename=presentation.pptx"}, + ) + + except Exception: + logger.exception("Error converting HTML to PPTX") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Failed to convert HTML to PPTX", + ) from None + + @router.post("/parse", response_model=ParseFileResponse) async def parse_pptx_file( file: UploadFile = _FILE_UPLOAD, diff --git a/services/crawler/app/services/html_to_pptx_converter.py b/services/crawler/app/services/html_to_pptx_converter.py new file mode 100644 index 0000000000..3979bbc89f --- /dev/null +++ b/services/crawler/app/services/html_to_pptx_converter.py @@ -0,0 +1,235 @@ +""" +HTML to PPTX slide converter. + +Parses HTML content and converts it into structured slide dicts +that can be passed to PptxService.generate_pptx_from_content(). + +Uses BeautifulSoup for HTML parsing. Each top-level heading (h1/h2) +starts a new slide; content between headings becomes bullet points +or text content on that slide. +""" + +import logging +import re +from typing import Any + +from bs4 import BeautifulSoup, NavigableString, Tag + +logger = logging.getLogger(__name__) + +# Heading tags that start a new slide +_SLIDE_BREAK_TAGS = {"h1", "h2"} + +# Tags to skip entirely +_SKIP_TAGS = {"script", "style", "meta", "link", "head"} + + +def _get_text(element: Tag) -> str: + """Extract clean text from an element, collapsing whitespace.""" + text = element.get_text(separator=" ", strip=True) + return re.sub(r"\s+", " ", text).strip() + + +def _parse_list_items(list_tag: Tag) -> list[str]: + """Extract text from
  • children of a list tag.""" + items: list[str] = [] + for li in list_tag.find_all("li", recursive=False): + text = _get_text(li) + if text: + items.append(text) + return items + + +def _parse_table(table_tag: Tag) -> dict[str, Any] | None: + """Parse an HTML table into headers and rows.""" + headers: list[str] = [] + rows: list[list[str]] = [] + + thead = table_tag.find("thead") + if thead: + for th in thead.find_all("th"): + headers.append(_get_text(th)) + + tbody = table_tag.find("tbody") or table_tag + for tr in tbody.find_all("tr", recursive=False): + cells = tr.find_all(["td", "th"]) + if not cells: + continue + + if not headers and all(cell.name == "th" for cell in cells): + headers = [_get_text(cell) for cell in cells] + continue + + row = [_get_text(cell) for cell in cells] + rows.append(row) + + if not headers and not rows: + return None + + if not headers and rows: + col_count = max(len(r) for r in rows) + headers = [f"Column {i + 1}" for i in range(col_count)] + + for i, row in enumerate(rows): + if len(row) < len(headers): + rows[i] = row + [""] * (len(headers) - len(row)) + elif len(row) > len(headers): + rows[i] = row[: len(headers)] + + return {"headers": headers, "rows": rows} + + +def _flush_slide( + slides: list[dict[str, Any]], + title: str | None, + subtitle: str | None, + text_content: list[str], + bullet_points: list[str], + tables: list[dict[str, Any]], +) -> None: + """Flush accumulated content into a slide dict.""" + if not title and not text_content and not bullet_points and not tables: + return + + slide: dict[str, Any] = {} + if title: + slide["title"] = title + if subtitle: + slide["subtitle"] = subtitle + if text_content: + slide["textContent"] = text_content + if bullet_points: + slide["bulletPoints"] = bullet_points + if tables: + slide["tables"] = tables + + slides.append(slide) + + +def _collect_content( + element: Tag, + text_content: list[str], + bullet_points: list[str], + tables: list[dict[str, Any]], +) -> None: + """Collect content from an element into the appropriate lists.""" + tag_name = element.name.lower() + + if tag_name in _SKIP_TAGS: + return + + # Lists become bullet points + if tag_name in ("ul", "ol"): + items = _parse_list_items(element) + bullet_points.extend(items) + return + + # Tables + if tag_name == "table": + table_data = _parse_table(element) + if table_data: + tables.append(table_data) + return + + # Container tags — recurse into children + if tag_name in ("div", "section", "article", "main", "header", "footer", "nav", "aside"): + for child in element.children: + if isinstance(child, NavigableString): + text = child.strip() + if text: + text_content.append(text) + elif isinstance(child, Tag): + _collect_content(child, text_content, bullet_points, tables) + return + + # Sub-headings (h3-h6) become bold text content within a slide + if tag_name in ("h3", "h4", "h5", "h6"): + text = _get_text(element) + if text: + text_content.append(text) + return + + # Code blocks + if tag_name == "pre": + code_tag = element.find("code") + text = code_tag.get_text() if code_tag else element.get_text() + if text.strip(): + text_content.append(text.strip()) + return + + # Paragraph and everything else with text + text = _get_text(element) + if text: + text_content.append(text) + + +def html_to_slides(html: str) -> list[dict[str, Any]]: + """ + Convert HTML content to a list of slide content dicts for PptxService. + + Each h1/h2 heading starts a new slide. Content between headings + becomes textContent or bulletPoints on that slide. + + Returns: + List of slide dicts with title, subtitle, textContent, bulletPoints, tables. + """ + soup = BeautifulSoup(html, "html.parser") + body = soup.find("body") or soup + + slides: list[dict[str, Any]] = [] + + # Current slide accumulation + current_title: str | None = None + current_subtitle: str | None = None + current_text: list[str] = [] + current_bullets: list[str] = [] + current_tables: list[dict[str, Any]] = [] + + for child in body.children: + if isinstance(child, NavigableString): + text = child.strip() + if text: + current_text.append(text) + continue + + if not isinstance(child, Tag): + continue + + tag_name = child.name.lower() + + if tag_name in _SKIP_TAGS: + continue + + # h1/h2 starts a new slide + if tag_name in _SLIDE_BREAK_TAGS: + # Flush previous slide + _flush_slide(slides, current_title, current_subtitle, current_text, current_bullets, current_tables) + current_title = _get_text(child) + current_subtitle = None + current_text = [] + current_bullets = [] + current_tables = [] + continue + + # h3 right after a title with no content yet becomes subtitle + if tag_name == "h3" and current_title and not current_text and not current_bullets and not current_subtitle: + current_subtitle = _get_text(child) + continue + + _collect_content(child, current_text, current_bullets, current_tables) + + # Flush final slide + _flush_slide(slides, current_title, current_subtitle, current_text, current_bullets, current_tables) + + # If no slides were created (no headings found), create a single slide from all content + if not slides and (current_text or current_bullets or current_tables): + slide: dict[str, Any] = {"title": "Untitled Slide"} + if current_text: + slide["textContent"] = current_text + if current_bullets: + slide["bulletPoints"] = current_bullets + if current_tables: + slide["tables"] = current_tables + slides.append(slide) + + return slides diff --git a/services/db/init-scripts/03-create-knowledge-database.sql b/services/db/init-scripts/03-create-knowledge-database.sql index 06d9749c34..d9d73d1398 100644 --- a/services/db/init-scripts/03-create-knowledge-database.sql +++ b/services/db/init-scripts/03-create-knowledge-database.sql @@ -171,6 +171,8 @@ CREATE TABLE IF NOT EXISTS private_knowledge.documents ( status TEXT NOT NULL DEFAULT 'processing' CHECK (status IN ('processing', 'completed', 'failed')), error TEXT, chunks_count INTEGER NOT NULL DEFAULT 0, + progress_phase TEXT, + progress_detail TEXT, source_created_at TIMESTAMPTZ, source_modified_at TIMESTAMPTZ, created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), diff --git a/services/db/migrations/db/migrations/20260408000001_create_semantic_cache_table.sql b/services/db/migrations/db/migrations/20260408000001_create_semantic_cache_table.sql index 65d7a53f47..0d6459ef38 100644 --- a/services/db/migrations/db/migrations/20260408000001_create_semantic_cache_table.sql +++ b/services/db/migrations/db/migrations/20260408000001_create_semantic_cache_table.sql @@ -5,7 +5,7 @@ CREATE TABLE IF NOT EXISTS private_knowledge.semantic_cache ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), query_text TEXT NOT NULL, - query_embedding vector NOT NULL, + query_embedding vector, response_text TEXT NOT NULL, metadata JSONB DEFAULT '{}'::jsonb, created_at TIMESTAMPTZ NOT NULL DEFAULT now(), @@ -14,10 +14,8 @@ CREATE TABLE IF NOT EXISTS private_knowledge.semantic_cache ( file_ids TEXT[] DEFAULT '{}' ); --- HNSW index for fast cosine similarity lookups -CREATE INDEX IF NOT EXISTS idx_semantic_cache_embedding - ON private_knowledge.semantic_cache - USING hnsw (query_embedding vector_cosine_ops); +-- NOTE: HNSW index on query_embedding is created at runtime by the RAG +-- service once the embedding dimensions are known (same pattern as chunks). -- B-tree index for TTL cleanup CREATE INDEX IF NOT EXISTS idx_semantic_cache_expires_at diff --git a/services/db/migrations/db/migrations/20260411000001_add_document_progress_columns.sql b/services/db/migrations/db/migrations/20260411000001_add_document_progress_columns.sql new file mode 100644 index 0000000000..619be703bb --- /dev/null +++ b/services/db/migrations/db/migrations/20260411000001_add_document_progress_columns.sql @@ -0,0 +1,12 @@ +-- migrate:up +-- Add progress tracking columns for document indexing status. + +ALTER TABLE private_knowledge.documents + ADD COLUMN IF NOT EXISTS progress_phase TEXT, + ADD COLUMN IF NOT EXISTS progress_detail TEXT; + +-- migrate:down + +ALTER TABLE private_knowledge.documents + DROP COLUMN IF EXISTS progress_phase, + DROP COLUMN IF EXISTS progress_detail; diff --git a/services/platform/app/features/chat/components/chat-input.tsx b/services/platform/app/features/chat/components/chat-input.tsx index cf08e087d9..ea4637e41b 100644 --- a/services/platform/app/features/chat/components/chat-input.tsx +++ b/services/platform/app/features/chat/components/chat-input.tsx @@ -43,6 +43,11 @@ interface ChatInputProps extends Omit< uploadFiles: (files: File[]) => Promise; removeAttachment: (fileId: Id<'_storage'>) => void; clearAttachments: () => FileAttachment[]; + isIndexing?: boolean; + indexingStatuses?: Map< + Id<'_storage'>, + { status?: string; error?: string; progress?: string } + >; } export function ChatInput({ @@ -60,6 +65,8 @@ export function ChatInput({ uploadFiles, removeAttachment, clearAttachments, + isIndexing = false, + indexingStatuses, ...restProps }: ChatInputProps) { const { t: tChat } = useT('chat'); @@ -84,7 +91,8 @@ export function ChatInput({ (!value.trim() && attachments.length === 0) || isLoading || disabled || - isUploading + isUploading || + isIndexing ) return; @@ -224,13 +232,58 @@ export function ChatInput({ {middleEllipsis(attachment.fileName, 28)} - - {formatFileSize(attachment.fileSize)} - + {(() => { + const info = indexingStatuses?.get(attachment.fileId); + const ragStatus = info?.status; + if (ragStatus === 'queued' || ragStatus === 'running') { + const raw = info?.progress; + // Convert "extracting 42/108" → "39%" + let progressLabel = tChat('indexing'); + if (raw) { + const match = /(\d+)\/(\d+)/.exec(raw); + if (match) { + const pct = Math.round( + (Number(match[1]) / Number(match[2])) * 100, + ); + progressLabel = `${pct}%`; + } else { + progressLabel = raw; + } + } + return ( + + + + {progressLabel} + + + ); + } + if (ragStatus === 'failed') { + return ( + + {tChat('indexingFailed')} + + ); + } + return ( + + {formatFileSize(attachment.fileSize)} + + ); + })()} + + ); +} + +interface SourceDetailDialogProps { + source: SourceGroup | null; + onClose: () => void; +} + +/** + * Normalize chunk content for display: + * - Convert literal `\n` sequences to real newlines + * - Collapse 3+ consecutive blank lines into 2 + */ +function normalizeContent(raw: string): string { + return raw + .replace(/\\n/g, '\n') + .replace(/\n{3,}/g, '\n\n') + .trim(); +} + +function SourceDetailDialog({ source, onClose }: SourceDetailDialogProps) { + const { t } = useT('chat'); + if (!source) return null; + + const title = + source.filename ?? + (source.url + ? getDomain(source.url) + : t('citations.source', { number: String(source.number) })); + + const chunkCount = source.chunks.length; return ( - + ); } @@ -70,74 +184,37 @@ interface SourceCardsProps { function SourceCardsComponent({ citations }: SourceCardsProps) { const { t } = useT('chat'); - const organizationId = useOrganizationId(); const [isExpanded, setIsExpanded] = useState(false); - const [previewDocId, setPreviewDocId] = useState(); - const [previewFileName, setPreviewFileName] = useState(); - - const citationList = getUniqueCitations(citations); - - // Collect all RAG fileIds to batch-query file metadata - const ragFileIds = useMemo(() => { - const ids: Id<'_storage'>[] = []; - for (const c of citationList) { - if (c.type === 'rag' && c.fileId) { - // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- fileId from RAG metadata is a Convex storage ID string - ids.push(c.fileId as Id<'_storage'>); - } - } - return ids; - }, [citationList]); - - const { data: fileMetadataList } = useConvexQuery( - api.file_metadata.queries.getByStorageIds, - ragFileIds.length > 0 ? { storageIds: ragFileIds } : 'skip', + const [selectedSource, setSelectedSource] = useState( + null, ); - // Map storageId → documentId for quick lookup - const storageToDocId = useMemo(() => { - const map = new Map(); - if (fileMetadataList) { - for (const meta of fileMetadataList) { - if (meta.documentId) { - map.set(meta.storageId, meta.documentId); - } - } + const sourceList = getUniqueSources(citations); + + const handleCardClick = useCallback((source: SourceGroup) => { + if (source.type === 'web' && source.url) { + window.open(source.url, '_blank', 'noopener,noreferrer'); + } else { + setSelectedSource(source); } - return map; - }, [fileMetadataList]); - - const handleCardClick = useCallback( - (citation: CitationInfo) => { - if (citation.type === 'web' && citation.url) { - window.open(citation.url, '_blank', 'noopener,noreferrer'); - } else if (citation.type === 'rag' && citation.fileId) { - const docId = storageToDocId.get(citation.fileId); - if (docId) { - setPreviewDocId(docId); - setPreviewFileName(citation.filename); - } - } - }, - [storageToDocId], - ); + }, []); - if (citationList.length === 0) return null; + if (sourceList.length === 0) return null; - const needsCollapse = citationList.length > COLLAPSED_LIMIT; - const visibleCitations = + const needsCollapse = sourceList.length > COLLAPSED_LIMIT; + const visibleSources = needsCollapse && !isExpanded - ? citationList.slice(0, COLLAPSED_LIMIT) - : citationList; + ? sourceList.slice(0, COLLAPSED_LIMIT) + : sourceList; return (
    - {visibleCitations.map((citation) => ( + {visibleSources.map((source) => ( handleCardClick(citation)} + key={source.number} + source={source} + onClick={() => handleCardClick(source)} /> ))}
    @@ -156,24 +233,17 @@ function SourceCardsComponent({ citations }: SourceCardsProps) { <> {t('citations.showAllSources', { - count: String(citationList.length), + count: String(sourceList.length), })} )} )} - {organizationId && ( - { - if (!open) setPreviewDocId(undefined); - }} - organizationId={organizationId} - documentId={previewDocId} - fileName={previewFileName} - /> - )} + setSelectedSource(null)} + />
    ); } diff --git a/services/platform/app/features/chat/hooks/queries.ts b/services/platform/app/features/chat/hooks/queries.ts index fbd1501aec..bb4ad3c318 100644 --- a/services/platform/app/features/chat/hooks/queries.ts +++ b/services/platform/app/features/chat/hooks/queries.ts @@ -553,10 +553,13 @@ export function useMessageError(threadId: string | null) { return data ?? null; } -export function useMessageMetadata(messageId: string | null) { +export function useMessageMetadata( + messageId: string | null, + threadId?: string | null, +) { const { data: metadata, isLoading } = useConvexQuery( api.message_metadata.queries.getMessageMetadata, - messageId ? { messageId } : 'skip', + messageId ? { messageId, ...(threadId ? { threadId } : {}) } : 'skip', ); return { diff --git a/services/platform/app/features/chat/hooks/use-citations.ts b/services/platform/app/features/chat/hooks/use-citations.ts index 09026ef51f..7f7d197caa 100644 --- a/services/platform/app/features/chat/hooks/use-citations.ts +++ b/services/platform/app/features/chat/hooks/use-citations.ts @@ -8,10 +8,13 @@ export interface CitationInfo { relevance?: number; url?: string; type: 'rag' | 'web'; + /** Chunk text content extracted from tool output. */ + content?: string; } interface ToolUsageInput { toolName: string; + input?: string; output?: string; } @@ -22,14 +25,35 @@ const WEB_CITATION_PATTERN = /\[(\d+)\]\s*\(Relevance:\s*([\d.]+)%\)(?:\s*\[Source:\s*([^\]]+)\])?(?:\s*\[URL:\s*([^\]]+)\])?/g; /** - * Parse citation metadata from RAG search result text. + * Parse citation metadata and chunk content from RAG search result text. + * + * The RAG output format is: + * ``` + * [1] (Relevance: 87.3%) [Source: report.pdf] [Page: 5] [FileID: abc] + * + * + * --- + * + * [2] (Relevance: 72.1%) [Source: memo.docx] [FileID: def] + * + * ``` */ export function parseRagCitations(text: string): Map { const citations = new Map(); - let match; - RAG_CITATION_PATTERN.lastIndex = 0; - while ((match = RAG_CITATION_PATTERN.exec(text)) !== null) { + + // Split by chunk separator to get individual chunks + const chunks = text.split(/\n\n---\n\n/); + + for (const chunk of chunks) { + RAG_CITATION_PATTERN.lastIndex = 0; + const match = RAG_CITATION_PATTERN.exec(chunk); + if (!match) continue; + const num = parseInt(match[1], 10); + // Content is everything after the metadata line + const metadataEnd = (match.index ?? 0) + match[0].length; + const content = chunk.slice(metadataEnd).trim() || undefined; + citations.set(num, { number: num, relevance: parseFloat(match[2]), @@ -37,6 +61,7 @@ export function parseRagCitations(text: string): Map { page: match[4] ? parseInt(match[4], 10) : undefined, fileId: match[5] || undefined, type: 'rag', + content, }); } return citations; @@ -62,45 +87,152 @@ export function parseWebCitations(text: string): Map { return citations; } +/** + * Try to unwrap safeStringify'd output — handles both JSON-wrapped + * strings and nested objects with a `response` or `output` field. + */ +function unwrapOutput(raw: string): string { + let output = raw; + + // Unwrap JSON-wrapped string: "\"...\"" + if (output.startsWith('"') && output.endsWith('"')) { + try { + const parsed: unknown = JSON.parse(output); + if (typeof parsed === 'string') { + output = parsed; + } + } catch { + // use as-is + } + } + + return output; +} + +function isPlainObject(val: unknown): val is Record { + return val !== null && typeof val === 'object' && !Array.isArray(val); +} + +interface JsonFieldsResult { + response?: string; + filename?: string; + fileId?: string; +} + +/** + * Extract metadata fields from a JSON tool output string. + * Handles both direct objects and nested `{ value: { ... } }` wrappers. + */ +function extractJsonFields(output: string): JsonFieldsResult | undefined { + try { + const parsed: unknown = JSON.parse(output); + if (!isPlainObject(parsed)) return undefined; + + // Check nested value wrapper (tool-result shape) + const obj = isPlainObject(parsed.value) ? parsed.value : parsed; + + const response = + typeof obj.response === 'string' + ? obj.response + : typeof obj.output === 'string' + ? obj.output + : undefined; + // filename field (retrieve), or title as fallback + const filename = + typeof obj.filename === 'string' + ? obj.filename + : typeof obj.title === 'string' + ? obj.title + : undefined; + const fileId = typeof obj.fileId === 'string' ? obj.fileId : undefined; + + return response || filename || fileId + ? { response, filename, fileId } + : undefined; + } catch { + // not JSON + } + return undefined; +} + +/** + * Detect whether a rag_search tool call is a 'retrieve' operation + * by examining its input. Returns parsed input data if it is. + */ +function parseRetrieveInput( + inputStr: string | undefined, +): { fileId: string } | undefined { + if (!inputStr) return undefined; + try { + const parsed: unknown = JSON.parse(inputStr); + if ( + isPlainObject(parsed) && + parsed.operation === 'retrieve' && + typeof parsed.fileId === 'string' + ) { + return { fileId: parsed.fileId }; + } + } catch { + // not JSON + } + return undefined; +} + /** * Parse citations from tool usage records. * - * Processes RAG and web tool outputs in order, offsetting web citation - * numbers by the max RAG citation number to avoid collisions. + * Processes RAG search and retrieve operations plus web tool outputs, + * offsetting citation numbers between successive calls to avoid collisions. */ export function parseCitationsFromToolsUsage( toolsUsage: ToolUsageInput[], ): Map { const allCitations = new Map(); - let maxNumber = 0; + let nextNumber = 1; for (const usage of toolsUsage) { if (!usage.output) continue; - // toolsUsage.output is safeStringify'd — unwrap if it's a JSON string - let output = usage.output; - if (output.startsWith('"') && output.endsWith('"')) { - try { - const parsed: unknown = JSON.parse(output); - if (typeof parsed === 'string') { - output = parsed; - } - } catch { - // use as-is - } - } + const output = unwrapOutput(usage.output); if (usage.toolName === 'rag_search') { - const ragCitations = parseRagCitations(output); - for (const [num, citation] of ragCitations) { - allCitations.set(num, citation); - if (num > maxNumber) maxNumber = num; + const fields = extractJsonFields(output); + // First try to parse as formatted search results ([N] Relevance: ...) + const responseText = fields?.response ?? output; + const ragCitations = parseRagCitations(responseText); + + if (ragCitations.size > 0) { + // Offset all numbers so successive rag_search calls don't collide + const offset = nextNumber - 1; + for (const [, citation] of ragCitations) { + const newNum = citation.number + offset; + allCitations.set(newNum, { ...citation, number: newNum }); + if (newNum >= nextNumber) nextNumber = newNum + 1; + } + } else { + // No formatted citations — could be a retrieve operation + const retrieveInput = parseRetrieveInput(usage.input); + if (retrieveInput) { + const content = fields?.response ?? output; + if (content && content !== 'Document has no text content.') { + allCitations.set(nextNumber, { + number: nextNumber, + fileId: fields?.fileId ?? retrieveInput.fileId, + filename: fields?.filename ?? undefined, + type: 'rag', + content, + }); + nextNumber++; + } + } } } else if (usage.toolName === 'web') { const webCitations = parseWebCitations(output); - for (const [originalNum, citation] of webCitations) { - const offsetNum = originalNum + maxNumber; - allCitations.set(offsetNum, { ...citation, number: offsetNum }); + const offset = nextNumber - 1; + for (const [, citation] of webCitations) { + const newNum = citation.number + offset; + allCitations.set(newNum, { ...citation, number: newNum }); + if (newNum >= nextNumber) nextNumber = newNum + 1; } } } @@ -127,9 +259,12 @@ function deduplicateCitations( const deduped = new Map(); for (const [num, citation] of citations) { + // Include a content fingerprint so different chunks from the same + // file/page are kept as separate entries. + const contentKey = citation.content?.slice(0, 80) ?? ''; const sourceKey = citation.type === 'rag' - ? `rag:${citation.fileId ?? ''}:${citation.page ?? ''}` + ? `rag:${citation.fileId ?? ''}:${citation.page ?? ''}:${contentKey}` : `web:${citation.url ?? ''}`; const existingNum = seen.get(sourceKey); @@ -146,28 +281,109 @@ function deduplicateCitations( return deduped; } +export interface ChunkDetail { + number: number; + page?: number; + relevance?: number; + content?: string; +} + +export interface SourceGroup { + /** The first citation number (used for ordering and as key). */ + number: number; + filename?: string; + fileId?: string; + url?: string; + type: 'rag' | 'web'; + /** All inline citation numbers that reference this source. */ + chunkNumbers: number[]; + /** All distinct page numbers referenced (RAG only). */ + pages: number[]; + /** Highest relevance score among the grouped citations. */ + relevance?: number; + /** Individual chunk details with content for display. */ + chunks: ChunkDetail[]; +} + /** - * Get unique citations for display in source cards (no duplicates). + * Group citations by source (fileId for RAG, url for web) for display + * in source cards. Same file with different pages/chunks is merged + * into a single entry. */ -export function getUniqueCitations( +export function getUniqueSources( citations: Map, -): CitationInfo[] { - const seen = new Set(); - const unique: CitationInfo[] = []; +): SourceGroup[] { + const groups = new Map(); + // Track which original citation numbers we've already added as chunks + // to avoid duplicating content when deduplicateCitations maps multiple + // keys to the same citation object. + const addedChunkIds = new Map>(); - for (const citation of citations.values()) { + for (const [mapKey, citation] of citations) { const sourceKey = citation.type === 'rag' - ? `rag:${citation.fileId ?? ''}:${citation.page ?? ''}` + ? `rag:${citation.fileId ?? ''}` : `web:${citation.url ?? ''}`; - if (!seen.has(sourceKey)) { - seen.add(sourceKey); - unique.push(citation); + // Use the Map key as the inline reference number (the [N] in the text), + // since deduplicateCitations may remap multiple keys to the same citation. + const inlineNumber = mapKey; + + const existing = groups.get(sourceKey); + if (existing) { + existing.chunkNumbers.push(inlineNumber); + + // Only add chunk detail if this is a genuinely different chunk + // (not a remapped duplicate pointing to the same original citation) + let chunkSet = addedChunkIds.get(sourceKey); + if (!chunkSet) { + chunkSet = new Set(); + addedChunkIds.set(sourceKey, chunkSet); + } + if (!chunkSet.has(citation.number)) { + chunkSet.add(citation.number); + existing.chunks.push({ + number: citation.number, + page: citation.page, + relevance: citation.relevance, + content: citation.content, + }); + if (citation.page != null && !existing.pages.includes(citation.page)) { + existing.pages.push(citation.page); + } + if ( + citation.relevance != null && + (existing.relevance == null || + citation.relevance > existing.relevance) + ) { + existing.relevance = citation.relevance; + } + } + } else { + const chunkSet = new Set([citation.number]); + addedChunkIds.set(sourceKey, chunkSet); + groups.set(sourceKey, { + number: inlineNumber, + filename: citation.filename, + fileId: citation.fileId, + url: citation.url, + type: citation.type, + chunkNumbers: [inlineNumber], + pages: citation.page != null ? [citation.page] : [], + relevance: citation.relevance, + chunks: [ + { + number: citation.number, + page: citation.page, + relevance: citation.relevance, + content: citation.content, + }, + ], + }); } } - return unique.sort((a, b) => a.number - b.number); + return Array.from(groups.values()).sort((a, b) => a.number - b.number); } /** diff --git a/services/platform/app/features/chat/hooks/use-file-indexing-status.ts b/services/platform/app/features/chat/hooks/use-file-indexing-status.ts new file mode 100644 index 0000000000..472dc63d73 --- /dev/null +++ b/services/platform/app/features/chat/hooks/use-file-indexing-status.ts @@ -0,0 +1,98 @@ +'use client'; + +import { useAction } from 'convex/react'; +import { useQuery } from 'convex/react'; +import { useEffect, useMemo, useRef } from 'react'; + +import { api } from '@/convex/_generated/api'; +import type { Id } from '@/convex/_generated/dataModel'; + +import type { FileAttachment } from './use-convex-file-upload'; + +type RagStatus = 'queued' | 'running' | 'completed' | 'failed'; + +interface FileIndexingInfo { + status?: RagStatus; + error?: string; + progress?: string; +} + +const POLL_INTERVAL_MS = 3_000; + +/** + * Query RAG indexing status for non-image file attachments. + * + * - Reactive Convex query for instant UI updates when status changes. + * - Client-side polling: calls checkFileRagStatuses action every 3s + * while any file is in queued/running state. Polling stops automatically + * when the user leaves the page or all files finish indexing. + */ +export function useFileIndexingStatus(attachments: FileAttachment[]) { + const fileIds = useMemo( + () => + attachments + .filter((a) => !a.fileType.startsWith('image/')) + .map((a) => a.fileId), + [attachments], + ); + + const metadata = useQuery( + api.file_metadata.queries.getByStorageIds, + fileIds.length > 0 ? { storageIds: fileIds } : 'skip', + ); + + const statusMap = useMemo(() => { + const map = new Map, FileIndexingInfo>(); + if (!metadata) return map; + for (const m of metadata) { + map.set(m.storageId, { + status: m.ragStatus, + error: m.ragError, + progress: m.ragProgress, + }); + } + return map; + }, [metadata]); + + const isIndexing = useMemo(() => { + if (!metadata || fileIds.length === 0) return false; + return metadata.some( + (m) => m.ragStatus === 'queued' || m.ragStatus === 'running', + ); + }, [metadata, fileIds.length]); + + // IDs of files that still need polling + const pendingIds = useMemo(() => { + if (!metadata) return []; + return metadata + .filter((m) => m.ragStatus === 'queued' || m.ragStatus === 'running') + .map((m) => m.storageId); + }, [metadata]); + + // Client-side polling: call the action periodically while files are pending + const checkStatuses = useAction( + api.file_metadata.actions.checkFileRagStatuses, + ); + const pollingRef = useRef(false); + + useEffect(() => { + if (pendingIds.length === 0) return undefined; + + pollingRef.current = true; + + // Trigger immediately, then poll on interval + checkStatuses({ storageIds: pendingIds }).catch(() => {}); + + const timer = setInterval(() => { + if (!pollingRef.current) return; + checkStatuses({ storageIds: pendingIds }).catch(() => {}); + }, POLL_INTERVAL_MS); + + return () => { + pollingRef.current = false; + clearInterval(timer); + }; + }, [pendingIds, checkStatuses]); + + return { isIndexing, statusMap }; +} diff --git a/services/platform/app/features/chat/hooks/use-message-processing.ts b/services/platform/app/features/chat/hooks/use-message-processing.ts index 459bbd7f3b..7665cfde9c 100644 --- a/services/platform/app/features/chat/hooks/use-message-processing.ts +++ b/services/platform/app/features/chat/hooks/use-message-processing.ts @@ -271,7 +271,14 @@ export function useMessageProcessing( m.role === 'assistant' && m.status === 'failed' && !m.text?.trim(), isFailed: m.role === 'assistant' && m.status === 'failed' && !!m.text?.trim(), - error: messageErrors?.[m.id], + error: + messageErrors?.[m.id] ?? + // UIMessage.id is the first message in a group, but the error + // lives on the last (failed) message which has a different _id. + // Fall back to any error in the map for this failed message. + (m.status === 'failed' && messageErrors + ? Object.values(messageErrors)[0] + : undefined), systemMessageDisplay, systemMessageBody, }; diff --git a/services/platform/convex/_generated/api.d.ts b/services/platform/convex/_generated/api.d.ts index 277ee09f5d..f241ec6b2f 100644 --- a/services/platform/convex/_generated/api.d.ts +++ b/services/platform/convex/_generated/api.d.ts @@ -43,11 +43,8 @@ import type * as agent_tools_files_docx_tool from "../agent_tools/files/docx_too import type * as agent_tools_files_excel_tool from "../agent_tools/files/excel_tool.js"; import type * as agent_tools_files_helpers_analyze_image from "../agent_tools/files/helpers/analyze_image.js"; import type * as agent_tools_files_helpers_analyze_image_by_url from "../agent_tools/files/helpers/analyze_image_by_url.js"; -import type * as agent_tools_files_helpers_analyze_text from "../agent_tools/files/helpers/analyze_text.js"; import type * as agent_tools_files_helpers_append_file_part from "../agent_tools/files/helpers/append_file_part.js"; import type * as agent_tools_files_helpers_get_agent_model from "../agent_tools/files/helpers/get_agent_model.js"; -import type * as agent_tools_files_helpers_parse_file from "../agent_tools/files/helpers/parse_file.js"; -import type * as agent_tools_files_helpers_resolve_file_name from "../agent_tools/files/helpers/resolve_file_name.js"; import type * as agent_tools_files_helpers_vision_agent from "../agent_tools/files/helpers/vision_agent.js"; import type * as agent_tools_files_image_tool from "../agent_tools/files/image_tool.js"; import type * as agent_tools_files_internal_actions from "../agent_tools/files/internal_actions.js"; @@ -81,6 +78,7 @@ import type * as agent_tools_products_helpers_read_product_list from "../agent_t import type * as agent_tools_products_helpers_types from "../agent_tools/products/helpers/types.js"; import type * as agent_tools_products_product_read_tool from "../agent_tools/products/product_read_tool.js"; import type * as agent_tools_rag_format_search_results from "../agent_tools/rag/format_search_results.js"; +import type * as agent_tools_rag_helpers_fetch_document_chunks from "../agent_tools/rag/helpers/fetch_document_chunks.js"; import type * as agent_tools_rag_helpers_list_indexed_documents from "../agent_tools/rag/helpers/list_indexed_documents.js"; import type * as agent_tools_rag_parse_search_results from "../agent_tools/rag/parse_search_results.js"; import type * as agent_tools_rag_query_rag_context from "../agent_tools/rag/query_rag_context.js"; @@ -240,8 +238,6 @@ import type * as documents_find_document_by_title from "../documents/find_docume import type * as documents_generate_document from "../documents/generate_document.js"; import type * as documents_generate_document_helpers from "../documents/generate_document_helpers.js"; import type * as documents_generate_docx from "../documents/generate_docx.js"; -import type * as documents_generate_docx_from_template from "../documents/generate_docx_from_template.js"; -import type * as documents_generate_pptx from "../documents/generate_pptx.js"; import type * as documents_generate_signed_url from "../documents/generate_signed_url.js"; import type * as documents_get_accessible_document_ids from "../documents/get_accessible_document_ids.js"; import type * as documents_get_agent_scoped_file_ids from "../documents/get_agent_scoped_file_ids.js"; @@ -256,7 +252,6 @@ import type * as documents_helpers from "../documents/helpers.js"; import type * as documents_internal_actions from "../documents/internal_actions.js"; import type * as documents_internal_mutations from "../documents/internal_mutations.js"; import type * as documents_internal_queries from "../documents/internal_queries.js"; -import type * as documents_list_documents_by_extension from "../documents/list_documents_by_extension.js"; import type * as documents_list_documents_for_agent from "../documents/list_documents_for_agent.js"; import type * as documents_list_documents_paginated from "../documents/list_documents_paginated.js"; import type * as documents_list_indexed_documents_for_agent from "../documents/list_indexed_documents_for_agent.js"; @@ -276,7 +271,9 @@ import type * as documents_upload_base64_to_storage from "../documents/upload_ba import type * as documents_validators from "../documents/validators.js"; import type * as feedback_mutations from "../feedback/mutations.js"; import type * as feedback_queries from "../feedback/queries.js"; +import type * as file_metadata_actions from "../file_metadata/actions.js"; import type * as file_metadata_helpers from "../file_metadata/helpers.js"; +import type * as file_metadata_internal_actions from "../file_metadata/internal_actions.js"; import type * as file_metadata_internal_mutations from "../file_metadata/internal_mutations.js"; import type * as file_metadata_internal_queries from "../file_metadata/internal_queries.js"; import type * as file_metadata_mutations from "../file_metadata/mutations.js"; @@ -969,11 +966,8 @@ declare const fullApi: ApiFromModules<{ "agent_tools/files/excel_tool": typeof agent_tools_files_excel_tool; "agent_tools/files/helpers/analyze_image": typeof agent_tools_files_helpers_analyze_image; "agent_tools/files/helpers/analyze_image_by_url": typeof agent_tools_files_helpers_analyze_image_by_url; - "agent_tools/files/helpers/analyze_text": typeof agent_tools_files_helpers_analyze_text; "agent_tools/files/helpers/append_file_part": typeof agent_tools_files_helpers_append_file_part; "agent_tools/files/helpers/get_agent_model": typeof agent_tools_files_helpers_get_agent_model; - "agent_tools/files/helpers/parse_file": typeof agent_tools_files_helpers_parse_file; - "agent_tools/files/helpers/resolve_file_name": typeof agent_tools_files_helpers_resolve_file_name; "agent_tools/files/helpers/vision_agent": typeof agent_tools_files_helpers_vision_agent; "agent_tools/files/image_tool": typeof agent_tools_files_image_tool; "agent_tools/files/internal_actions": typeof agent_tools_files_internal_actions; @@ -1007,6 +1001,7 @@ declare const fullApi: ApiFromModules<{ "agent_tools/products/helpers/types": typeof agent_tools_products_helpers_types; "agent_tools/products/product_read_tool": typeof agent_tools_products_product_read_tool; "agent_tools/rag/format_search_results": typeof agent_tools_rag_format_search_results; + "agent_tools/rag/helpers/fetch_document_chunks": typeof agent_tools_rag_helpers_fetch_document_chunks; "agent_tools/rag/helpers/list_indexed_documents": typeof agent_tools_rag_helpers_list_indexed_documents; "agent_tools/rag/parse_search_results": typeof agent_tools_rag_parse_search_results; "agent_tools/rag/query_rag_context": typeof agent_tools_rag_query_rag_context; @@ -1166,8 +1161,6 @@ declare const fullApi: ApiFromModules<{ "documents/generate_document": typeof documents_generate_document; "documents/generate_document_helpers": typeof documents_generate_document_helpers; "documents/generate_docx": typeof documents_generate_docx; - "documents/generate_docx_from_template": typeof documents_generate_docx_from_template; - "documents/generate_pptx": typeof documents_generate_pptx; "documents/generate_signed_url": typeof documents_generate_signed_url; "documents/get_accessible_document_ids": typeof documents_get_accessible_document_ids; "documents/get_agent_scoped_file_ids": typeof documents_get_agent_scoped_file_ids; @@ -1182,7 +1175,6 @@ declare const fullApi: ApiFromModules<{ "documents/internal_actions": typeof documents_internal_actions; "documents/internal_mutations": typeof documents_internal_mutations; "documents/internal_queries": typeof documents_internal_queries; - "documents/list_documents_by_extension": typeof documents_list_documents_by_extension; "documents/list_documents_for_agent": typeof documents_list_documents_for_agent; "documents/list_documents_paginated": typeof documents_list_documents_paginated; "documents/list_indexed_documents_for_agent": typeof documents_list_indexed_documents_for_agent; @@ -1202,7 +1194,9 @@ declare const fullApi: ApiFromModules<{ "documents/validators": typeof documents_validators; "feedback/mutations": typeof feedback_mutations; "feedback/queries": typeof feedback_queries; + "file_metadata/actions": typeof file_metadata_actions; "file_metadata/helpers": typeof file_metadata_helpers; + "file_metadata/internal_actions": typeof file_metadata_internal_actions; "file_metadata/internal_mutations": typeof file_metadata_internal_mutations; "file_metadata/internal_queries": typeof file_metadata_internal_queries; "file_metadata/mutations": typeof file_metadata_mutations; diff --git a/services/platform/convex/agent_tools/files/docx_tool.ts b/services/platform/convex/agent_tools/files/docx_tool.ts index ded9de8122..53e15c8931 100644 --- a/services/platform/convex/agent_tools/files/docx_tool.ts +++ b/services/platform/convex/agent_tools/files/docx_tool.ts @@ -1,6 +1,5 @@ /** Convex Tool: DOCX - * Generate Word (.docx) documents and work with DOCX templates in the documents schema. - * Parse DOCX documents to extract text content. + * Generate Word (.docx) documents from markdown/HTML or structured sections. */ import type { ToolCtx } from '@convex-dev/agent'; @@ -8,29 +7,13 @@ import { createTool } from '@convex-dev/agent'; import { z } from 'zod/v4'; import { internal } from '../../_generated/api'; -import type { ListDocumentsByExtensionResult } from '../../documents/types'; import { createDebugLog } from '../../lib/debug_log'; -import { toId } from '../../lib/type_cast_helpers'; import type { ToolDefinition } from '../types'; import { appendFilePart } from './helpers/append_file_part'; -import { getAgentModelId } from './helpers/get_agent_model'; -import { parseFile, type ParseFileResult } from './helpers/parse_file'; const debugLog = createDebugLog('DEBUG_AGENT_TOOLS', '[AgentTools]'); // Result types -interface ListTemplatesResult { - operation: 'list_templates'; - success: boolean; - templates: Array<{ - fileId: string; - title: string; - createdAt: number; - }>; - totalCount: number; - message: string; -} - interface GenerateDocxResult { operation: 'generate'; success: boolean; @@ -41,9 +24,7 @@ interface GenerateDocxResult { size: number; } -type ParseDocxResult = { operation: 'parse' } & ParseFileResult; - -type DocxResult = ListTemplatesResult | GenerateDocxResult | ParseDocxResult; +type DocxResult = GenerateDocxResult; const sectionSchema = z.object({ type: z @@ -80,23 +61,8 @@ const sectionSchema = z.object({ }); const docxArgs = z.discriminatedUnion('operation', [ - z.object({ - operation: z.literal('list_templates'), - limit: z - .number() - .optional() - .describe( - 'Maximum number of DOCX documents/templates to return (default: 50)', - ), - }), z.object({ operation: z.literal('generate'), - templateStorageId: z - .string() - .optional() - .describe( - 'Convex storage ID of a DOCX template. When provided, the template is used as base, preserving headers, footers, fonts, and page setup.', - ), fileName: z .string() .describe('Base name for the DOCX file (without extension)'), @@ -121,41 +87,22 @@ const docxArgs = z.discriminatedUnion('operation', [ 'Markdown or HTML text content. Use with sourceType. This is the fastest way to generate DOCX from the same content used for PDF generation.', ), }), - z.object({ - operation: z.literal('parse'), - fileId: z - .string() - .describe( - "Convex storage ID (e.g., 'kg2bazp7fbgt9srq63knfagjrd7yfenj'). Get this from the file attachment context.", - ), - filename: z - .string() - .optional() - .describe( - "Original filename (e.g., 'document.docx'). Optional — auto-resolved from file metadata if omitted.", - ), - user_input: z - .string() - .describe( - "The user's question or instruction about the document content", - ), - }), ]); export const docxTool = { name: 'docx' as const, tool: createTool({ - description: `Word document (DOCX) tool for listing templates, generating, and parsing documents. + description: `Word document (DOCX) tool for generating documents. IMPORTANT: Only call the "generate" operation when the user explicitly requests creating or exporting a Word/DOCX file. Do NOT proactively generate Word documents unless the user specifically asks for this format. -OPERATIONS: +TO READ WORD/DOCX FILE CONTENT: Do NOT use this tool. Instead use the rag_search tool: +• To get the full content of a DOCX file: use rag_search with operation='retrieve' and the fileId +• To search for specific information across DOCX files: use rag_search with operation='search' -1. list_templates - List all available DOCX templates - Returns all DOCX documents available in the organization. - Returns: { templates, totalCount, message } +OPERATIONS: -2. generate - Generate a DOCX document +1. generate - Generate a DOCX document TWO MODES: @@ -171,26 +118,14 @@ OPERATIONS: b) From structured sections: Use sections array for fine-grained control over document structure. - Pass templateStorageId to use a template as base. Parameters: - - fileName, title, subtitle, sections, templateStorageId + - fileName, title, subtitle, sections Returns: { success, downloadUrl, fileName, contentType, size } -3. parse - Extract text content from an existing DOCX file - USE THIS when a user uploads a DOCX and you need to read its content. - Parameters: - - fileId: **REQUIRED** - Convex storage ID (e.g., "kg2bazp7fbgt9srq63knfagjrd7yfenj") - - filename: Optional — original filename (e.g., "document.docx"). Auto-resolved from file metadata if omitted. - - user_input: **REQUIRED** - The user's question or instruction about the document - Returns: { success, full_text, paragraph_count, metadata } - EXAMPLES: • From markdown: { "operation": "generate", "fileName": "report", "sourceType": "markdown", "content": "# Report\\n..." } • From HTML: { "operation": "generate", "fileName": "report", "sourceType": "html", "content": "

    Report

    ..." } • From sections: { "operation": "generate", "fileName": "report", "sections": [...] } -• With template: { "operation": "generate", "templateStorageId": "kg...", "fileName": "report", "sections": [...] } -• List templates: { "operation": "list_templates" } -• Parse: { "operation": "parse", "fileId": "kg2bazp7...", "filename": "document.docx", "user_input": "Extract the main points" } AFTER GENERATING: Check the downloadUrl in the result: - If it says "[file card shown in chat]": the file is already visible as a download card. Do NOT mention downloading, do NOT include a link, and do NOT say "you can download it" — the card handles this. @@ -201,78 +136,6 @@ To also save the file to a folder in the documents hub, call document_write with execute: async (ctx: ToolCtx, args): Promise => { const { organizationId } = ctx; - if (args.operation === 'list_templates') { - if (!organizationId) { - return { - operation: 'list_templates', - success: false, - templates: [], - totalCount: 0, - message: - 'No organizationId in context - cannot list DOCX templates. This tool requires organizationId to be set.', - }; - } - - debugLog('tool:docx list_templates start', { - organizationId, - limit: args.limit, - }); - - try { - const documents: ListDocumentsByExtensionResult = await ctx.runQuery( - internal.documents.internal_queries.listDocumentsByExtension, - { - organizationId, - extension: 'docx', - limit: args.limit, - }, - ); - - const templates = documents - .filter( - (doc): doc is typeof doc & { fileId: string } => !!doc.fileId, - ) - .map((doc) => ({ - fileId: doc.fileId, - title: doc.title ?? 'Untitled Document', - createdAt: doc._creationTime, - })); - - debugLog('tool:docx list_templates success', { - totalCount: templates.length, - }); - - return { - operation: 'list_templates', - success: true, - templates, - totalCount: templates.length, - message: - templates.length > 0 - ? `Found ${templates.length} DOCX template(s). Use the fileId when referencing these templates.` - : 'No DOCX templates found. Upload a DOCX file first to use it as a template.', - }; - } catch (error) { - console.error('[tool:docx list_templates] error', { - error: error instanceof Error ? error.message : String(error), - }); - throw error; - } - } - - if (args.operation === 'parse') { - const model = getAgentModelId(ctx); - const result = await parseFile( - ctx, - args.fileId, - args.filename, - 'docx', - args.user_input, - model, - ); - return { operation: 'parse', ...result }; - } - // operation === 'generate' if (!organizationId) { throw new Error('organizationId is required to generate a document'); @@ -363,50 +226,11 @@ To also save the file to a folder in the documents hub, call document_write with debugLog('tool:docx generate start', { fileName: args.fileName, sectionsCount: args.sections.length, - hasTemplate: !!args.templateStorageId, }); try { const sections = args.sections ?? []; - // If templateStorageId is provided, use template-based generation - if (args.templateStorageId) { - const result = await ctx.runAction( - internal.documents.internal_actions.generateDocxFromTemplate, - { - organizationId, - fileName: args.fileName, - content: { - title: args.title, - subtitle: args.subtitle, - sections, - }, - templateStorageId: toId<'_storage'>(args.templateStorageId), - }, - ); - - debugLog('tool:docx generate (from template) success', { - fileName: result.fileName, - fileStorageId: result.fileStorageId, - size: result.size, - }); - - const cardAppended = await appendFilePart(ctx, { - fileName: result.fileName, - mimeType: result.contentType, - downloadUrl: result.downloadUrl, - }); - - return { - operation: 'generate', - ...result, - downloadUrl: cardAppended - ? '[file card shown in chat]' - : result.downloadUrl, - } as GenerateDocxResult; - } - - // Otherwise, generate from scratch const result = await ctx.runAction( internal.documents.internal_actions.generateDocx, { diff --git a/services/platform/convex/agent_tools/files/excel_tool.ts b/services/platform/convex/agent_tools/files/excel_tool.ts index 54a33c8ed7..dd87aaa96b 100644 --- a/services/platform/convex/agent_tools/files/excel_tool.ts +++ b/services/platform/convex/agent_tools/files/excel_tool.ts @@ -1,6 +1,5 @@ /** Convex Tool: Excel * Generate Excel (.xlsx) files from tabular data. - * Parse Excel files to extract structured content. */ import type { ToolCtx } from '@convex-dev/agent'; @@ -10,10 +9,8 @@ import { z } from 'zod/v4'; import { internal } from '../../_generated/api'; import { createDebugLog } from '../../lib/debug_log'; import { buildDownloadUrl } from '../../lib/helpers/public_storage_url'; -import { toId } from '../../lib/type_cast_helpers'; import type { ToolDefinition } from '../types'; import { appendFilePart } from './helpers/append_file_part'; -import { resolveFileName } from './helpers/resolve_file_name'; const debugLog = createDebugLog('DEBUG_AGENT_TOOLS', '[AgentTools]'); @@ -29,90 +26,53 @@ interface GenerateExcelResult { error?: string; } -interface ParseExcelResult { - operation: 'parse'; - success: boolean; - fileName: string; - sheets: Array<{ - name: string; - headers: string[]; - rows: Array>; - rowCount: number; - }>; - totalRows: number; - sheetCount: number; - error?: string; -} - -type ExcelResult = GenerateExcelResult | ParseExcelResult; - -const excelArgs = z.discriminatedUnion('operation', [ - z.object({ - operation: z.literal('generate'), - fileName: z - .string() - .describe('Base name for the Excel file (without extension)'), - sheets: z - .array( - z.object({ - name: z.string().describe('Sheet name'), - headers: z - .array(z.string()) - .nonempty() - .describe( - "Column headers for the sheet (must align with each row's columns)", - ), - rows: z - .array( - z.array(z.union([z.string(), z.number(), z.boolean(), z.null()])), - ) - .describe('2D array of cell values (rows x columns)'), - }), - ) - .describe('Sheets to include in the Excel file'), - }), - z.object({ - operation: z.literal('parse'), - fileId: z - .string() - .describe( - "Convex storage ID (e.g., 'kg2bazp7fbgt9srq63knfagjrd7yfenj'). Get this from the file attachment context.", - ), - filename: z - .string() - .optional() - .describe( - "Original filename (e.g., 'report.xlsx'). Optional — auto-resolved from file metadata if omitted.", - ), - }), -]); +const excelArgs = z.object({ + operation: z.literal('generate'), + fileName: z + .string() + .describe('Base name for the Excel file (without extension)'), + sheets: z + .array( + z.object({ + name: z.string().describe('Sheet name'), + headers: z + .array(z.string()) + .nonempty() + .describe( + "Column headers for the sheet (must align with each row's columns)", + ), + rows: z + .array( + z.array(z.union([z.string(), z.number(), z.boolean(), z.null()])), + ) + .describe('2D array of cell values (rows x columns)'), + }), + ) + .describe('Sheets to include in the Excel file'), +}); export const excelTool = { name: 'excel' as const, tool: createTool({ - description: `Excel (.xlsx) tool for generating and parsing spreadsheet files. + description: `Excel (.xlsx) tool for generating spreadsheet files. -IMPORTANT: Only call the "generate" operation when the user explicitly requests creating or exporting an Excel/spreadsheet file. Do NOT proactively generate Excel files unless the user specifically asks for this format. +IMPORTANT: Only call this tool when the user explicitly requests creating or exporting an Excel/spreadsheet file. Do NOT proactively generate Excel files unless the user specifically asks for this format. -OPERATIONS: +TO READ EXCEL FILE CONTENT: Do NOT use this tool. Instead use the rag_search tool: +• To get the full content of an Excel file: use rag_search with operation='retrieve' and the fileId +• To search for specific information across Excel files: use rag_search with operation='search' -1. generate - Generate an Excel file from structured tabular data - Use this when the user asks for an Excel/Spreadsheet export (e.g. customer lists, product tables, analytics). - Parameters: - - fileName: Base name for the Excel file (without extension) - - sheets: Array of sheets with names, headers, and rows - Returns: { success, downloadUrl, fileName, rowCount, sheetCount } +OPERATION: -2. parse - Extract structured data from an existing Excel file - USE THIS when a user uploads an Excel file and you need to read its content. - Parameters: - - fileId: **REQUIRED** - Convex storage ID (e.g., "kg2bazp7fbgt9srq63knfagjrd7yfenj") - - filename: Optional — original filename (e.g., "report.xlsx"). Auto-resolved from file metadata if omitted. - Returns: { success, sheets (with headers and rows), totalRows, sheetCount } +generate - Generate an Excel file from structured tabular data + Use this when the user asks for an Excel/Spreadsheet export (e.g. customer lists, product tables, analytics). + Parameters: + - fileName: Base name for the Excel file (without extension) + - sheets: Array of sheets with names, headers, and rows + Returns: { success, downloadUrl, fileName, rowCount, sheetCount } -EXAMPLES: +EXAMPLE: • Generate: { "operation": "generate", "fileName": "customers", "sheets": [{ "name": "Sheet1", "headers": ["Name", "Email"], "rows": [["Alice", "alice@example.com"]] }] } -• Parse: { "operation": "parse", "fileId": "kg2bazp7...", "filename": "report.xlsx" } AFTER GENERATING: Check the downloadUrl in the result: - If it says "[file card shown in chat]": the file is already visible as a download card. Do NOT mention downloading, do NOT include a link, and do NOT say "you can download it" — the card handles this. @@ -120,61 +80,7 @@ AFTER GENERATING: Check the downloadUrl in the result: To also save the file to a folder in the documents hub, call document_write with the returned fileStorageId and the desired folderPath. `, inputSchema: excelArgs, - execute: async (ctx: ToolCtx, args): Promise => { - if (args.operation === 'parse') { - const resolvedFilename = await resolveFileName( - ctx, - args.fileId, - args.filename, - ); - - debugLog('tool:excel parse start', { - fileId: args.fileId, - filename: resolvedFilename, - }); - - try { - const result = await ctx.runAction( - internal.node_only.documents.internal_actions.parseExcel, - { - storageId: toId<'_storage'>(args.fileId), - }, - ); - - debugLog('tool:excel parse success', { - filename: resolvedFilename, - sheetCount: result.sheetCount, - totalRows: result.totalRows, - }); - - return { - operation: 'parse', - success: true, - fileName: resolvedFilename, - sheets: result.sheets, - totalRows: result.totalRows, - sheetCount: result.sheetCount, - }; - } catch (error) { - const message = - error instanceof Error ? error.message : String(error); - console.error('[tool:excel parse] error', { - fileId: args.fileId, - error: message, - }); - return { - operation: 'parse', - success: false, - fileName: resolvedFilename, - sheets: [], - totalRows: 0, - sheetCount: 0, - error: message, - }; - } - } - - // operation === 'generate' + execute: async (ctx: ToolCtx, args): Promise => { debugLog('tool:excel generate start', { fileName: args.fileName, sheetCount: args.sheets.length, diff --git a/services/platform/convex/agent_tools/files/helpers/analyze_text.ts b/services/platform/convex/agent_tools/files/helpers/analyze_text.ts deleted file mode 100644 index 41c0691c02..0000000000 --- a/services/platform/convex/agent_tools/files/helpers/analyze_text.ts +++ /dev/null @@ -1,467 +0,0 @@ -/** - * Helper for analyzing text files using the fast model. - * Handles encoding detection, chunking for large files, and LLM analysis. - * Uses ctx.storage.get() for direct Convex storage access (like analyze_image.ts). - * Uses Agent framework with saveMessages: 'none' to avoid creating visible thread messages. - */ - -import type { LanguageModelV3 } from '@ai-sdk/provider'; -import { Agent } from '@convex-dev/agent'; - -import { components } from '../../../_generated/api'; -import type { ActionCtx } from '../../../_generated/server'; -import { createDebugLog } from '../../../lib/debug_log'; -import { toId } from '../../../lib/type_cast_helpers'; - -const debugLog = createDebugLog('DEBUG_TEXT_ANALYSIS', '[TextAnalysis]'); - -const LLM_CHUNK_SIZE = 80 * 1024; // 80KB chunks for LLM processing -const MAX_TEXT_BYTES = 10 * 1024 * 1024; // 10MB max file size -const MAX_CONCURRENT_CHUNKS = 5; // Limit concurrent LLM requests to avoid rate limiting -const MAX_TOTAL_CHUNK_OUTPUT_CHARS = 30000; // Total chars budget for all chunk outputs combined -const MAX_FINAL_RESPONSE_CHARS = 10000; // Max chars for final aggregated response - -/** - * Process items with controlled concurrency (like p-map). - */ -async function mapWithConcurrency( - items: T[], - fn: (item: T, index: number) => Promise, - concurrency: number, -): Promise { - const results: R[] = new Array(items.length); - let nextIndex = 0; - - async function worker() { - while (nextIndex < items.length) { - const index = nextIndex++; - results[index] = await fn(items[index], index); - } - } - - const workers = Array.from( - { length: Math.min(concurrency, items.length) }, - () => worker(), - ); - await Promise.all(workers); - return results; -} -const SUPPORTED_ENCODINGS = [ - 'utf-8', - 'utf-16le', - 'utf-16be', - 'gbk', - 'gb2312', - 'big5', - 'shift_jis', - 'iso-8859-1', -]; - -export interface AnalyzeTextParams { - fileId: string; - filename: string; - userInput: string; - model: string; - languageModel: LanguageModelV3; -} - -export interface AnalyzeTextUsage { - inputTokens: number; - outputTokens: number; - totalTokens: number; -} - -export interface AnalyzeTextResult { - success: boolean; - result: string; - charCount: number; - lineCount: number; - encoding: string; - chunked: boolean; - chunkCount?: number; - model?: string; - usage?: AnalyzeTextUsage; - error?: string; -} - -function decodeWithEncoding(buffer: ArrayBuffer): { - text: string; - encoding: string; -} { - for (const encoding of SUPPORTED_ENCODINGS) { - try { - const decoder = new TextDecoder(encoding, { fatal: true }); - const text = decoder.decode(buffer); - if (text.length > 0 && !text.includes('\uFFFD')) { - return { text, encoding }; - } - } catch { - continue; - } - } - - const decoder = new TextDecoder('utf-8', { fatal: false }); - return { text: decoder.decode(buffer), encoding: 'utf-8 (fallback)' }; -} - -function isBinaryContent(text: string): boolean { - const sampleSize = Math.min(1000, text.length); - const sample = text.slice(0, sampleSize); - - let nullCount = 0; - let controlCount = 0; - - for (let i = 0; i < sample.length; i++) { - const code = sample.charCodeAt(i); - if (code === 0) nullCount++; - // Control chars (except tab, newline, carriage return) - if (code < 32 && code !== 9 && code !== 10 && code !== 13) controlCount++; - } - - const nullRatio = nullCount / sampleSize; - const controlRatio = controlCount / sampleSize; - - return nullRatio > 0.01 || controlRatio > 0.1; -} - -function splitIntoChunks(text: string, chunkSize: number): string[] { - const chunks: string[] = []; - let start = 0; - - while (start < text.length) { - let end = Math.min(start + chunkSize, text.length); - - // Try to break at a line boundary if not at the end - if (end < text.length) { - const lastNewline = text.lastIndexOf('\n', end); - if (lastNewline > start + chunkSize * 0.5) { - end = lastNewline + 1; - } - } - - chunks.push(text.slice(start, end)); - start = end; - } - - return chunks; -} - -const TEXT_ANALYSIS_INSTRUCTIONS = `You are a text analysis assistant. Your job is to analyze text content and answer the user's question accurately. - -Guidelines: -- Focus on answering the user's specific question -- Extract relevant information from the text -- Be concise but thorough -- If the text doesn't contain relevant information, say so clearly -- For large texts processed in chunks, focus on the most relevant parts`; - -function createTextAnalysisAgent(languageModel: LanguageModelV3): Agent { - const instructions = `${TEXT_ANALYSIS_INSTRUCTIONS}\n\nIf you use any tools, you must always conclude by producing a final assistant message with the answer.`; - - return new Agent(components.agent, { - name: 'text-analyzer', - languageModel, - instructions, - }); -} - -/** - * Generate unique userId for one-off analysis (messages won't be saved). - */ -function generateEphemeralUserId(): string { - return `text-analyzer-${Date.now()}-${Math.random().toString(36).slice(2, 9)}`; -} - -interface ChunkResult { - text: string; - usage: AnalyzeTextUsage; -} - -async function analyzeChunk( - ctx: ActionCtx, - agent: Agent, - text: string, - userInput: string, - chunkIndex?: number, - totalChunks?: number, - maxResponseChars?: number, -): Promise { - const chunkInfo = - totalChunks && totalChunks > 1 - ? `\n\n[Processing chunk ${(chunkIndex ?? 0) + 1} of ${totalChunks}]` - : ''; - - // Dynamic limit based on chunk count, or use full budget for single chunk - const charLimit = maxResponseChars ?? MAX_FINAL_RESPONSE_CHARS; - - const prompt = `User Question: ${userInput}${chunkInfo} - -Text Content: ---- -${text} ---- - -Please analyze the text above and answer the user's question. -IMPORTANT: Keep your response under ${charLimit} characters. Be concise and focus on key findings.`; - - const result = await agent.generateText( - ctx, - { userId: generateEphemeralUserId() }, - { prompt }, - { storageOptions: { saveMessages: 'none' } }, - ); - - const inputTokens = result.usage?.inputTokens ?? 0; - const outputTokens = result.usage?.outputTokens ?? 0; - - return { - text: result.text || '', - usage: { - inputTokens, - outputTokens, - totalTokens: inputTokens + outputTokens, - }, - }; -} - -async function aggregateChunkResults( - ctx: ActionCtx, - agent: Agent, - chunkResults: string[], - userInput: string, -): Promise { - if (chunkResults.length <= 1) { - return { - text: chunkResults[0] ?? '', - usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }, - }; - } - - const combinedResults = chunkResults - .map((r, i) => `[Chunk ${i + 1} Analysis]\n${r}`) - .join('\n\n---\n\n'); - - const prompt = `The following are analysis results from different parts of a large text file. -User's original question: ${userInput} - -Analysis Results: -${combinedResults} - -Please synthesize these results into a coherent, comprehensive answer to the user's question. -Remove any redundancy and present the key findings clearly. -IMPORTANT: Keep your final response under ${MAX_FINAL_RESPONSE_CHARS} characters. Prioritize the most important information.`; - - try { - const result = await agent.generateText( - ctx, - { userId: generateEphemeralUserId() }, - { prompt }, - { storageOptions: { saveMessages: 'none' } }, - ); - - const inputTokens = result.usage?.inputTokens ?? 0; - const outputTokens = result.usage?.outputTokens ?? 0; - - return { - text: result.text || '', - usage: { - inputTokens, - outputTokens, - totalTokens: inputTokens + outputTokens, - }, - }; - } catch (error) { - debugLog('aggregateChunkResults error', { - error: error instanceof Error ? error.message : String(error), - }); - throw error; - } -} - -/** - * Analyze text file content using fast model. - * Uses ctx.storage.get() for direct Convex storage access (like analyze_image.ts). - * For large files, splits into chunks and processes each with the user's question. - * Uses Agent framework with saveMessages: 'none' to avoid creating visible thread messages. - */ -export async function analyzeTextContent( - ctx: ActionCtx, - params: AnalyzeTextParams, -): Promise { - const { fileId, filename, userInput, model } = params; - - debugLog('analyzeTextContent starting', { - fileId, - filename, - userInput: - userInput.length > 50 ? userInput.slice(0, 50) + '...' : userInput, - }); - - try { - // Get the text file blob from storage (like analyze_image.ts) - const textBlob = await ctx.storage.get(toId<'_storage'>(fileId)); - if (!textBlob) { - throw new Error(`Text file not found in storage: ${fileId}`); - } - - debugLog('analyzeTextContent got blob', { size: textBlob.size }); - - // Check file size limit - if (textBlob.size > MAX_TEXT_BYTES) { - const sizeMB = (textBlob.size / (1024 * 1024)).toFixed(2); - const maxMB = (MAX_TEXT_BYTES / (1024 * 1024)).toFixed(0); - return { - success: false, - result: '', - charCount: 0, - lineCount: 0, - encoding: 'unknown', - chunked: false, - error: `Text file is too large (${sizeMB}MB). Please upload a file smaller than ${maxMB}MB.`, - }; - } - - const buffer = await textBlob.arrayBuffer(); - debugLog('analyzeTextContent loaded', { bytes: buffer.byteLength }); - - const { text, encoding } = decodeWithEncoding(buffer); - - if (isBinaryContent(text)) { - return { - success: false, - result: '', - charCount: 0, - lineCount: 0, - encoding, - chunked: false, - error: - 'The file appears to be binary, not a text-based file. Please upload a valid text file (.txt, .md, .js, .ts, .json, .csv, .log, etc.).', - }; - } - - const charCount = text.length; - const lineCount = text.split('\n').length; - - debugLog('analyzeTextContent decoded', { charCount, lineCount, encoding }); - - const agent = createTextAnalysisAgent(params.languageModel); - - // For smaller content, process in one pass - if (charCount <= LLM_CHUNK_SIZE) { - const chunkResult = await analyzeChunk(ctx, agent, text, userInput); - - return { - success: true, - result: chunkResult.text, - charCount, - lineCount, - encoding, - chunked: false, - model, - usage: chunkResult.usage, - }; - } - - // For larger content, split into chunks and process with controlled concurrency - const chunks = splitIntoChunks(text, LLM_CHUNK_SIZE); - - // Dynamic per-chunk output limit: divide total budget by chunk count - const perChunkMaxChars = Math.floor( - MAX_TOTAL_CHUNK_OUTPUT_CHARS / chunks.length, - ); - - debugLog('analyzeTextContent chunking', { - chunkCount: chunks.length, - chunkSizes: chunks.map((c) => c.length), - perChunkMaxChars, - concurrency: MAX_CONCURRENT_CHUNKS, - }); - - // Process chunks with controlled concurrency to avoid rate limiting - const startTime = Date.now(); - const chunkResults = await mapWithConcurrency( - chunks, - async (chunk, i) => { - debugLog('analyzeTextContent processing chunk', { - chunk: `${i + 1}/${chunks.length}`, - chunkSize: chunk.length, - }); - const result = await analyzeChunk( - ctx, - agent, - chunk, - userInput, - i, - chunks.length, - perChunkMaxChars, - ); - debugLog('analyzeTextContent chunk completed', { - chunk: `${i + 1}/${chunks.length}`, - resultLength: result.text.length, - elapsedMs: Date.now() - startTime, - }); - return result; - }, - MAX_CONCURRENT_CHUNKS, - ); - debugLog('analyzeTextContent all chunks completed', { - chunkCount: chunkResults.length, - totalElapsedMs: Date.now() - startTime, - }); - - // Accumulate usage from all chunks - const totalUsage: AnalyzeTextUsage = { - inputTokens: 0, - outputTokens: 0, - totalTokens: 0, - }; - for (const cr of chunkResults) { - totalUsage.inputTokens += cr.usage.inputTokens; - totalUsage.outputTokens += cr.usage.outputTokens; - totalUsage.totalTokens += cr.usage.totalTokens; - } - - debugLog('analyzeTextContent aggregating results', { - chunkCount: chunkResults.length, - }); - const aggregationResult = await aggregateChunkResults( - ctx, - agent, - chunkResults.map((cr) => cr.text), - userInput, - ); - debugLog('analyzeTextContent aggregation completed', { - resultLength: aggregationResult.text.length, - }); - - // Add aggregation usage - totalUsage.inputTokens += aggregationResult.usage.inputTokens; - totalUsage.outputTokens += aggregationResult.usage.outputTokens; - totalUsage.totalTokens += aggregationResult.usage.totalTokens; - - return { - success: true, - result: aggregationResult.text, - charCount, - lineCount, - encoding, - chunked: true, - chunkCount: chunks.length, - model, - usage: totalUsage, - }; - } catch (error) { - const errorMessage = error instanceof Error ? error.message : String(error); - debugLog('analyzeTextContent error', { filename, error: errorMessage }); - - return { - success: false, - result: '', - charCount: 0, - lineCount: 0, - encoding: 'unknown', - chunked: false, - error: errorMessage, - }; - } -} diff --git a/services/platform/convex/agent_tools/files/helpers/parse_file.ts b/services/platform/convex/agent_tools/files/helpers/parse_file.ts deleted file mode 100644 index 62a388c47b..0000000000 --- a/services/platform/convex/agent_tools/files/helpers/parse_file.ts +++ /dev/null @@ -1,158 +0,0 @@ -/** - * Shared file parsing helper for PDF, DOCX, and PPTX tools. - * Gets file from Convex storage and sends it to the crawler service for text extraction. - * Uses ctx.storage.get() for direct Convex storage access (like image_tool and text_tool). - */ - -import { getParseEndpoint } from '../../../../lib/shared/file-types'; -import { fetchJson } from '../../../../lib/utils/type-cast-helpers'; -import type { ActionCtx } from '../../../_generated/server'; -import { createDebugLog } from '../../../lib/debug_log'; -import { toId } from '../../../lib/type_cast_helpers'; -import { getCrawlerServiceUrl } from '../../web/helpers/get_crawler_service_url'; -import { resolveFileName } from './resolve_file_name'; - -const debugLog = createDebugLog('DEBUG_AGENT_TOOLS', '[AgentTools]'); - -export interface ParseFileResult { - success: boolean; - filename: string; - file_type?: string; - full_text?: string; - page_count?: number; - slide_count?: number; - paragraph_count?: number; - metadata?: { - title?: string; - author?: string; - subject?: string; - }; - usage?: { - inputTokens: number; - outputTokens: number; - totalTokens: number; - durationMs?: number; - model?: string; - }; - error?: string; -} - -/** - * Parse a file by getting it from Convex storage and sending it to the crawler service. - * @param ctx - Action context for storage access - * @param fileId - Convex storage ID of the file - * @param filename - Original filename with extension (optional, resolved from fileMetadata if not provided) - * @param toolName - Name of the calling tool (for logging) - * @param userInput - Optional user question/instruction to guide parsing - * @returns ParseFileResult with extracted text and metadata - */ -export async function parseFile( - ctx: ActionCtx, - fileId: string, - filename: string | undefined, - toolName: string, - userInput?: string, - model?: string, -): Promise { - const resolvedFilename = await resolveFileName(ctx, fileId, filename); - - debugLog(`tool:${toolName} parse start`, { - fileId, - filename: resolvedFilename, - }); - - try { - // Get the file blob from Convex storage (like image_tool and text_tool) - const fileBlob = await ctx.storage.get(toId<'_storage'>(fileId)); - if (!fileBlob) { - throw new Error(`File not found in storage: ${fileId}`); - } - - debugLog(`tool:${toolName} parse got blob`, { - filename: resolvedFilename, - size: fileBlob.size, - type: fileBlob.type, - }); - - const crawlerUrl = getCrawlerServiceUrl(); - const endpointPath = getParseEndpoint(resolvedFilename); - const apiUrl = `${crawlerUrl}${endpointPath}`; - - // Create FormData and upload to crawler service - const formData = new FormData(); - formData.append('file', fileBlob, resolvedFilename); - if (userInput) { - formData.append('user_input', userInput); - } - if (model) { - formData.append('model', model); - } - - debugLog(`tool:${toolName} parse uploading to crawler`, { - filename: resolvedFilename, - size: fileBlob.size, - endpoint: endpointPath, - hasUserInput: !!userInput, - model: model ?? null, - }); - - const controller = new AbortController(); - const timeoutId = setTimeout(() => controller.abort(), 300_000); - - const response = await fetch(apiUrl, { - method: 'POST', - body: formData, - signal: controller.signal, - }); - - clearTimeout(timeoutId); - - if (!response.ok) { - const errorText = await response.text().catch(() => ''); - throw new Error(`Crawler service error: ${response.status} ${errorText}`); - } - - interface RawCrawlerUsage { - input_tokens?: number; - output_tokens?: number; - total_tokens?: number; - duration_ms?: number; - model?: string; - } - - const raw = await fetchJson( - response, - ); - - // Remap snake_case usage from crawler to camelCase - const result: ParseFileResult = { ...raw }; - if (raw.usage) { - result.usage = { - inputTokens: raw.usage.input_tokens ?? 0, - outputTokens: raw.usage.output_tokens ?? 0, - totalTokens: raw.usage.total_tokens ?? 0, - durationMs: raw.usage.duration_ms, - model: raw.usage.model, - }; - } - - debugLog(`tool:${toolName} parse success`, { - filename: result.filename, - success: result.success, - textLength: result.full_text?.length ?? 0, - }); - - return result; - } catch (error) { - const message = error instanceof Error ? error.message : String(error); - console.error(`[tool:${toolName} parse] error`, { - filename: resolvedFilename, - error: message, - }); - return { - success: false, - filename: resolvedFilename, - error: message, - }; - } -} diff --git a/services/platform/convex/agent_tools/files/helpers/resolve_file_name.ts b/services/platform/convex/agent_tools/files/helpers/resolve_file_name.ts deleted file mode 100644 index 1fcd1b3735..0000000000 --- a/services/platform/convex/agent_tools/files/helpers/resolve_file_name.ts +++ /dev/null @@ -1,32 +0,0 @@ -/** - * Resolves a filename for a given storage ID. - * If a filename is provided, returns it directly. - * Otherwise, looks up the filename from the fileMetadata table. - */ - -import { internal } from '../../../_generated/api'; -import type { ActionCtx } from '../../../_generated/server'; -import { toId } from '../../../lib/type_cast_helpers'; - -export async function resolveFileName( - ctx: ActionCtx, - fileId: string, - providedFilename?: string, -): Promise { - if (providedFilename) { - return providedFilename; - } - - const metadata = await ctx.runQuery( - internal.file_metadata.internal_queries.getByStorageId, - { storageId: toId<'_storage'>(fileId) }, - ); - - if (!metadata) { - throw new Error( - `Could not resolve filename for fileId '${fileId}'. No fileMetadata record found. Please provide filename explicitly.`, - ); - } - - return metadata.fileName; -} diff --git a/services/platform/convex/agent_tools/files/internal_actions.ts b/services/platform/convex/agent_tools/files/internal_actions.ts index 9cbc6d0bdd..e14426f6ca 100644 --- a/services/platform/convex/agent_tools/files/internal_actions.ts +++ b/services/platform/convex/agent_tools/files/internal_actions.ts @@ -12,59 +12,6 @@ import { analyzeImage as analyzeImageHelper, type AnalyzeImageResult, } from './helpers/analyze_image'; -import { - parseFile as parseFileHelper, - type ParseFileResult, -} from './helpers/parse_file'; - -/** - * Internal action for parsing files (PDF, DOCX, PPTX). - * Wrapped for caching - same fileId/filename should return same result. - */ -export const parseFileUncached = internalAction({ - args: { - fileId: v.string(), - filename: v.string(), - toolName: v.string(), - model: v.optional(v.string()), - }, - returns: v.object({ - success: v.boolean(), - filename: v.string(), - file_type: v.optional(v.string()), - full_text: v.optional(v.string()), - page_count: v.optional(v.number()), - slide_count: v.optional(v.number()), - paragraph_count: v.optional(v.number()), - metadata: v.optional( - v.object({ - title: v.optional(v.string()), - author: v.optional(v.string()), - subject: v.optional(v.string()), - }), - ), - usage: v.optional( - v.object({ - inputTokens: v.number(), - outputTokens: v.number(), - totalTokens: v.number(), - durationMs: v.optional(v.number()), - model: v.optional(v.string()), - }), - ), - error: v.optional(v.string()), - }), - handler: async (ctx, args): Promise => { - return await parseFileHelper( - ctx, - args.fileId, - args.filename, - args.toolName, - undefined, - args.model, - ); - }, -}); /** * Internal action for analyzing images with vision model. diff --git a/services/platform/convex/agent_tools/files/pdf_tool.ts b/services/platform/convex/agent_tools/files/pdf_tool.ts index 3b4caea6ee..dda8765aba 100644 --- a/services/platform/convex/agent_tools/files/pdf_tool.ts +++ b/services/platform/convex/agent_tools/files/pdf_tool.ts @@ -1,6 +1,5 @@ /** Convex Tool: PDF * Generate PDF documents from Markdown/HTML/URL via the crawler service. - * Parse PDF documents to extract text content. */ import type { ToolCtx } from '@convex-dev/agent'; @@ -11,8 +10,6 @@ import { internal } from '../../_generated/api'; import { createDebugLog } from '../../lib/debug_log'; import type { ToolDefinition } from '../types'; import { appendFilePart } from './helpers/append_file_part'; -import { getAgentModelId } from './helpers/get_agent_model'; -import { parseFile, type ParseFileResult } from './helpers/parse_file'; const debugLog = createDebugLog('DEBUG_AGENT_TOOLS', '[AgentTools]'); @@ -27,17 +24,17 @@ interface GeneratePdfResult { size: number; } -type ParsePdfResult = { operation: 'parse' } & ParseFileResult; - -type PdfResult = GeneratePdfResult | ParsePdfResult; - export const pdfTool = { name: 'pdf' as const, tool: createTool({ - description: `PDF tool for generating, downloading, and parsing PDF documents. + description: `PDF tool for generating and downloading PDF documents. IMPORTANT: Only call the "generate" operation when the user explicitly requests creating or exporting a PDF file. Do NOT proactively generate PDFs unless the user specifically asks for this format. +TO READ PDF FILE CONTENT: Do NOT use this tool. Instead use the rag_search tool: +• To get the full content of a PDF file: use rag_search with operation='retrieve' and the fileId +• To search for specific information across PDF files: use rag_search with operation='search' + OPERATIONS: 1. generate - Generate a PDF from Markdown/HTML, or download/capture a PDF from a URL @@ -58,95 +55,53 @@ OPERATIONS: • Use this to download and store existing PDF files from external URLs • The returned fileStorageId can be passed to document_write to save to a folder in the documents hub -2. parse - Extract text content from an existing PDF file - USE THIS when a user uploads a PDF and you need to read its content. - Parameters: - - fileId: **REQUIRED** - Convex storage ID (e.g., "kg2bazp7fbgt9srq63knfagjrd7yfenj") - - filename: Optional — original filename (e.g., "report.pdf"). Auto-resolved from file metadata if omitted. - - user_input: **REQUIRED** - The user's question or instruction about the PDF - Returns: { success, full_text, page_count, metadata } - EXAMPLES: • Generate: { "operation": "generate", "fileName": "report", "sourceType": "markdown", "content": "# Report\\n..." } • Download existing PDF: { "operation": "generate", "fileName": "report", "sourceType": "url", "content": "https://example.com/report.pdf" } -• Parse: { "operation": "parse", "fileId": "kg2bazp7...", "filename": "report.pdf", "user_input": "Summarize the key findings" } AFTER GENERATING: Check the downloadUrl in the result: - If it says "[file card shown in chat]": the file is already visible as a download card. Do NOT mention downloading, do NOT include a link, and do NOT say "you can download it" — the card handles this. - If it contains an actual URL: no download card was shown. You MUST include the URL as a clickable markdown link so the user can download the file. To also save the file to a folder in the documents hub, call document_write with the returned fileStorageId and the desired folderPath. `, - inputSchema: z.discriminatedUnion('operation', [ - z.object({ - operation: z.literal('generate'), - fileName: z - .string() - .describe('Base name for the PDF file (without extension)'), - sourceType: z - .enum(['markdown', 'html', 'url']) - .describe('Type of source content'), - content: z - .string() - .describe('Markdown text, HTML content, or URL to capture'), - pdfOptions: z - .object({ - format: z.string().optional(), - landscape: z.boolean().optional(), - marginTop: z.string().optional(), - marginBottom: z.string().optional(), - marginLeft: z.string().optional(), - marginRight: z.string().optional(), - printBackground: z.boolean().optional(), - }) - .optional() - .describe('Advanced PDF options'), - urlOptions: z - .object({ - waitUntil: z - .enum(['load', 'domcontentloaded', 'networkidle', 'commit']) - .optional(), - }) - .optional() - .describe('Options for URL capture'), - extraCss: z.string().optional().describe('Additional CSS to inject'), - wrapInTemplate: z - .boolean() - .optional() - .describe('Whether to wrap in HTML template'), - }), - z.object({ - operation: z.literal('parse'), - fileId: z - .string() - .describe( - "Convex storage ID (e.g., 'kg2bazp7fbgt9srq63knfagjrd7yfenj'). Get this from the file attachment context.", - ), - filename: z - .string() - .optional() - .describe( - "Original filename (e.g., 'report.pdf'). Optional — auto-resolved from file metadata if omitted.", - ), - user_input: z - .string() - .describe("The user's question or instruction about the PDF content"), - }), - ]), - execute: async (ctx: ToolCtx, args): Promise => { - if (args.operation === 'parse') { - const model = getAgentModelId(ctx); - const result = await parseFile( - ctx, - args.fileId, - args.filename, - 'pdf', - args.user_input, - model, - ); - return { operation: 'parse', ...result }; - } - - // operation === 'generate' + inputSchema: z.object({ + operation: z.literal('generate'), + fileName: z + .string() + .describe('Base name for the PDF file (without extension)'), + sourceType: z + .enum(['markdown', 'html', 'url']) + .describe('Type of source content'), + content: z + .string() + .describe('Markdown text, HTML content, or URL to capture'), + pdfOptions: z + .object({ + format: z.string().optional(), + landscape: z.boolean().optional(), + marginTop: z.string().optional(), + marginBottom: z.string().optional(), + marginLeft: z.string().optional(), + marginRight: z.string().optional(), + printBackground: z.boolean().optional(), + }) + .optional() + .describe('Advanced PDF options'), + urlOptions: z + .object({ + waitUntil: z + .enum(['load', 'domcontentloaded', 'networkidle', 'commit']) + .optional(), + }) + .optional() + .describe('Options for URL capture'), + extraCss: z.string().optional().describe('Additional CSS to inject'), + wrapInTemplate: z + .boolean() + .optional() + .describe('Whether to wrap in HTML template'), + }), + execute: async (ctx: ToolCtx, args): Promise => { const { organizationId } = ctx; if (!organizationId) { throw new Error('organizationId is required to generate a PDF'); diff --git a/services/platform/convex/agent_tools/files/pptx_tool.ts b/services/platform/convex/agent_tools/files/pptx_tool.ts index 03c4519748..a4c48618c4 100644 --- a/services/platform/convex/agent_tools/files/pptx_tool.ts +++ b/services/platform/convex/agent_tools/files/pptx_tool.ts @@ -1,7 +1,8 @@ /** - * Convex Tool: PPTX + * Convex Tool: PPTX (Presentation) * - * PPTX operations for agents: list templates, generate presentations, and parse existing files. + * Generate HTML slide presentations. The LLM produces the full HTML content + * (using reveal.js or any other approach) and this tool stores it as a file. */ import type { ToolCtx } from '@convex-dev/agent'; @@ -9,279 +10,72 @@ import { createTool } from '@convex-dev/agent'; import { z } from 'zod/v4'; import { internal } from '../../_generated/api'; -import type { ListDocumentsByExtensionResult } from '../../documents/types'; import { createDebugLog } from '../../lib/debug_log'; -import { toId } from '../../lib/type_cast_helpers'; import type { ToolDefinition } from '../types'; import { appendFilePart } from './helpers/append_file_part'; -import { getAgentModelId } from './helpers/get_agent_model'; -import { parseFile, type ParseFileResult } from './helpers/parse_file'; const debugLog = createDebugLog('DEBUG_AGENT_TOOLS', '[AgentTools]'); -// Table data schema for generation -const tableDataSchema = z.object({ - headers: z.array(z.string()).describe('Column headers'), - rows: z.array(z.array(z.string())).describe('Table data rows'), -}); - -// Slide content schema for generation -const slideContentSchema = z.object({ - title: z.string().optional().describe('Slide title'), - subtitle: z.string().optional().describe('Slide subtitle'), - textContent: z.array(z.string()).optional().describe('Text paragraphs'), - bulletPoints: z.array(z.string()).optional().describe('Bullet point items'), - tables: z - .array(tableDataSchema) - .optional() - .describe('Tables to add to the slide'), -}); - -// Branding schema -const brandingSchema = z.object({ - slideWidth: z.number().optional().describe('Slide width in inches'), - slideHeight: z.number().optional().describe('Slide height in inches'), - titleFontName: z - .string() - .optional() - .describe('Font name for titles (e.g., "Arial")'), - bodyFontName: z - .string() - .optional() - .describe('Font name for body text (e.g., "Calibri")'), - titleFontSize: z - .number() - .optional() - .describe('Font size for titles in points'), - bodyFontSize: z - .number() - .optional() - .describe('Font size for body text in points'), - primaryColor: z - .string() - .optional() - .describe('Primary color as hex (e.g., "#003366")'), - secondaryColor: z.string().optional().describe('Secondary color as hex'), - accentColor: z.string().optional().describe('Accent color as hex'), -}); +interface GeneratePresentationResult { + operation: 'generate'; + success: boolean; + fileStorageId: string; + downloadUrl: string; + fileName: string; + contentType: string; + extension: string; + size: number; +} const pptxArgs = z.discriminatedUnion('operation', [ - z.object({ - operation: z.literal('list_templates'), - limit: z - .number() - .optional() - .describe('Maximum number of templates to return (default: 50)'), - }), z.object({ operation: z.literal('generate'), - templateStorageId: z - .string() - .optional() - .describe( - 'Convex storage ID of the PPTX template. The template is used as base, preserving all styling, backgrounds, and decorative elements.', - ), fileName: z .string() - .describe('Base name for the PPTX file (without extension)'), - slidesContent: z - .array(slideContentSchema) - .describe('Content for each slide in the presentation'), - branding: brandingSchema - .optional() - .describe('Optional additional branding overrides'), - }), - z.object({ - operation: z.literal('parse'), - fileId: z - .string() - .describe( - "Convex storage ID (e.g., 'kg2bazp7fbgt9srq63knfagjrd7yfenj'). Get this from the file attachment context.", - ), - filename: z - .string() - .optional() - .describe( - "Original filename (e.g., 'presentation.pptx'). Optional — auto-resolved from file metadata if omitted.", - ), - user_input: z + .describe('Base name for the presentation file (without extension)'), + html: z .string() .describe( - "The user's question or instruction about the presentation content", + 'Complete HTML document for the presentation. Must be a self-contained HTML file that can be opened directly in a browser.', ), }), ]); -// Result types -interface ListTemplatesResult { - operation: 'list_templates'; - success: boolean; - templates: Array<{ - fileId: string; - title: string; - createdAt: number; - }>; - totalCount: number; - message: string; -} - -interface GenerateResult { - operation: 'generate'; - success: boolean; - fileStorageId: string; - downloadUrl: string; - fileName: string; - contentType: string; - size: number; - error?: string; -} - -type ParsePptxResult = { operation: 'parse' } & ParseFileResult; - -type PptxResult = ListTemplatesResult | GenerateResult | ParsePptxResult; - export const pptxTool: ToolDefinition = { name: 'pptx', tool: createTool({ - description: `PowerPoint (PPTX) tool for listing templates, generating, and parsing presentations. + description: `Presentation tool for generating HTML slide decks. -IMPORTANT: Only call the "generate" operation when the user explicitly requests creating or exporting a PowerPoint/PPTX file. Do NOT proactively generate presentations unless the user specifically asks for this format. +IMPORTANT: Only call the "generate" operation when the user explicitly requests creating a presentation / slides / PPT. Do NOT proactively generate presentations unless the user specifically asks. -IMPORTANT WORKFLOW FOR GENERATING PPTX: -1. FIRST call list_templates to check if templates are available -2. If no templates found, tell the user to upload a .pptx template to the Knowledge Base (Documents page) — NOT in the chat. Include the link from the list_templates result. -3. Only call generate after you have a valid templateStorageId from list_templates +Do NOT mention templates — this tool does not use templates. Just generate the content directly. -OPERATIONS: +TO READ EXISTING PPTX FILE CONTENT: Do NOT use this tool. Instead use the rag_search tool: +• To get the full content of a PPTX file: use rag_search with operation='retrieve' and the fileId +• To search for specific information across PPTX files: use rag_search with operation='search' -1. list_templates - List all available PPTX templates - ALWAYS call this first before generate! - Returns: { templates, totalCount, message } - -2. generate - Generate a PPTX with your content - REQUIRES templateStorageId from list_templates - do NOT call without it! - Pass slidesContent with your content. Each slide can have: - - title, subtitle, textContent, bulletPoints, tables - The backend automatically selects the best layout based on content. +OPERATIONS: -3. parse - Extract text content from an existing PPTX file - USE THIS when a user uploads a PPTX and you need to read its content. +1. generate - Generate an HTML slide presentation Parameters: - - fileId: **REQUIRED** - Convex storage ID (e.g., "kg2bazp7fbgt9srq63knfagjrd7yfenj") - - filename: Optional — original filename (e.g., "presentation.pptx"). Auto-resolved from file metadata if omitted. - - user_input: **REQUIRED** - The user's question or instruction about the presentation - Returns: { success, full_text, slide_count, metadata } - -EXAMPLES: -• List templates: { "operation": "list_templates" } -• Generate: { "operation": "generate", "templateStorageId": "kg...", "fileName": "Report", "slidesContent": [...] } -• Parse: { "operation": "parse", "fileId": "kg2bazp7...", "filename": "presentation.pptx", "user_input": "Summarize the key slides" } - -SLIDE CONTENT EXAMPLES: -- Title slide: { "title": "Welcome", "subtitle": "Introduction" } -- Content slide: { "title": "Agenda", "bulletPoints": ["Point 1", "Point 2"] } -- With table: { "title": "Data", "tables": [{"headers": ["A", "B"], "rows": [["1", "2"]]}] } + - fileName: Base name for the file (without extension) + - html: A complete, self-contained HTML document for the presentation. + Use reveal.js (loaded from CDN: https://cdn.jsdelivr.net/npm/reveal.js@5) as the slide framework. + You have full control over styling, layout, colors, animations, and themes. + The HTML must work when opened directly in a browser with no server needed. + Returns: { success, fileStorageId, downloadUrl, fileName, contentType, size } AFTER GENERATING: Check the downloadUrl in the result: - If it says "[file card shown in chat]": the file is already visible as a download card. Do NOT mention downloading, do NOT include a link, and do NOT say "you can download it" — the card handles this. - If it contains an actual URL: no download card was shown. You MUST include the URL as a clickable markdown link so the user can download the file. -To also save the file to a folder in the documents hub, call document_write with the returned fileStorageId and the desired folderPath.`, +To also save the file to a folder in the documents hub, call document_write with the returned fileStorageId and the desired folderPath. +`, inputSchema: pptxArgs, - execute: async (ctx: ToolCtx, args): Promise => { + execute: async ( + ctx: ToolCtx, + args, + ): Promise => { const { organizationId } = ctx; - - // Handle list_templates operation - if (args.operation === 'list_templates') { - if (!organizationId) { - return { - operation: 'list_templates', - success: false, - templates: [], - totalCount: 0, - message: - 'No organizationId in context - cannot list templates. This tool requires organizationId to be set.', - }; - } - - debugLog('tool:pptx list_templates start', { - organizationId, - limit: args.limit, - }); - - try { - const documents: ListDocumentsByExtensionResult = await ctx.runQuery( - internal.documents.internal_queries.listDocumentsByExtension, - { - organizationId, - extension: 'pptx', - limit: args.limit, - }, - ); - - const templates = documents - .filter( - (doc): doc is typeof doc & { fileId: string } => !!doc.fileId, - ) - .map((doc) => ({ - fileId: doc.fileId, - title: doc.title ?? 'Untitled Template', - createdAt: doc._creationTime, - })); - - debugLog('tool:pptx list_templates success', { - totalCount: templates.length, - }); - - const siteUrl = process.env.SITE_URL || ''; - const basePath = process.env.BASE_PATH || ''; - const knowledgeUrl = `${siteUrl}${basePath}/dashboard/${organizationId}/documents`; - - return { - operation: 'list_templates', - success: true, - templates, - totalCount: templates.length, - message: - templates.length > 0 - ? `Found ${templates.length} PPTX template(s). Use the fileId as templateStorageId for generate operations.` - : `No PPTX templates found. The user must upload a .pptx template file to the Knowledge Base first — uploading in the chat will NOT work as a template. Direct the user to: ${knowledgeUrl} . Do NOT attempt to call generate without a template.`, - }; - } catch (error) { - console.error('[tool:pptx list_templates] error', { - error: error instanceof Error ? error.message : String(error), - }); - throw error; - } - } - - if (args.operation === 'parse') { - const model = getAgentModelId(ctx); - const result = await parseFile( - ctx, - args.fileId, - args.filename, - 'pptx', - args.user_input, - model, - ); - return { operation: 'parse', ...result }; - } - - // operation === 'generate' - if (!args.templateStorageId) { - return { - operation: 'generate', - success: false, - fileStorageId: '', - downloadUrl: '', - fileName: args.fileName, - contentType: '', - size: 0, - error: - 'templateStorageId is required. Call list_templates first to get available templates. If no templates exist, the user must upload a .pptx template to the Knowledge Base (Documents page) — not in chat.', - }; - } - if (!organizationId) { throw new Error( 'organizationId is required to generate a presentation', @@ -290,20 +84,17 @@ To also save the file to a folder in the documents hub, call document_write with debugLog('tool:pptx generate start', { fileName: args.fileName, - slidesCount: args.slidesContent.length, - hasBranding: !!args.branding, - hasTemplate: !!args.templateStorageId, }); try { const result = await ctx.runAction( - internal.documents.internal_actions.generatePptx, + internal.documents.internal_actions.storeRawContent, { organizationId, fileName: args.fileName, - slidesContent: args.slidesContent, - branding: args.branding, - templateStorageId: toId<'_storage'>(args.templateStorageId), + content: args.html, + contentType: 'text/html', + extension: 'html', }, ); @@ -325,7 +116,7 @@ To also save the file to a folder in the documents hub, call document_write with downloadUrl: cardAppended ? '[file card shown in chat]' : result.downloadUrl, - } as GenerateResult; + } as GeneratePresentationResult; } catch (error) { console.error('[tool:pptx generate] error', { fileName: args.fileName, diff --git a/services/platform/convex/agent_tools/files/text_tool.ts b/services/platform/convex/agent_tools/files/text_tool.ts index ebafcb5041..1753e70ea5 100644 --- a/services/platform/convex/agent_tools/files/text_tool.ts +++ b/services/platform/convex/agent_tools/files/text_tool.ts @@ -1,8 +1,6 @@ /** Convex Tool: Text - * Parse text-based files and analyze content using fast model. * Generate plain text files from content. * Supports all text formats: .txt, .md, .js, .ts, .json, .csv, .log, code files, and more. - * Handles various encodings and large files via chunked processing. * Uses ctx.storage.get() for direct Convex storage access (like image_tool). */ @@ -14,28 +12,10 @@ import { internal } from '../../_generated/api'; import { createDebugLog } from '../../lib/debug_log'; import { buildDownloadUrl } from '../../lib/helpers/public_storage_url'; import type { ToolDefinition } from '../types'; -import { analyzeTextContent } from './helpers/analyze_text'; import { appendFilePart } from './helpers/append_file_part'; -import { getAgentModelId } from './helpers/get_agent_model'; -import { resolveFileName } from './helpers/resolve_file_name'; const debugLog = createDebugLog('DEBUG_AGENT_TOOLS', '[AgentTools]'); -interface TextParseResult { - operation: 'parse'; - success: boolean; - result: string; - filename: string; - char_count: number; - line_count: number; - encoding: string; - chunked: boolean; - chunk_count?: number; - model?: string; - usage?: { inputTokens: number; outputTokens: number; totalTokens: number }; - error?: string; -} - interface TextGenerateResult { operation: 'generate'; success: boolean; @@ -47,26 +27,7 @@ interface TextGenerateResult { error?: string; } -type TextResult = TextParseResult | TextGenerateResult; - const textArgs = z.discriminatedUnion('operation', [ - z.object({ - operation: z.literal('parse'), - fileId: z - .string() - .describe( - "Convex storage ID of the file (e.g., 'kg2bazp7fbgt9srq63knfagjrd7yfenj'). Get this from the file attachment context.", - ), - filename: z - .string() - .optional() - .describe( - "Original filename (e.g., 'data.txt', 'script.js'). Optional — auto-resolved from file metadata if omitted.", - ), - user_input: z - .string() - .describe("The user's question or instruction about the file"), - }), z.object({ operation: z.literal('generate'), filename: z @@ -79,22 +40,13 @@ const textArgs = z.discriminatedUnion('operation', [ export const textTool = { name: 'text' as const, tool: createTool({ - description: `Text file tool for parsing, analyzing, and generating text-based files (.txt, .md, .js, .ts, .json, .csv, .log, and any other text format). + description: `Text file tool for generating text-based files (.txt, .md, .js, .ts, .json, .csv, .log, and any other text format). IMPORTANT: Only call the "generate" operation when the user explicitly requests creating or exporting a text file. Do NOT proactively generate text files unless the user specifically asks for this format. -OPERATIONS: -1. **parse** - Parse and analyze an uploaded text-based file -2. **generate** - Create a new text file from content - -**PARSE OPERATION** -Use when a user uploads any text-based file and asks to analyze its content. -Supports all text formats: plain text (.txt), markdown (.md), source code (.js, .ts, .py, etc.), config files (.json, .yaml, .toml), logs (.log), CSV, and more. -Parameters: -- operation: "parse" -- fileId: **REQUIRED** - Convex storage ID (e.g., "kg2bazp7fbgt9srq63knfagjrd7yfenj") -- filename: Optional — original filename (e.g., "notes.txt", "app.js"). Auto-resolved from file metadata if omitted. -- user_input: The user's question or instruction +TO READ TEXT FILE CONTENT: Do NOT use this tool. Instead use the rag_search tool: +• To get the full content of a text file: use rag_search with operation='retrieve' and the fileId +• To search for specific information across text files: use rag_search with operation='search' **GENERATE OPERATION** Use when a user wants to create/export a text file. @@ -104,11 +56,9 @@ Parameters: - content: The text content to write EXAMPLES: -• Parse: { "operation": "parse", "fileId": "kg2...", "filename": "error.log", "user_input": "Find all errors" } -• Parse: { "operation": "parse", "fileId": "kg2...", "filename": "app.ts", "user_input": "Explain this code" } • Generate: { "operation": "generate", "filename": "report.md", "content": "# Report\\n\\nContent here..." } -Returns: { success, downloadUrl (for generate), result (for parse), char_count, line_count } +Returns: { success, downloadUrl, filename, char_count, line_count } AFTER GENERATING: Check the downloadUrl in the result: - If it says "[file card shown in chat]": the file is already visible as a download card. Do NOT mention downloading, do NOT include a link, and do NOT say "you can download it" — the card handles this. @@ -116,143 +66,72 @@ AFTER GENERATING: Check the downloadUrl in the result: To also save the file to a folder in the documents hub, call document_write with the returned fileStorageId and the desired folderPath. `, inputSchema: textArgs, - execute: async (ctx: ToolCtx, args): Promise => { - if (args.operation === 'generate') { - const { filename, content } = args; - - try { - debugLog('tool:text generate start', { - filename, - contentLength: content.length, - }); - const blob = new Blob([content], { - type: 'text/plain; charset=utf-8', - }); - const fileId = await ctx.storage.store(blob); - - await ctx.runMutation( - internal.file_metadata.internal_mutations.saveFileMetadata, - { - organizationId: ctx.organizationId ?? 'system', - storageId: fileId, - fileName: filename, - contentType: 'text/plain; charset=utf-8', - size: blob.size, - source: 'agent', - }, - ); + execute: async (ctx: ToolCtx, args): Promise => { + const { filename, content } = args; - const url = buildDownloadUrl(fileId, filename); - const lineCount = content.split('\n').length; - - debugLog('tool:text generate success', { - filename, - fileId, - charCount: content.length, - lineCount, - }); + try { + debugLog('tool:text generate start', { + filename, + contentLength: content.length, + }); + const blob = new Blob([content], { + type: 'text/plain; charset=utf-8', + }); + const fileId = await ctx.storage.store(blob); - const cardAppended = await appendFilePart(ctx, { + await ctx.runMutation( + internal.file_metadata.internal_mutations.saveFileMetadata, + { + organizationId: ctx.organizationId ?? 'system', + storageId: fileId, fileName: filename, - mimeType: 'text/plain; charset=utf-8', - downloadUrl: url, - }); - - return { - operation: 'generate', - success: true, - fileStorageId: fileId, - downloadUrl: cardAppended ? '[file card shown in chat]' : url, - filename, - char_count: content.length, - line_count: lineCount, - }; - } catch (error) { - const errorMessage = - error instanceof Error ? error.message : String(error); - console.error('[tool:text generate] error', { - filename, - error: errorMessage, - }); - - return { - operation: 'generate', - success: false, - fileStorageId: '', - downloadUrl: '', - filename, - char_count: 0, - line_count: 0, - error: errorMessage, - }; - } - } + contentType: 'text/plain; charset=utf-8', + size: blob.size, + source: 'agent', + }, + ); - // operation === 'parse' - const { fileId, filename, user_input } = args; - const model = getAgentModelId(ctx); - const resolvedFilename = await resolveFileName(ctx, fileId, filename); + const url = buildDownloadUrl(fileId, filename); + const lineCount = content.split('\n').length; - debugLog('tool:text parse start', { - fileId, - filename: resolvedFilename, - model, - user_input: - user_input.length > 100 - ? user_input.slice(0, 100) + '...' - : user_input, - }); - - try { - const result = await analyzeTextContent(ctx, { + debugLog('tool:text generate success', { + filename, fileId, - filename: resolvedFilename, - userInput: user_input, - model, - // oxlint-disable-next-line typescript/no-non-null-assertion,typescript/no-unsafe-type-assertion -- ctx.agent is guaranteed non-null inside a tool execute callback - languageModel: ctx.agent!.options - .languageModel as import('@ai-sdk/provider').LanguageModelV3, + charCount: content.length, + lineCount, }); - debugLog('tool:text parse success', { - filename: resolvedFilename, - charCount: result.charCount, - lineCount: result.lineCount, - chunked: result.chunked, + const cardAppended = await appendFilePart(ctx, { + fileName: filename, + mimeType: 'text/plain; charset=utf-8', + downloadUrl: url, }); return { - operation: 'parse', - success: result.success, - result: result.result, - filename: resolvedFilename, - char_count: result.charCount, - line_count: result.lineCount, - encoding: result.encoding, - chunked: result.chunked, - chunk_count: result.chunkCount, - model: result.model, - usage: result.usage, - error: result.error, + operation: 'generate', + success: true, + fileStorageId: fileId, + downloadUrl: cardAppended ? '[file card shown in chat]' : url, + filename, + char_count: content.length, + line_count: lineCount, }; } catch (error) { const errorMessage = error instanceof Error ? error.message : String(error); - console.error('[tool:text parse] error', { - fileId, - filename: resolvedFilename, + console.error('[tool:text generate] error', { + filename, error: errorMessage, }); return { - operation: 'parse', + operation: 'generate', success: false, - result: '', - filename: resolvedFilename, + fileStorageId: '', + downloadUrl: '', + filename, char_count: 0, line_count: 0, - encoding: 'unknown', - chunked: false, error: errorMessage, }; } diff --git a/services/platform/convex/agent_tools/rag/helpers/fetch_document_chunks.ts b/services/platform/convex/agent_tools/rag/helpers/fetch_document_chunks.ts new file mode 100644 index 0000000000..2c2900bd92 --- /dev/null +++ b/services/platform/convex/agent_tools/rag/helpers/fetch_document_chunks.ts @@ -0,0 +1,68 @@ +import { fetchJson } from '../../../../lib/utils/type-cast-helpers'; + +const MAX_CHUNK_WINDOW = 200; +/** Stop fetching once accumulated content exceeds this size (~15K tokens). */ +const MAX_TOTAL_CHARS = 60_000; + +interface DocumentContentResponse { + file_id: string; + title: string | null; + content: string; + chunk_range: { start: number; end: number }; + total_chunks: number; + total_chars: number; + chunks: Array<{ index: number; content: string }> | null; +} + +export interface DocumentChunksResult { + documentId: string; + title: string | null; + chunks: Array<{ index: number; content: string }>; + totalChunks: number; +} + +export async function fetchDocumentChunks( + serviceUrl: string, + fileId: string, +): Promise { + const allChunks: Array<{ index: number; content: string }> = []; + let totalChunks = 0; + let documentId = ''; + let title: string | null = null; + let chunkStart = 1; + + while (true) { + const chunkEnd = chunkStart + MAX_CHUNK_WINDOW - 1; + const url = `${serviceUrl}/api/v1/documents/${encodeURIComponent(fileId)}/content?return_chunks=true&chunk_start=${chunkStart}&chunk_end=${chunkEnd}`; + + const response = await fetch(url); + + if (!response.ok) { + const errorText = await response.text().catch(() => ''); + throw new Error( + `RAG get_chunks error (${response.status}): ${errorText || 'Unknown error'}`, + ); + } + + const result = await fetchJson(response); + documentId = result.file_id; + title = result.title; + totalChunks = result.total_chunks; + + if (result.chunks) { + allChunks.push(...result.chunks); + } + + const totalCharsNow = allChunks.reduce((s, c) => s + c.content.length, 0); + if ( + result.chunk_range.end >= totalChunks || + totalCharsNow >= MAX_TOTAL_CHARS + ) { + break; + } + + chunkStart = result.chunk_range.end + 1; + } + + return { documentId, title, chunks: allChunks, totalChunks }; +} diff --git a/services/platform/convex/agent_tools/rag/rag_search_tool.ts b/services/platform/convex/agent_tools/rag/rag_search_tool.ts index 792f5e4320..045108383c 100644 --- a/services/platform/convex/agent_tools/rag/rag_search_tool.ts +++ b/services/platform/convex/agent_tools/rag/rag_search_tool.ts @@ -87,7 +87,7 @@ const ragToolArgs = z.discriminatedUnion('operation', [ .array(z.string()) .optional() .describe( - 'Specific file IDs to search within. When provided, only these files are searched (skips automatic file resolution). Use this when you know exactly which files to search.', + 'Specific file IDs to search within. When provided, only these files are searched (skips automatic file resolution). IMPORTANT: If the user message contains file IDs (from uploaded attachments), pass them here first to prioritize those files. Retry without fileIds for a broader search if no relevant results are found.', ), topK: z .number() @@ -113,26 +113,56 @@ const ragToolArgs = z.discriminatedUnion('operation', [ 'Pagination cursor from previous response. Pass the exact cursor value returned — do not fabricate.', ), }), + z.object({ + operation: z.literal('retrieve'), + fileId: z + .string() + .describe( + 'File ID of the document to retrieve content from (e.g., "kg2bazp7fbgt9srq63knfagjrd7yfenj")', + ), + chunkStart: z + .number() + .int() + .min(1) + .optional() + .describe('Start chunk index (1-based, default 1)'), + chunkEnd: z + .number() + .int() + .min(1) + .optional() + .describe('End chunk index (inclusive, default chunkStart + 9)'), + }), ]); export const ragSearchTool = { name: 'rag_search' as const, tool: createTool({ - description: `Knowledge base tool for searching content and listing indexed documents. + description: `Knowledge base tool for searching, retrieving, and listing indexed documents. OPERATIONS: • 'search': Search the knowledge base for relevant document excerpts using hybrid search (BM25 + vector similarity). Returns numbered excerpts with relevance scores. -• 'list_indexed': List documents that have been indexed in the knowledge base. Returns file names, file IDs, and modification dates. Use this to see what's available before searching. +• 'retrieve': Retrieve document content by file ID in paginated chunks (default 10 chunks per call). Use chunkStart/chunkEnd to paginate. Returns chunk range and totalChunks so you can fetch more. Use this to read uploaded files (PDF, DOCX, PPTX, TXT, XLSX, etc.). +• 'list_indexed': List documents indexed in the Document Hub (does NOT include files uploaded in chat). Returns file names, file IDs, and modification dates. WHEN TO USE 'search': • Knowledge base lookups: policies, procedures, documentation • Questions about stored documents and content • Finding information when you don't know exact field values +SEARCH STRATEGY — file ID priority: +When the user's message contains file IDs (e.g. from uploaded attachments), ALWAYS pass those IDs in the 'fileIds' parameter first to search within those specific files. If that returns no relevant results, retry WITHOUT fileIds to perform a broader knowledge base search. This ensures uploaded/referenced files are prioritized while still falling back to the full knowledge base when needed. + +WHEN TO USE 'retrieve': +• Reading content of a specific uploaded file (paginated, 10 chunks per call by default) +• When a user uploads a file and asks you to read, summarize, or analyze it +• For large documents, retrieve returns the first page — use chunkStart/chunkEnd to read more, or use 'search' with a query for targeted lookup + WHEN TO USE 'list_indexed': -• See which files are available for RAG search -• Get file IDs for use with the search operation's fileIds parameter -• Check when files were last modified +• See which documents are in the Document Hub (org/team knowledge base) +• Get file IDs for use with the search or retrieve operations +• Check when documents were last modified +• NOTE: This only lists Document Hub files. Files uploaded in chat are NOT included — their file IDs are already in the conversation context. WHEN NOT TO USE: • "How many customers?" → Use customer_read with operation='list' @@ -155,6 +185,69 @@ RESPONSE (list_indexed): }); } + if (args.operation === 'retrieve') { + const DEFAULT_PAGE_SIZE = 10; + const start = args.chunkStart ?? 1; + const end = args.chunkEnd ?? start + DEFAULT_PAGE_SIZE - 1; + + debugLog('tool:rag_search retrieve start', { + fileId: args.fileId, + chunkStart: start, + chunkEnd: end, + }); + + const ragServiceUrl = getRagConfig().serviceUrl; + const url = `${ragServiceUrl}/api/v1/documents/${encodeURIComponent(args.fileId)}/content?return_chunks=true&chunk_start=${start}&chunk_end=${end}`; + const response = await fetch(url); + + if (!response.ok) { + const errorText = await response.text().catch(() => ''); + return { + success: false, + response: `Failed to retrieve document: ${response.status} ${errorText}`, + }; + } + + interface RetrieveResponse { + file_id: string; + title: string | null; + total_chunks: number; + total_chars: number; + chunk_range: { start: number; end: number }; + chunks: Array<{ index: number; content: string }> | null; + source_created_at: string | null; + source_modified_at: string | null; + } + const result = await fetchJson(response); + + const text = (result.chunks ?? []) + .sort((a, b) => a.index - b.index) + .map((c) => c.content) + .join('\n'); + + const hasMore = result.chunk_range.end < result.total_chunks; + + debugLog('tool:rag_search retrieve success', { + fileId: args.fileId, + chunkRange: result.chunk_range, + totalChunks: result.total_chunks, + textLength: text.length, + hasMore, + }); + + return { + success: true, + response: text || 'Document has no text content.', + fileId: result.file_id, + filename: result.title, + sourceCreatedAt: result.source_created_at, + sourceModifiedAt: result.source_modified_at, + totalChunks: result.total_chunks, + chunkRange: result.chunk_range, + hasMore, + }; + } + // operation === 'search' debugLog('tool:rag_search start', { query: args.query, diff --git a/services/platform/convex/documents/generate_document_helpers.ts b/services/platform/convex/documents/generate_document_helpers.ts index 7967cae32b..6f286c1fb4 100644 --- a/services/platform/convex/documents/generate_document_helpers.ts +++ b/services/platform/convex/documents/generate_document_helpers.ts @@ -33,7 +33,9 @@ export function getEndpointPath( ? 'pdf' : outputFormat === 'docx' ? 'docx' - : 'images'; + : outputFormat === 'pptx' + ? 'pptx' + : 'images'; return `/api/v1/${formatPath}/from-${sourceType}`; } @@ -152,6 +154,13 @@ export function getOutputInfo( extension: 'docx', }; } + if (outputFormat === 'pptx') { + return { + contentType: + 'application/vnd.openxmlformats-officedocument.presentationml.presentation', + extension: 'pptx', + }; + } const type = imageType ?? 'png'; return { contentType: type === 'png' ? 'image/png' : 'image/jpeg', diff --git a/services/platform/convex/documents/generate_docx_from_template.ts b/services/platform/convex/documents/generate_docx_from_template.ts deleted file mode 100644 index e2f9939025..0000000000 --- a/services/platform/convex/documents/generate_docx_from_template.ts +++ /dev/null @@ -1,156 +0,0 @@ -/** - * Generate a DOCX document from a template via the crawler service. - * - * This is the model-layer helper; Convex actions should call this via a thin - * wrapper in `convex/documents.ts`. - */ - -import { decode as decodeBase64 } from 'base64-arraybuffer'; - -import { fetchJson } from '../../lib/utils/type-cast-helpers'; -import { internal } from '../_generated/api'; -import type { Id } from '../_generated/dataModel'; -import type { ActionCtx } from '../_generated/server'; -import { createDebugLog } from '../lib/debug_log'; -import { buildDownloadUrl, getCrawlerUrl } from './generate_document_helpers'; -import type { DocxContent } from './generate_docx'; - -const debugLog = createDebugLog('DEBUG_DOCUMENTS', '[Documents]'); - -export interface GenerateDocxFromTemplateArgs { - organizationId: string; - fileName: string; - content: DocxContent; - templateStorageId: Id<'_storage'>; -} - -export interface GenerateDocxFromTemplateResult { - success: boolean; - fileStorageId: Id<'_storage'>; - downloadUrl: string; - fileName: string; - contentType: string; - size: number; -} - -/** - * Generate a DOCX from content using a template as the base. - * - * When templateStorageId is provided, uses the template as a base, preserving - * all styling, headers/footers, and document properties. - */ -export async function generateDocxFromTemplate( - ctx: ActionCtx, - args: GenerateDocxFromTemplateArgs, -): Promise { - const crawlerUrl = getCrawlerUrl(); - const apiUrl = `${crawlerUrl}/api/v1/docx/from-template`; - - // Prepare content as JSON string - const contentJson = JSON.stringify(args.content); - - debugLog('documents.generateDocxFromTemplate start', { - fileName: args.fileName, - sectionsCount: args.content.sections.length, - templateStorageId: args.templateStorageId, - }); - - // Create FormData with content - const formData = new FormData(); - formData.append('content', contentJson); - - // Download template and add to form data - const templateUrl = await ctx.storage.getUrl(args.templateStorageId); - if (!templateUrl) { - throw new Error('Template file not found in storage'); - } - - debugLog('documents.generateDocxFromTemplate downloading template', { - templateStorageId: args.templateStorageId, - }); - - const templateResponse = await fetch(templateUrl); - if (!templateResponse.ok) { - throw new Error(`Failed to download template: ${templateResponse.status}`); - } - - const templateBlob = await templateResponse.blob(); - formData.append('template_file', templateBlob, 'template.docx'); - - const response = await fetch(apiUrl, { - method: 'POST', - body: formData, - }); - - if (!response.ok) { - const errorText = await response.text().catch(() => ''); - console.error('[documents.generateDocxFromTemplate] crawler error', { - status: response.status, - errorText, - }); - throw new Error( - `Crawler generateDocxFromTemplate failed: ${response.status}`, - ); - } - - const result = await response.json(); - - if (!result.success || !result.file_base64) { - throw new Error(result.error || 'Failed to generate DOCX from template'); - } - - // Decode base64 and upload to Convex storage - const docxArrayBuffer = decodeBase64(result.file_base64); - const docxBytes = new Uint8Array(docxArrayBuffer); - const contentType = - 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'; - - const uploadUrl = await ctx.storage.generateUploadUrl(); - const uploadResponse = await fetch(uploadUrl, { - method: 'POST', - headers: { 'Content-Type': contentType }, - body: docxBytes, - }); - - if (!uploadResponse.ok) { - throw new Error(`Failed to upload DOCX: ${uploadResponse.status}`); - } - - const { storageId } = await fetchJson<{ storageId: Id<'_storage'> }>( - uploadResponse, - ); - - const finalFileName = args.fileName.toLowerCase().endsWith('.docx') - ? args.fileName - : `${args.fileName}.docx`; - - await ctx.runMutation( - internal.file_metadata.internal_mutations.saveFileMetadata, - { - organizationId: args.organizationId, - storageId, - fileName: finalFileName, - contentType, - size: docxBytes.length, - source: 'agent', - }, - ); - - // Build download URL using our custom HTTP endpoint - const downloadUrl = buildDownloadUrl(storageId, finalFileName); - - debugLog('documents.generateDocxFromTemplate success', { - fileName: finalFileName, - storageId, - size: docxBytes.length, - }); - - return { - success: true, - fileStorageId: storageId, - downloadUrl, - fileName: finalFileName, - contentType, - size: docxBytes.length, - }; -} diff --git a/services/platform/convex/documents/generate_pptx.ts b/services/platform/convex/documents/generate_pptx.ts deleted file mode 100644 index 9895119b68..0000000000 --- a/services/platform/convex/documents/generate_pptx.ts +++ /dev/null @@ -1,203 +0,0 @@ -/** - * Generate a PPTX document via the crawler service and store it in Convex storage. - * - * This is the model-layer helper; Convex actions should call this via a thin - * wrapper in `convex/documents.ts`. - */ - -import { decode as decodeBase64 } from 'base64-arraybuffer'; - -import { fetchJson } from '../../lib/utils/type-cast-helpers'; -import { internal } from '../_generated/api'; -import type { Id } from '../_generated/dataModel'; -import type { ActionCtx } from '../_generated/server'; -import { createDebugLog } from '../lib/debug_log'; -import { buildDownloadUrl, getCrawlerUrl } from './generate_document_helpers'; - -const debugLog = createDebugLog('DEBUG_DOCUMENTS', '[Documents]'); - -/** - * Table data for PPTX generation. - */ -export interface TableData { - headers: string[]; - rows: string[][]; -} - -/** - * Content for a single slide in the PPTX. - * Backend automatically selects the best layout based on content fields. - */ -export interface SlideContentData { - title?: string; - subtitle?: string; - textContent?: string[]; - bulletPoints?: string[]; - tables?: TableData[]; -} - -/** - * Branding/styling information for the PPTX. - */ -export interface PptxBrandingData { - slideWidth?: number; - slideHeight?: number; - titleFontName?: string; - bodyFontName?: string; - titleFontSize?: number; - bodyFontSize?: number; - primaryColor?: string; - secondaryColor?: string; - accentColor?: string; -} - -export interface GeneratePptxArgs { - organizationId: string; - fileName: string; - slidesContent: SlideContentData[]; - branding?: PptxBrandingData; - /** Template storage ID - uses template as base preserving styling */ - templateStorageId: Id<'_storage'>; -} - -export interface GeneratePptxResult { - success: boolean; - fileStorageId: Id<'_storage'>; - downloadUrl: string; - fileName: string; - contentType: string; - size: number; -} - -/** - * Generate a PPTX from content using the crawler service. - * - * When templateStorageId is provided, uses the template as a base, preserving - * all styling, backgrounds, and decorative elements. - * - * When no template is provided, creates a new blank presentation. - */ -export async function generatePptx( - ctx: ActionCtx, - args: GeneratePptxArgs, -): Promise { - const crawlerUrl = getCrawlerUrl(); - const apiUrl = `${crawlerUrl}/api/v1/pptx`; - - // Prepare slide content as JSON string - const slidesContentJson = JSON.stringify(args.slidesContent); - - debugLog('documents.generatePptx start', { - fileName: args.fileName, - slidesCount: args.slidesContent.length, - hasBranding: !!args.branding, - templateStorageId: args.templateStorageId, - }); - - // Create FormData with slides content and optional branding - const formData = new FormData(); - formData.append('slides_content', slidesContentJson); - if (args.branding) { - formData.append('branding', JSON.stringify(args.branding)); - } - - // Download template and add to form data - const templateUrl = await ctx.storage.getUrl(args.templateStorageId); - if (!templateUrl) { - throw new Error('Template file not found in storage'); - } - - debugLog('documents.generatePptx downloading template', { - templateStorageId: args.templateStorageId, - }); - - const templateResponse = await fetch(templateUrl); - if (!templateResponse.ok) { - throw new Error(`Failed to download template: ${templateResponse.status}`); - } - - const templateBlob = await templateResponse.blob(); - formData.append('template_file', templateBlob, 'template.pptx'); - - const response = await fetch(apiUrl, { - method: 'POST', - body: formData, - }); - - if (!response.ok) { - const errorText = await response.text().catch(() => ''); - console.error('[documents.generatePptx] crawler error', { - status: response.status, - errorText, - }); - // Include detailed error in message for AI to see - const errorDetail = errorText ? `: ${errorText}` : ''; - throw new Error( - `PPTX generation failed (HTTP ${response.status})${errorDetail}`, - ); - } - - const result = await response.json(); - - if (!result.success || !result.file_base64) { - // Pass through detailed error from crawler service - const errorMsg = result.error || 'Unknown error during PPTX generation'; - throw new Error(`PPTX generation failed: ${errorMsg}`); - } - - // Decode base64 and upload to Convex storage - const pptxArrayBuffer = decodeBase64(result.file_base64); - const pptxBytes = new Uint8Array(pptxArrayBuffer); - const contentType = - 'application/vnd.openxmlformats-officedocument.presentationml.presentation'; - - const uploadUrl = await ctx.storage.generateUploadUrl(); - const uploadResponse = await fetch(uploadUrl, { - method: 'POST', - headers: { 'Content-Type': contentType }, - body: pptxBytes, - }); - - if (!uploadResponse.ok) { - throw new Error(`Failed to upload PPTX: ${uploadResponse.status}`); - } - - const { storageId } = await fetchJson<{ storageId: Id<'_storage'> }>( - uploadResponse, - ); - - const finalFileName = args.fileName.toLowerCase().endsWith('.pptx') - ? args.fileName - : `${args.fileName}.pptx`; - - await ctx.runMutation( - internal.file_metadata.internal_mutations.saveFileMetadata, - { - organizationId: args.organizationId, - storageId, - fileName: finalFileName, - contentType, - size: pptxBytes.length, - source: 'agent', - }, - ); - - // Build download URL using our custom HTTP endpoint that sets Content-Disposition - // This ensures the downloaded file has the correct filename instead of the storage ID - const downloadUrl = buildDownloadUrl(storageId, finalFileName); - - debugLog('documents.generatePptx success', { - fileName: finalFileName, - storageId, - size: pptxBytes.length, - }); - - return { - success: true, - fileStorageId: storageId, - downloadUrl, - fileName: finalFileName, - contentType, - size: pptxBytes.length, - }; -} diff --git a/services/platform/convex/documents/helpers.ts b/services/platform/convex/documents/helpers.ts index 7614754725..09938cf69f 100644 --- a/services/platform/convex/documents/helpers.ts +++ b/services/platform/convex/documents/helpers.ts @@ -30,11 +30,8 @@ export * from './get_onedrive_sync_configs'; export * from './upload_base64_to_storage'; export * from './read_file_base64_from_storage'; export * from './generate_document'; -export * from './generate_pptx'; export * from './generate_docx'; -export * from './generate_docx_from_template'; export * from './extract_extension'; -export * from './list_documents_by_extension'; export * from './find_document_by_title'; export * from './find_document_by_external_id'; export * from './find_document_by_file_id'; diff --git a/services/platform/convex/documents/internal_actions.ts b/services/platform/convex/documents/internal_actions.ts index c459420465..6399f84858 100644 --- a/services/platform/convex/documents/internal_actions.ts +++ b/services/platform/convex/documents/internal_actions.ts @@ -3,6 +3,7 @@ import { v } from 'convex/values'; import { extractExtension } from '../../lib/shared/file-types'; +import { fetchJson } from '../../lib/utils/type-cast-helpers'; import { isRecord, getBoolean, @@ -10,13 +11,13 @@ import { getString, } from '../../lib/utils/type-guards'; import { internal } from '../_generated/api'; +import type { Id } from '../_generated/dataModel'; import { internalAction } from '../_generated/server'; +import { buildDownloadUrl } from '../lib/helpers/public_storage_url'; import { getRagConfig } from '../lib/helpers/rag_config'; import { ragAction } from '../workflow_engine/action_defs/rag/rag_action'; import { getCrawlerUrl } from './generate_document_helpers'; import type { GenerateDocxResult } from './generate_docx'; -import type { GenerateDocxFromTemplateResult } from './generate_docx_from_template'; -import type { GeneratePptxResult } from './generate_pptx'; import * as DocumentsHelpers from './helpers'; import type { GenerateDocumentResult } from './types'; @@ -39,6 +40,7 @@ const documentOutputFormatValidator = v.union( v.literal('pdf'), v.literal('image'), v.literal('docx'), + v.literal('pptx'), ); const pdfOptionsValidator = v.optional( @@ -77,33 +79,6 @@ const urlOptionsValidator = v.optional( }), ); -const tableDataValidator = v.object({ - headers: v.array(v.string()), - rows: v.array(v.array(v.string())), -}); - -const slideContentValidator = v.object({ - title: v.optional(v.string()), - subtitle: v.optional(v.string()), - textContent: v.optional(v.array(v.string())), - bulletPoints: v.optional(v.array(v.string())), - tables: v.optional(v.array(tableDataValidator)), -}); - -const pptxBrandingValidator = v.optional( - v.object({ - slideWidth: v.optional(v.number()), - slideHeight: v.optional(v.number()), - titleFontName: v.optional(v.string()), - bodyFontName: v.optional(v.string()), - titleFontSize: v.optional(v.number()), - bodyFontSize: v.optional(v.number()), - primaryColor: v.optional(v.string()), - secondaryColor: v.optional(v.string()), - accentColor: v.optional(v.string()), - }), -); - const docxSectionValidator = v.object({ type: v.union( v.literal('heading'), @@ -145,19 +120,6 @@ export const generateDocument = internalAction({ }, }); -export const generatePptx = internalAction({ - args: { - organizationId: v.string(), - fileName: v.string(), - slidesContent: v.array(slideContentValidator), - branding: pptxBrandingValidator, - templateStorageId: v.id('_storage'), - }, - handler: async (ctx, args): Promise => { - return await DocumentsHelpers.generatePptx(ctx, args); - }, -}); - export const generateDocx = internalAction({ args: { organizationId: v.string(), @@ -169,18 +131,6 @@ export const generateDocx = internalAction({ }, }); -export const generateDocxFromTemplate = internalAction({ - args: { - organizationId: v.string(), - fileName: v.string(), - content: docxContentValidator, - templateStorageId: v.id('_storage'), - }, - handler: async (ctx, args): Promise => { - return await DocumentsHelpers.generateDocxFromTemplate(ctx, args); - }, -}); - /** * Progressive intervals to cover ~24 hours with 50 attempts: * - Attempts 1-30: 2 minutes each (~60 minutes total) @@ -684,6 +634,71 @@ export const reindexDocumentInRag = internalAction({ }, }); +/** + * Store raw string content (e.g. HTML) directly as a file in Convex storage. + * Used by tools that generate content locally without the crawler service. + */ +export const storeRawContent = internalAction({ + args: { + organizationId: v.string(), + fileName: v.string(), + content: v.string(), + contentType: v.string(), + extension: v.string(), + }, + handler: async (ctx, args): Promise => { + const bytes = new TextEncoder().encode(args.content); + const size = bytes.byteLength; + + const uploadUrl = await ctx.storage.generateUploadUrl(); + const uploadResponse = await fetch(uploadUrl, { + method: 'POST', + headers: { 'Content-Type': args.contentType }, + body: bytes, + }); + + if (!uploadResponse.ok) { + throw new Error( + `Failed to upload content: ${uploadResponse.status} ${uploadResponse.statusText}`, + ); + } + + const { storageId } = await fetchJson<{ storageId: Id<'_storage'> }>( + uploadResponse, + ); + + const lowerFileName = args.fileName.toLowerCase(); + const expectedSuffix = `.${args.extension.toLowerCase()}`; + const finalFileName = lowerFileName.endsWith(expectedSuffix) + ? args.fileName + : `${args.fileName}.${args.extension}`; + + await ctx.runMutation( + internal.file_metadata.internal_mutations.saveFileMetadata, + { + organizationId: args.organizationId, + storageId, + fileName: finalFileName, + contentType: args.contentType, + size, + source: 'agent', + }, + ); + + const downloadUrl = buildDownloadUrl(storageId, finalFileName); + + return { + success: true, + fileStorageId: storageId, + downloadUrl, + fileName: finalFileName, + contentType: args.contentType, + size, + extension: args.extension, + }; + }, +}); + const EXTRACT_DATES_SUPPORTED_EXTENSIONS = new Set(['pdf', 'docx', 'pptx']); const EXTRACT_DATES_RETRY_DELAYS = [30_000, 60_000, 120_000]; diff --git a/services/platform/convex/documents/internal_queries.ts b/services/platform/convex/documents/internal_queries.ts index 5343832804..61b6541d80 100644 --- a/services/platform/convex/documents/internal_queries.ts +++ b/services/platform/convex/documents/internal_queries.ts @@ -19,17 +19,6 @@ export const getDocumentByIdRaw = internalQuery({ }, }); -export const listDocumentsByExtension = internalQuery({ - args: { - organizationId: v.string(), - extension: v.string(), - limit: v.optional(v.number()), - }, - handler: async (ctx, args) => { - return await DocumentsHelpers.listDocumentsByExtension(ctx, args); - }, -}); - export const queryDocuments = internalQuery({ args: { organizationId: v.string(), diff --git a/services/platform/convex/documents/list_documents_by_extension.ts b/services/platform/convex/documents/list_documents_by_extension.ts deleted file mode 100644 index 8baef23f65..0000000000 --- a/services/platform/convex/documents/list_documents_by_extension.ts +++ /dev/null @@ -1,39 +0,0 @@ -/** - * List documents by file extension - * - * Uses the by_organizationId_and_extension index to efficiently query - * documents of a specific type (e.g., 'pptx', 'pdf', 'docx'). - */ - -import type { QueryCtx } from '../_generated/server'; -import type { - ListDocumentsByExtensionArgs, - ListDocumentsByExtensionResult, -} from './types'; - -export async function listDocumentsByExtension( - ctx: QueryCtx, - args: ListDocumentsByExtensionArgs, -): Promise { - const limit = args.limit ?? 50; - - const documents = await ctx.db - .query('documents') - .withIndex('by_organizationId_and_extension', (q) => - q - .eq('organizationId', args.organizationId) - .eq('extension', args.extension), - ) - .order('desc') - .take(limit); - - return documents.map((doc) => ({ - _id: doc._id, - _creationTime: doc._creationTime, - title: doc.title, - fileId: doc.fileId, - mimeType: doc.mimeType, - extension: doc.extension, - metadata: doc.metadata, - })); -} diff --git a/services/platform/convex/documents/types.ts b/services/platform/convex/documents/types.ts index 6f1914527f..265dc9ec69 100644 --- a/services/platform/convex/documents/types.ts +++ b/services/platform/convex/documents/types.ts @@ -121,7 +121,7 @@ export type ListDocumentsByExtensionResult = Array<{ export type DocumentSourceType = 'markdown' | 'html' | 'url'; -export type DocumentOutputFormat = 'pdf' | 'image' | 'docx'; +export type DocumentOutputFormat = 'pdf' | 'image' | 'docx' | 'pptx'; export interface GenerateDocumentPdfOptions { format?: string; // A4, Letter, Legal, etc. diff --git a/services/platform/convex/documents/validators.ts b/services/platform/convex/documents/validators.ts index 9f128c1388..4e00158b41 100644 --- a/services/platform/convex/documents/validators.ts +++ b/services/platform/convex/documents/validators.ts @@ -93,15 +93,6 @@ export const generateDocumentResponseValidator = v.object({ size: v.number(), }); -export const generatePptxResponseValidator = v.object({ - success: v.boolean(), - fileStorageId: v.string(), - downloadUrl: v.string(), - fileName: v.string(), - contentType: v.string(), - size: v.number(), -}); - export const generateDocxResponseValidator = v.object({ success: v.boolean(), fileStorageId: v.string(), diff --git a/services/platform/convex/file_metadata/__tests__/internal_mutations.test.ts b/services/platform/convex/file_metadata/__tests__/internal_mutations.test.ts index 2ea106ae08..d08749af4a 100644 --- a/services/platform/convex/file_metadata/__tests__/internal_mutations.test.ts +++ b/services/platform/convex/file_metadata/__tests__/internal_mutations.test.ts @@ -34,6 +34,7 @@ vi.mock('../../lib/rate_limiter/helpers', () => ({ vi.mock('../../_generated/api', () => ({ internal: { governance: { retention_cleanup: { runRetentionCleanup: 'mock' } }, + file_metadata: { internal_actions: { uploadFileToRag: 'mock' } }, }, })); @@ -93,6 +94,7 @@ describe('saveFileMetadata (internal)', () => { fileName: 'test.pdf', contentType: 'application/pdf', size: 1024, + ragStatus: 'queued', }); expect(ctx.db.patch).not.toHaveBeenCalled(); }); @@ -150,6 +152,7 @@ describe('saveFileMetadata (internal)', () => { contentType: 'application/pdf', size: 1024, documentId: 'doc_1', + ragStatus: 'queued', }); }); diff --git a/services/platform/convex/file_metadata/__tests__/mutations.test.ts b/services/platform/convex/file_metadata/__tests__/mutations.test.ts index a0f117ea13..8674c6c969 100644 --- a/services/platform/convex/file_metadata/__tests__/mutations.test.ts +++ b/services/platform/convex/file_metadata/__tests__/mutations.test.ts @@ -34,6 +34,7 @@ vi.mock('../../lib/rate_limiter/helpers', () => ({ vi.mock('../../_generated/api', () => ({ internal: { governance: { retention_cleanup: { runRetentionCleanup: 'mock' } }, + file_metadata: { internal_actions: { uploadFileToRag: 'mock' } }, }, })); @@ -110,6 +111,7 @@ describe('saveFileMetadata (public)', () => { fileName: 'test.pdf', contentType: 'application/pdf', size: 1024, + ragStatus: 'queued', }); expect(ctx.db.patch).not.toHaveBeenCalled(); }); @@ -170,6 +172,7 @@ describe('saveFileMetadata (public)', () => { contentType: 'application/pdf', size: 1024, documentId: 'doc_1', + ragStatus: 'queued', }); }); diff --git a/services/platform/convex/file_metadata/actions.ts b/services/platform/convex/file_metadata/actions.ts new file mode 100644 index 0000000000..a12c17f662 --- /dev/null +++ b/services/platform/convex/file_metadata/actions.ts @@ -0,0 +1,94 @@ +'use node'; + +import { v } from 'convex/values'; + +import { isRecord, getString } from '../../lib/utils/type-guards'; +import { internal } from '../_generated/api'; +import { action } from '../_generated/server'; +import { getRagConfig } from '../lib/helpers/rag_config'; + +/** + * Check RAG indexing status for a list of files and update fileMetadata. + * + * Called by the frontend on an interval while files are in queued/running + * state. Stops being called when the user leaves the page — no wasted + * server-side scheduled actions. + */ +export const checkFileRagStatuses = action({ + args: { + storageIds: v.array(v.id('_storage')), + }, + returns: v.null(), + handler: async (ctx, args): Promise => { + if (args.storageIds.length === 0) return null; + + const ragUrl = getRagConfig().serviceUrl; + if (!ragUrl) return null; + + const url = `${ragUrl}/api/v1/documents/statuses`; + + let body: unknown; + try { + const response = await fetch(url, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ file_ids: args.storageIds }), + signal: AbortSignal.timeout(10_000), + }); + + if (!response.ok) { + console.warn(`[checkFileRagStatuses] RAG returned ${response.status}`); + return null; + } + + body = await response.json(); + } catch (error) { + console.warn('[checkFileRagStatuses] Failed to fetch statuses:', error); + return null; + } + + if (!isRecord(body) || !isRecord(body.statuses)) { + return null; + } + + const statuses = body.statuses; + + for (const storageId of args.storageIds) { + const docStatus = statuses[storageId]; + if (!isRecord(docStatus)) continue; + + const status = getString(docStatus, 'status'); + const error = getString(docStatus, 'error'); + const progressPhase = getString(docStatus, 'progress_phase'); + const progressDetail = getString(docStatus, 'progress_detail'); + + const ragProgress = + progressPhase && progressDetail + ? `${progressPhase} ${progressDetail}` + : progressPhase || undefined; + + if (status === 'completed') { + await ctx.runMutation( + internal.file_metadata.internal_mutations.updateFileRagStatus, + { storageId, ragStatus: 'completed' }, + ); + } else if (status === 'failed') { + await ctx.runMutation( + internal.file_metadata.internal_mutations.updateFileRagStatus, + { + storageId, + ragStatus: 'failed', + ragError: error || 'Unknown error', + }, + ); + } else if (status === 'processing') { + await ctx.runMutation( + internal.file_metadata.internal_mutations.updateFileRagStatus, + { storageId, ragStatus: 'running', ragProgress }, + ); + } + } + + return null; + }, +}); diff --git a/services/platform/convex/file_metadata/internal_actions.ts b/services/platform/convex/file_metadata/internal_actions.ts new file mode 100644 index 0000000000..38366ef723 --- /dev/null +++ b/services/platform/convex/file_metadata/internal_actions.ts @@ -0,0 +1,56 @@ +'use node'; + +import { v } from 'convex/values'; + +import { internal } from '../_generated/api'; +import { internalAction } from '../_generated/server'; +import { getRagConfig } from '../lib/helpers/rag_config'; +import { ragAction } from '../workflow_engine/action_defs/rag/rag_action'; + +/** + * Upload a file to the RAG service for indexing. + * + * Triggered by saveFileMetadata on new inserts. Only uploads — status + * polling is driven by the client via checkFileRagStatus. + */ +export const uploadFileToRag = internalAction({ + args: { + storageId: v.id('_storage'), + fileName: v.string(), + contentType: v.string(), + }, + returns: v.null(), + handler: async (ctx, args): Promise => { + const ragConfig = getRagConfig(); + if (!ragConfig.serviceUrl) { + return null; + } + + try { + await ragAction.execute( + ctx, + { + operation: 'upload_document', + fileId: args.storageId, + fileName: args.fileName, + contentType: args.contentType, + }, + {}, + ); + } catch (error) { + console.error( + `[uploadFileToRag] Failed to upload file ${args.storageId}: ${error instanceof Error ? error.message : String(error)}`, + ); + await ctx.runMutation( + internal.file_metadata.internal_mutations.updateFileRagStatus, + { + storageId: args.storageId, + ragStatus: 'failed', + ragError: error instanceof Error ? error.message : String(error), + }, + ); + } + + return null; + }, +}); diff --git a/services/platform/convex/file_metadata/internal_mutations.ts b/services/platform/convex/file_metadata/internal_mutations.ts index 96ad1bc5b2..0ca467e494 100644 --- a/services/platform/convex/file_metadata/internal_mutations.ts +++ b/services/platform/convex/file_metadata/internal_mutations.ts @@ -45,10 +45,21 @@ export const saveFileMetadata = internalMutation({ fileName: args.fileName, contentType: args.contentType, size: args.size, + ragStatus: 'queued', ...(args.documentId !== undefined && { documentId: args.documentId }), ...(args.source !== undefined && { source: args.source }), }); + await ctx.scheduler.runAfter( + 0, + internal.file_metadata.internal_actions.uploadFileToRag, + { + storageId: args.storageId, + fileName: args.fileName, + contentType: args.contentType, + }, + ); + try { await checkOrganizationRateLimit( ctx, @@ -70,6 +81,36 @@ export const saveFileMetadata = internalMutation({ }, }); +export const updateFileRagStatus = internalMutation({ + args: { + storageId: v.id('_storage'), + ragStatus: v.union( + v.literal('queued'), + v.literal('running'), + v.literal('completed'), + v.literal('failed'), + ), + ragError: v.optional(v.string()), + ragProgress: v.optional(v.string()), + }, + async handler(ctx, args) { + const metadata = await ctx.db + .query('fileMetadata') + .withIndex('by_storageId', (q) => q.eq('storageId', args.storageId)) + .first(); + if (!metadata) return; + + await ctx.db.patch(metadata._id, { + ragStatus: args.ragStatus, + ragError: args.ragStatus === 'failed' ? args.ragError : undefined, + ragProgress: + args.ragStatus === 'completed' || args.ragStatus === 'failed' + ? undefined + : args.ragProgress, + }); + }, +}); + export const linkDocumentToFile = internalMutation({ args: { storageId: v.id('_storage'), diff --git a/services/platform/convex/file_metadata/mutations.ts b/services/platform/convex/file_metadata/mutations.ts index 0e7e4a2189..bdf3b9323e 100644 --- a/services/platform/convex/file_metadata/mutations.ts +++ b/services/platform/convex/file_metadata/mutations.ts @@ -51,10 +51,21 @@ export const saveFileMetadata = mutation({ fileName: args.fileName, contentType: args.contentType, size: args.size, + ragStatus: 'queued', ...(args.documentId !== undefined && { documentId: args.documentId }), ...(args.source !== undefined && { source: args.source }), }); + await ctx.scheduler.runAfter( + 0, + internal.file_metadata.internal_actions.uploadFileToRag, + { + storageId: args.storageId, + fileName: args.fileName, + contentType: args.contentType, + }, + ); + try { await checkOrganizationRateLimit( ctx, diff --git a/services/platform/convex/file_metadata/queries.ts b/services/platform/convex/file_metadata/queries.ts index 9d33ce900c..ac02971d6e 100644 --- a/services/platform/convex/file_metadata/queries.ts +++ b/services/platform/convex/file_metadata/queries.ts @@ -14,6 +14,16 @@ export const getByStorageIds = query({ fileName: v.string(), contentType: v.string(), size: v.number(), + ragStatus: v.optional( + v.union( + v.literal('queued'), + v.literal('running'), + v.literal('completed'), + v.literal('failed'), + ), + ), + ragError: v.optional(v.string()), + ragProgress: v.optional(v.string()), }), ), handler: async (ctx, args) => { @@ -33,6 +43,9 @@ export const getByStorageIds = query({ fileName: meta.fileName, contentType: meta.contentType, size: meta.size, + ragStatus: meta.ragStatus, + ragError: meta.ragError, + ragProgress: meta.ragProgress, }; }), ); diff --git a/services/platform/convex/file_metadata/schema.ts b/services/platform/convex/file_metadata/schema.ts index 5ffc1c87ac..845e85e71a 100644 --- a/services/platform/convex/file_metadata/schema.ts +++ b/services/platform/convex/file_metadata/schema.ts @@ -9,6 +9,16 @@ export const fileMetadataTable = defineTable({ fileName: v.string(), contentType: v.string(), size: v.number(), + ragStatus: v.optional( + v.union( + v.literal('queued'), + v.literal('running'), + v.literal('completed'), + v.literal('failed'), + ), + ), + ragError: v.optional(v.string()), + ragProgress: v.optional(v.string()), }) .index('by_organizationId', ['organizationId']) .index('by_storageId', ['storageId']) diff --git a/services/platform/convex/lib/action_cache/index.ts b/services/platform/convex/lib/action_cache/index.ts index df165ae053..9191ad3f18 100644 --- a/services/platform/convex/lib/action_cache/index.ts +++ b/services/platform/convex/lib/action_cache/index.ts @@ -30,18 +30,6 @@ export const TTL = { // File Processing Caches // ============================================ -/** - * Cache for file parsing results. - * File content is immutable per storage ID. - */ -export const parseFileCache: ActionCache< - FunctionReference<'action', 'internal'> -> = new ActionCache(components.actionCache, { - action: internal.agent_tools.files.internal_actions.parseFileUncached, - name: `parse_file_${CACHE_VERSION}`, - ttl: TTL.INDEFINITE, -}); - /** * Cache for image analysis results. * Same image + question produces same analysis. diff --git a/services/platform/convex/lib/attachments/process_attachments.ts b/services/platform/convex/lib/attachments/process_attachments.ts index 7239fd7f29..aa466aea60 100644 --- a/services/platform/convex/lib/attachments/process_attachments.ts +++ b/services/platform/convex/lib/attachments/process_attachments.ts @@ -5,21 +5,11 @@ * including document parsing and image metadata extraction. */ -import type { LanguageModelV3 } from '@ai-sdk/provider'; - -import { - isImage, - isSpreadsheet, - isTextFile, -} from '../../../lib/shared/file-types'; +import { isImage, isSpreadsheet } from '../../../lib/shared/file-types'; import { internal } from '../../_generated/api'; -import type { Id } from '../../_generated/dataModel'; import type { ActionCtx } from '../../_generated/server'; import { analyzeImageCached } from '../../agent_tools/files/helpers/analyze_image'; -import { analyzeTextContent } from '../../agent_tools/files/helpers/analyze_text'; -import { parseFile } from '../../agent_tools/files/helpers/parse_file'; import { toId } from '../../lib/type_cast_helpers'; -import { resolveLanguageModel } from '../../providers/resolve_model'; import { registerFilesWithAgent } from './register_files'; import type { FileAttachment, MessageContentPart } from './types'; @@ -37,16 +27,16 @@ export interface ParsedDocument { */ export interface ImageInfo { fileName: string; - fileId: Id<'_storage'>; + fileId: string; url: string | undefined; } /** - * Text file info for the txt tool's parse operation + * Text file info */ export interface TextFileInfo { fileName: string; - fileId: Id<'_storage'>; + fileId: string; fileSize: number; } @@ -70,19 +60,17 @@ export interface ProcessAttachmentsConfig { debugLog?: (message: string, data?: Record) => void; toolName?: string; model?: string; - languageModel?: LanguageModelV3; } -const DEFAULT_MAX_DOCUMENT_LENGTH = 50000; - /** * Process file attachments for an AI agent. * * This function: - * 1. Separates images from documents - * 2. Parses documents to extract text content - * 3. Prepares image metadata for the image tool - * 4. Builds multi-modal prompt content + * 1. Separates images from other files + * 2. Analyzes images with vision model + * 3. Parses spreadsheets for structured data + * 4. Lists documents and text files for the agent to retrieve via rag_search + * 5. Builds multi-modal prompt content * * @param ctx - Action context for storage access * @param attachments - Array of file attachments to process @@ -95,9 +83,7 @@ export async function processAttachments( userText: string | undefined, config: ProcessAttachmentsConfig & { model: string }, ): Promise { - const maxDocLength = config?.maxDocumentLength ?? DEFAULT_MAX_DOCUMENT_LENGTH; const debugLog = config?.debugLog ?? (() => {}); - const toolName = config?.toolName ?? 'agent'; if (!attachments || attachments.length === 0) { return { @@ -113,67 +99,15 @@ export async function processAttachments( files: attachments.map((a) => ({ name: a.fileName, type: a.fileType })), }); - // Separate images, text files, spreadsheets, and other documents + // Separate images, spreadsheets, and other files (documents + text) const imageAttachments = attachments.filter((a) => isImage(a.fileType)); const spreadsheetAttachments = attachments.filter( (a) => !isImage(a.fileType) && isSpreadsheet(a.fileName), ); - const textFileAttachments = attachments.filter( - (a) => - !isImage(a.fileType) && - !isSpreadsheet(a.fileName) && - isTextFile(a.fileType, a.fileName), - ); - const documentAttachments = attachments.filter( - (a) => - !isImage(a.fileType) && - !isSpreadsheet(a.fileName) && - !isTextFile(a.fileType, a.fileName), + const fileAttachments = attachments.filter( + (a) => !isImage(a.fileType) && !isSpreadsheet(a.fileName), ); - // Parse document files to extract their text content (in parallel) - const parseResults = await Promise.all( - documentAttachments.map(async (attachment) => { - try { - const parseResult = await parseFile( - ctx, - attachment.fileId, - attachment.fileName, - toolName, - userText, - ); - return { attachment, parseResult }; - } catch (error) { - debugLog('Error parsing document', { - fileName: attachment.fileName, - error: String(error), - }); - return null; - } - }), - ); - - const parsedDocuments: ParsedDocument[] = []; - - for (const result of parseResults) { - if (result?.parseResult.success && result.parseResult.full_text) { - parsedDocuments.push({ - fileId: result.attachment.fileId, - fileName: result.attachment.fileName, - content: result.parseResult.full_text, - }); - debugLog('Parsed document', { - fileName: result.attachment.fileName, - textLength: result.parseResult.full_text.length, - }); - } else if (result) { - debugLog('Failed to parse document', { - fileName: result.attachment.fileName, - error: result.parseResult.error, - }); - } - } - // Parse spreadsheet files using the xlsx library (in parallel) const spreadsheetResults = await Promise.all( spreadsheetAttachments.map(async (attachment) => { @@ -238,66 +172,9 @@ export async function processAttachments( (r): r is { fileName: string; analysis: string } => r !== null, ); - // Resolve language model for text analysis if not provided - let resolvedLanguageModelV3 = config.languageModel; - if (!resolvedLanguageModelV3 && textFileAttachments.length > 0) { - const resolved = await resolveLanguageModel(ctx, { tag: 'chat' }); - resolvedLanguageModelV3 = resolved.languageModel; - } - - // Analyze text files with LLM (in parallel) - const textAnalysisResults = await Promise.all( - textFileAttachments.map(async (attachment) => { - try { - const result = await analyzeTextContent(ctx, { - fileId: attachment.fileId, - filename: attachment.fileName, - userInput: userText || 'Analyze this file', - model: config.model, - // resolvedLanguageModelV3 is guaranteed set: either from config or resolved above - // oxlint-disable-next-line typescript/no-non-null-assertion -- guard above ensures non-null - languageModel: resolvedLanguageModelV3!, - }); - - if (result.success) { - return { - fileName: attachment.fileName, - analysis: result.result, - charCount: result.charCount, - lineCount: result.lineCount, - }; - } else { - debugLog('Text file analysis failed', { - fileName: attachment.fileName, - error: result.error, - }); - return null; - } - } catch (error) { - debugLog('Error analyzing text file', { - fileName: attachment.fileName, - error: String(error), - }); - return null; - } - }), - ); - - const analyzedTextFiles = textAnalysisResults.filter( - ( - r, - ): r is { - fileName: string; - analysis: string; - charCount: number; - lineCount: number; - } => r !== null, - ); - - // Register files with the agent component for tracking (documents + spreadsheets) - // Images and text files are handled via their respective tools, not inline + // Register files with the agent component for tracking await registerFilesWithAgent(ctx, [ - ...documentAttachments, + ...fileAttachments, ...spreadsheetAttachments, ]); @@ -306,10 +183,7 @@ export async function processAttachments( const contentParts: MessageContentPart[] = [{ type: 'text', text }]; const hasAnalyzedContent = - parsedDocuments.length > 0 || - parsedSpreadsheets.length > 0 || - analyzedImages.length > 0 || - analyzedTextFiles.length > 0; + parsedSpreadsheets.length > 0 || analyzedImages.length > 0; if (hasAnalyzedContent) { contentParts.push({ @@ -317,19 +191,6 @@ export async function processAttachments( text: '\n\n[PRE-ANALYZED CONTENT BELOW - This is the attachment from the CURRENT message. It takes priority over any previous context. Answer directly from this content without delegating to document tools.]', }); - for (const doc of parsedDocuments) { - const truncatedContent = - doc.content.length > maxDocLength - ? doc.content.slice(0, maxDocLength) + - '\n\n[... Document truncated due to length ...]' - : doc.content; - - contentParts.push({ - type: 'text', - text: `\n\n---\n**Document: ${doc.fileName}** (fileId: ${doc.fileId})\n\n${truncatedContent}\n---\n`, - }); - } - for (const { attachment, result } of parsedSpreadsheets) { const sheetTexts = result.sheets.map((sheet) => { const headerRow = sheet.headers.join(' | '); @@ -356,41 +217,27 @@ export async function processAttachments( text: `\n\n---\n**Image: ${img.fileName}**\n\n${img.analysis}\n---\n`, }); } - - for (const txt of analyzedTextFiles) { - contentParts.push({ - type: 'text', - text: `\n\n---\n**Text File: ${txt.fileName}** (${txt.charCount} chars, ${txt.lineCount} lines)\n\n${txt.analysis}\n---\n`, - }); - } } - // Collect attachments that failed pre-analysis — include their references - // so the agent can use its tools (docx, pdf, image, etc.) to process them - const failedDocuments = documentAttachments.filter( - (a) => !parsedDocuments.some((d) => d.fileName === a.fileName), - ); + // List documents, text files, and failed attachments for the agent to process + // via rag_search tool (retrieve operation) const failedImages = imageAttachments.filter( (a) => !analyzedImages.some((d) => d.fileName === a.fileName), ); - const failedTextFiles = textFileAttachments.filter( - (a) => !analyzedTextFiles.some((d) => d.fileName === a.fileName), - ); const failedSpreadsheets = spreadsheetAttachments.filter( (a) => !parsedSpreadsheets.some((d) => d.attachment.fileName === a.fileName), ); const unprocessedAttachments = [ - ...failedDocuments, + ...fileAttachments, ...failedImages, - ...failedTextFiles, ...failedSpreadsheets, ]; if (unprocessedAttachments.length > 0) { contentParts.push({ type: 'text', - text: '\n\n[ATTACHED FILES - Pre-analysis was not available. Use your tools to process these files.]', + text: '\n\n[ATTACHED FILES - Use rag_search tool with operation="retrieve" and the fileId to read these files.]', }); for (const attachment of unprocessedAttachments) { @@ -409,7 +256,7 @@ export async function processAttachments( : undefined; return { - parsedDocuments, + parsedDocuments: [], imageInfoList: [], textFileInfoList: [], promptContent, diff --git a/services/platform/convex/message_metadata/queries.ts b/services/platform/convex/message_metadata/queries.ts index dd2a87df3b..f690876100 100644 --- a/services/platform/convex/message_metadata/queries.ts +++ b/services/platform/convex/message_metadata/queries.ts @@ -6,12 +6,28 @@ import { messageMetadataValidator } from '../streaming/validators'; export const getMessageMetadata = query({ args: { messageId: v.string(), + threadId: v.optional(v.string()), }, returns: v.union(messageMetadataValidator, v.null()), handler: async (ctx, args) => { - return ctx.db + const direct = await ctx.db .query('messageMetadata') .withIndex('by_messageId', (q) => q.eq('messageId', args.messageId)) .first(); + if (direct) return direct; + + // In error scenarios, the metadata is saved with the failed message's + // ID which differs from the UIMessage id (first message in group). + // Fall back to the most recent metadata entry for this thread. + const { threadId } = args; + if (threadId) { + return ctx.db + .query('messageMetadata') + .withIndex('by_threadId', (q) => q.eq('threadId', threadId)) + .order('desc') + .first(); + } + + return null; }, }); diff --git a/services/platform/convex/workflow_engine/action_defs/rag/rag_action.ts b/services/platform/convex/workflow_engine/action_defs/rag/rag_action.ts index 26c4d57929..8defe1fdc5 100644 --- a/services/platform/convex/workflow_engine/action_defs/rag/rag_action.ts +++ b/services/platform/convex/workflow_engine/action_defs/rag/rag_action.ts @@ -2,14 +2,14 @@ import { v } from 'convex/values'; import { fetchJson } from '../../../../lib/utils/type-cast-helpers'; import type { SearchResponse } from '../../../agent_tools/rag/format_search_results'; +import { fetchDocumentChunks } from '../../../agent_tools/rag/helpers/fetch_document_chunks'; import type { ActionDefinition } from '../../helpers/nodes/action/types'; import { deleteDocumentById } from './helpers/delete_document'; import { getRagConfig } from './helpers/get_rag_config'; -import type { RagActionParams, RagChunkResult } from './helpers/types'; +import type { RagActionParams } from './helpers/types'; import { uploadDocument } from './helpers/upload_document'; const SEARCH_TIMEOUT_MS = 30_000; -const MAX_CHUNK_WINDOW = 200; export const ragAction: ActionDefinition = { type: 'rag', @@ -129,59 +129,6 @@ export const ragAction: ActionDefinition = { }, }; -interface DocumentContentResponse { - file_id: string; - title: string | null; - content: string; - chunk_range: { start: number; end: number }; - total_chunks: number; - total_chars: number; - chunks: Array<{ index: number; content: string }> | null; -} - -async function fetchDocumentChunks( - serviceUrl: string, - fileId: string, -): Promise { - const allChunks: Array<{ index: number; content: string }> = []; - let totalChunks = 0; - let documentId = ''; - let title: string | null = null; - let chunkStart = 1; - - // Paginate through all chunks in MAX_CHUNK_WINDOW batches - while (true) { - const chunkEnd = chunkStart + MAX_CHUNK_WINDOW - 1; - const url = `${serviceUrl}/api/v1/documents/${encodeURIComponent(fileId)}/content?return_chunks=true&chunk_start=${chunkStart}&chunk_end=${chunkEnd}`; - - const response = await fetch(url); - - if (!response.ok) { - const errorText = await response.text().catch(() => ''); - throw new Error( - `RAG get_chunks error (${response.status}): ${errorText || 'Unknown error'}`, - ); - } - - const result = await fetchJson(response); - documentId = result.file_id; - title = result.title; - totalChunks = result.total_chunks; - - if (result.chunks) { - allChunks.push(...result.chunks); - } - - if (result.chunk_range.end >= totalChunks) { - break; - } - - chunkStart = result.chunk_range.end + 1; - } - - return { documentId, title, chunks: allChunks, totalChunks }; -} - /** * Backward compatibility: map old param names (recordId, documentIds) * to new names (fileId, fileIds) for user-created workflows stored in DB. diff --git a/services/platform/lib/shared/file-types.ts b/services/platform/lib/shared/file-types.ts index 60dd54c95b..714a925d3f 100644 --- a/services/platform/lib/shared/file-types.ts +++ b/services/platform/lib/shared/file-types.ts @@ -76,11 +76,6 @@ export function isImage(mimeType: string): boolean { return mimeType.startsWith('image/'); } -export function isTextFile(mimeType: string, fileName?: string): boolean { - if (!fileName) return mimeType.startsWith('text/plain'); - return isTextBasedFile(fileName, mimeType); -} - export function isSpreadsheet(fileName: string): boolean { const lower = fileName.toLowerCase(); return ( @@ -285,14 +280,14 @@ export const SPREADSHEET_IMPORT_ACCEPT = '.xlsx,.xls,.csv'; // Size limits // --------------------------------------------------------------------------- -/** Chat attachment max (10 MB) */ -export const CHAT_MAX_FILE_SIZE = 10 * 1024 * 1024; +/** Chat attachment max (100 MB) */ +export const CHAT_MAX_FILE_SIZE = 100 * 1024 * 1024; /** Max attachments per chat message */ export const CHAT_MAX_FILE_COUNT = 10; -/** Max total attachment size per chat message (25 MB) */ -export const CHAT_MAX_TOTAL_SIZE = 25 * 1024 * 1024; +/** Max total attachment size per chat message (200 MB) */ +export const CHAT_MAX_TOTAL_SIZE = 200 * 1024 * 1024; /** Document upload max (100 MB) */ export const DOCUMENT_MAX_FILE_SIZE = 100 * 1024 * 1024; @@ -437,24 +432,6 @@ export function hasFileTools(toolNames: readonly string[]): boolean { } // --------------------------------------------------------------------------- -// Parse endpoint routing -// --------------------------------------------------------------------------- - -const PARSE_ENDPOINTS: Record = { - pdf: '/api/v1/pdf/parse', - docx: '/api/v1/docx/parse', - pptx: '/api/v1/pptx/parse', -}; - -/** - * Get the crawler service parse endpoint for a given filename. - * Falls back to PDF parser for unknown extensions. - */ -export function getParseEndpoint(filename: string): string { - const ext = extractExtension(filename); - return (ext && PARSE_ENDPOINTS[ext]) || PARSE_ENDPOINTS.pdf; -} - // --------------------------------------------------------------------------- // MIME → display label key (for i18n) // --------------------------------------------------------------------------- diff --git a/services/platform/messages/de-CH.json b/services/platform/messages/de-CH.json index 44d09e7579..5a1706da86 100644 --- a/services/platform/messages/de-CH.json +++ b/services/platform/messages/de-CH.json @@ -60,7 +60,7 @@ }, "assistant": { "upload": { - "invalidFilesDescription": "Einige Dateien sind zu gross (>10 MB) oder werden nicht unterstützt. Unterstützt: Bilder, PDF, Word-Dokumente, Textdateien." + "invalidFilesDescription": "Einige Dateien sind zu gross (>100 MB) oder werden nicht unterstützt. Unterstützt: Bilder, PDF, Word-Dokumente, Textdateien." } } }, @@ -76,7 +76,7 @@ } }, "chat": { - "filesNotSupported": "Einige Dateien sind zu gross (>10 MB) oder werden nicht unterstützt. Unterstützt: Bilder, PDF, Word-Dokumente, Textdateien." + "filesNotSupported": "Einige Dateien sind zu gross (>100 MB) oder werden nicht unterstützt. Unterstützt: Bilder, PDF, Word-Dokumente, Textdateien." }, "documents": { "preview": { diff --git a/services/platform/messages/de.json b/services/platform/messages/de.json index 40a198199a..1e917ed42e 100644 --- a/services/platform/messages/de.json +++ b/services/platform/messages/de.json @@ -2113,7 +2113,7 @@ "upload": { "uploading": "Wird hochgeladen...", "invalidFiles": "Ungültige Dateien", - "invalidFilesDescription": "Einige Dateien sind zu groß (>10 MB) oder werden nicht unterstützt. Unterstützt: Bilder, PDF, Word-Dokumente, Textdateien.", + "invalidFilesDescription": "Einige Dateien sind zu groß (>100 MB) oder werden nicht unterstützt. Unterstützt: Bilder, PDF, Word-Dokumente, Textdateien.", "success": "Datei hochgeladen", "successDescription": "{fileName} erfolgreich hochgeladen", "failed": "Upload fehlgeschlagen", @@ -2626,13 +2626,15 @@ "placeholder": "Frag zu Kunden, Produkten oder Dokumenten...", "searchConversations": "Konversationen durchsuchen", "invalidFiles": "Ungültige Dateien", - "filesNotSupported": "Einige Dateien sind zu groß (>10 MB) oder werden nicht unterstützt. Unterstützt: Bilder, PDF, Word-Dokumente, Textdateien.", + "filesNotSupported": "Einige Dateien sind zu groß (>100 MB) oder werden nicht unterstützt. Unterstützt: Bilder, PDF, Word-Dokumente, Textdateien.", "fileUploaded": "Datei hochgeladen", "uploadedSuccessfully": "{filename} erfolgreich hochgeladen", "uploadFailed": "Upload fehlgeschlagen", "failedToUpload": "{filename} konnte nicht hochgeladen werden", "removeAttachment": "Anhang entfernen", "uploadingFile": "Datei wird hochgeladen", + "indexing": "Indizierung...", + "indexingFailed": "Indizierung fehlgeschlagen", "viewImage": "Bild anzeigen", "dropFilesToAdd": "Dateien hier ablegen, um sie zum Chat hinzuzufügen", "fileTypes": { diff --git a/services/platform/messages/en.json b/services/platform/messages/en.json index e4dd069df8..7cecfc247f 100644 --- a/services/platform/messages/en.json +++ b/services/platform/messages/en.json @@ -2117,7 +2117,7 @@ "upload": { "uploading": "Uploading...", "invalidFiles": "Invalid files", - "invalidFilesDescription": "Some files are too large (>10MB) or not supported. Supported: images, PDF, Word docs, text files.", + "invalidFilesDescription": "Some files are too large (>100MB) or not supported. Supported: images, PDF, Word docs, text files.", "success": "File uploaded", "successDescription": "{fileName} uploaded successfully", "failed": "Upload failed", @@ -2635,7 +2635,7 @@ "placeholder": "Ask about customers, products, or documents…", "searchConversations": "Search conversations", "invalidFiles": "Invalid files", - "filesNotSupported": "Some files are too large (>10MB) or not supported. Supported: images, PDF, Word docs, text files.", + "filesNotSupported": "Some files are too large (>100MB) or not supported. Supported: images, PDF, Word docs, text files.", "fileUploaded": "File uploaded", "uploadedSuccessfully": "{filename} uploaded successfully", "uploadFailed": "Upload failed", @@ -2648,6 +2648,8 @@ "duplicateFileDescription": "{filename} is already attached.", "removeAttachment": "Remove attachment", "uploadingFile": "Uploading file", + "indexing": "Indexing...", + "indexingFailed": "Index failed", "viewImage": "View image", "dropFilesToAdd": "Drop files here to add to chat", "fileTypes": { @@ -2737,7 +2739,9 @@ "viewDocument": "View in document", "visitPage": "Visit page", "showAllSources": "Show all {count} sources", - "hideSources": "Hide sources" + "hideSources": "Hide sources", + "chunkCount": "{count, plural, one {# chunk} other {# chunks}}", + "noContent": "Content not available" }, "feedback": { "helpful": "Helpful", diff --git a/services/rag/app/models.py b/services/rag/app/models.py index 9d1cc7b45f..8f504e09b8 100644 --- a/services/rag/app/models.py +++ b/services/rag/app/models.py @@ -134,6 +134,10 @@ class DocumentStatusInfo(BaseModel): status: str = Field(..., description="Document status: processing, completed, or failed") error: str | None = Field(default=None, description="Error message when status is failed") + progress_phase: str | None = Field( + default=None, description="Current processing phase: extracting, embedding, storing" + ) + progress_detail: str | None = Field(default=None, description="Phase detail, e.g. '12/50' for page progress") source_created_at: dt.datetime | None = Field( default=None, description="Original file creation date (from file metadata)" ) diff --git a/services/rag/app/routers/documents.py b/services/rag/app/routers/documents.py index eab4bc843c..0c40d54eb5 100644 --- a/services/rag/app/routers/documents.py +++ b/services/rag/app/routers/documents.py @@ -555,6 +555,8 @@ async def get_document_statuses(request: DocumentStatusRequest): did: DocumentStatusInfo( status=info["status"], error=info.get("error"), + progress_phase=info.get("progress_phase"), + progress_detail=info.get("progress_detail"), source_created_at=info.get("source_created_at"), source_modified_at=info.get("source_modified_at"), ) diff --git a/services/rag/app/services/indexing_service.py b/services/rag/app/services/indexing_service.py index 83963dedf0..2c8d9d063b 100644 --- a/services/rag/app/services/indexing_service.py +++ b/services/rag/app/services/indexing_service.py @@ -8,6 +8,7 @@ import datetime as dt import re +import time import uuid from dataclasses import dataclass, replace from io import BytesIO @@ -152,6 +153,53 @@ def _extract_file_dates( return (None, None) +async def _update_progress( + pool: asyncpg.Pool, + file_id: str, + phase: str, + detail: str, +) -> None: + """Write progress info to the documents table for status polling.""" + try: + async with acquire_with_retry(pool) as conn: + await conn.execute( + f"""UPDATE {SCHEMA}.documents + SET progress_phase = $2, progress_detail = $3, updated_at = NOW() + WHERE file_id = $1 AND status = 'processing'""", + file_id, + phase, + detail, + ) + except Exception: + logger.debug("Failed to update progress for {}", file_id) + + +def _make_extraction_progress_callback( + pool: asyncpg.Pool, + file_id: str, + loop: Any, + *, + min_interval: float = 3.0, +) -> Any: + """Create a throttled progress callback for page extraction. + + Writes to DB at most once per ``min_interval`` seconds to avoid + overwhelming the database during concurrent page processing. + """ + last_flush = 0.0 + + def on_progress(pages_done: int, total_pages: int) -> None: + nonlocal last_flush + now = time.monotonic() + if now - last_flush < min_interval and pages_done < total_pages: + return + last_flush = now + detail = f"{pages_done}/{total_pages}" + loop.create_task(_update_progress(pool, file_id, "extracting", detail)) + + return on_progress + + async def prepare_document( content_bytes: bytes, filename: str, @@ -160,6 +208,7 @@ async def prepare_document( vision_client: VisionClient | None = None, chunk_size: int = 2048, chunk_overlap: int = 200, + on_progress: Any = None, ) -> PreparedDocument | None: """Extract, chunk, and embed a document (expensive work done once). @@ -172,6 +221,7 @@ async def prepare_document( content_bytes, filename, vision_client=vision_client, + on_progress=on_progress, ) except UnicodeDecodeError: raise ValueError( @@ -509,6 +559,13 @@ async def index_document( return result logger.warning("Clone source {} vanished, falling back to full processing", source_id) + import asyncio as _aio + + loop = _aio.get_running_loop() + extraction_cb = _make_extraction_progress_callback(pool, file_id, loop) + + await _update_progress(pool, file_id, "extracting", "") + prepared = await prepare_document( content_bytes, filename, @@ -516,6 +573,7 @@ async def index_document( vision_client=vision_client, chunk_size=chunk_size, chunk_overlap=chunk_overlap, + on_progress=extraction_cb, ) if prepared is None: @@ -527,6 +585,13 @@ async def index_document( "skip_reason": "no_text_extracted", } + await _update_progress( + pool, + file_id, + "embedding", + f"{len(prepared.chunks)} chunks", + ) + if source_created_at is not None or source_modified_at is not None: prepared = replace( prepared, @@ -534,6 +599,8 @@ async def index_document( source_modified_at=source_modified_at or prepared.source_modified_at, ) + await _update_progress(pool, file_id, "storing", "") + return await store_prepared_document( pool, file_id, diff --git a/services/rag/app/services/rag_service.py b/services/rag/app/services/rag_service.py index d2aa444689..20c445150f 100644 --- a/services/rag/app/services/rag_service.py +++ b/services/rag/app/services/rag_service.py @@ -303,6 +303,22 @@ async def search( if threshold > 0: results = [r for r in results if r.get("score", 0) >= threshold] + # If no results and some files are still indexing, wait and retry once + if not results and file_ids: + statuses = await self.get_document_statuses(file_ids) + has_processing = any(s is not None and s.get("status") == "processing" for s in statuses.values()) + if has_processing: + logger.info("No results and some files still indexing, retrying in 3s") + await asyncio.sleep(3) + results = await self._search_service.search( + query, + file_ids=file_ids, + top_k=effective_top_k, + ) + self.last_search_usage = getattr(self._search_service, "last_search_usage", None) + if threshold > 0: + results = [r for r in results if r.get("score", 0) >= threshold] + return results async def generate( @@ -509,7 +525,8 @@ async def get_document_statuses( rows = await conn.fetch( f""" SELECT DISTINCT ON (file_id) - file_id, status, error, source_created_at, source_modified_at + file_id, status, error, progress_phase, progress_detail, + source_created_at, source_modified_at FROM {SCHEMA}.documents WHERE file_id = ANY($1) ORDER BY file_id, @@ -528,6 +545,8 @@ async def get_document_statuses( row["file_id"]: { "status": row["status"], "error": row["error"], + "progress_phase": row["progress_phase"], + "progress_detail": row["progress_detail"], "source_created_at": row["source_created_at"], "source_modified_at": row["source_modified_at"], } diff --git a/services/rag/tests/test_background_ingest.py b/services/rag/tests/test_background_ingest.py index 30c5dc8104..b4f00f8102 100644 --- a/services/rag/tests/test_background_ingest.py +++ b/services/rag/tests/test_background_ingest.py @@ -70,6 +70,8 @@ async def test_returns_status_for_found_documents(self): "file_id": "doc-1", "status": "completed", "error": None, + "progress_phase": None, + "progress_detail": None, "source_created_at": None, "source_modified_at": None, }, @@ -77,6 +79,8 @@ async def test_returns_status_for_found_documents(self): "file_id": "doc-2", "status": "processing", "error": None, + "progress_phase": None, + "progress_detail": None, "source_created_at": None, "source_modified_at": None, }, @@ -103,6 +107,8 @@ async def test_returns_error_field_for_failed_documents(self): "file_id": "doc-1", "status": "failed", "error": "Embedding failed", + "progress_phase": None, + "progress_detail": None, "source_created_at": None, "source_modified_at": None, }, diff --git a/services/rag/tests/test_indexing_service.py b/services/rag/tests/test_indexing_service.py index 71ae6e3cfa..08fec12ca8 100644 --- a/services/rag/tests/test_indexing_service.py +++ b/services/rag/tests/test_indexing_service.py @@ -211,11 +211,10 @@ async def test_passes_vision_client_to_extract(self): vision_client=mock_vision, ) - mock_extract.assert_awaited_once_with( - SAMPLE_CONTENT, - SAMPLE_FILENAME, - vision_client=mock_vision, - ) + call_kwargs = mock_extract.call_args + assert call_kwargs.args == (SAMPLE_CONTENT, SAMPLE_FILENAME) + assert call_kwargs.kwargs["vision_client"] is mock_vision + assert "on_progress" in call_kwargs.kwargs async def test_custom_chunk_size_and_overlap(self): from app.services.indexing_service import index_document diff --git a/services/rag/tests/test_rag_service.py b/services/rag/tests/test_rag_service.py index 4f70ed7492..655bf72e56 100644 --- a/services/rag/tests/test_rag_service.py +++ b/services/rag/tests/test_rag_service.py @@ -231,6 +231,7 @@ async def test_zero_threshold_returns_all(self): async def test_passes_file_ids(self): service = _make_service() service._search_service.search = AsyncMock(return_value=[]) + service.get_document_statuses = AsyncMock(return_value={"doc-1": None, "doc-2": None}) with patch("app.services.rag_service.settings") as mock_settings: mock_settings.top_k = 10