Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions services/crawler/app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
image_router,
pdf_router,
pptx_router,
web_router,
)
from app.services.crawler_service import get_crawler_service
from app.services.image_service import get_image_service
Expand Down Expand Up @@ -93,6 +94,7 @@ async def lifespan(app: FastAPI) -> AsyncGenerator:
app.include_router(image_router)
app.include_router(docx_router)
app.include_router(pptx_router)
app.include_router(web_router)


@app.get("/health", response_model=HealthResponse)
Expand Down
24 changes: 24 additions & 0 deletions services/crawler/app/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,3 +252,27 @@ class ParseFileResponse(BaseModel):
metadata: dict[str, Any] | None = Field(None, description="Document metadata")
vision_used: bool | None = Field(None, description="Whether Vision API was used")
error: str | None = Field(None, description="Error message if parsing failed")


# ==================== Web Fetch & Extract Models ====================


class WebFetchExtractRequest(BaseModel):
"""Request to fetch URL and extract content."""

url: HttpUrl = Field(..., description="URL to fetch and extract content from")
instruction: str | None = Field(None, description="Optional AI instruction for content extraction")
timeout: int = Field(60000, description="Navigation timeout in ms (default: 60s)", ge=5000, le=120000)


class WebFetchExtractResponse(BaseModel):
"""Response from web fetch and extract operation."""

success: bool = Field(..., description="Whether the operation was successful")
url: str = Field(..., description="The fetched URL")
title: str | None = Field(None, description="Page title")
content: str = Field(..., description="Extracted text content")
word_count: int = Field(..., description="Number of words in content")
page_count: int = Field(..., description="Number of pages in PDF")
vision_used: bool = Field(False, description="Whether Vision API was used for extraction")
error: str | None = Field(None, description="Error message if operation failed")
3 changes: 3 additions & 0 deletions services/crawler/app/routers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,21 @@
- image: Image conversion (/api/v1/images)
- docx: DOCX document generation and parsing (/api/v1/docx)
- pptx: PPTX template generation and parsing (/api/v1/pptx)
- web: Web fetch and extract (/api/v1/web)
"""

from app.routers.crawler import router as crawler_router
from app.routers.docx import router as docx_router
from app.routers.image import router as image_router
from app.routers.pdf import router as pdf_router
from app.routers.pptx import router as pptx_router
from app.routers.web import router as web_router

__all__ = [
"crawler_router",
"docx_router",
"image_router",
"pdf_router",
"pptx_router",
"web_router",
]
147 changes: 147 additions & 0 deletions services/crawler/app/routers/web.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
"""
Web Router - URL content extraction endpoint.

Combines URL-to-PDF conversion with Vision-based text extraction.
"""

import socket
from ipaddress import ip_address
from urllib.parse import urlparse

from fastapi import APIRouter, HTTPException, status
from loguru import logger

from app.models import WebFetchExtractRequest, WebFetchExtractResponse
from app.services.file_parser_service import get_file_parser_service
from app.services.pdf_service import get_pdf_service

router = APIRouter(prefix="/api/v1/web", tags=["Web"])


def validate_url_not_private(url_str: str) -> str:
"""
Validate that a URL does not resolve to a private/internal IP address.

Prevents SSRF attacks by blocking requests to loopback, link-local,
and private RFC1918/IPv6 addresses.

Args:
url_str: The URL to validate

Returns:
The hostname if validation passes

Raises:
HTTPException: If the URL host cannot be resolved or resolves to a private IP
"""
parsed_url = urlparse(url_str)
hostname = parsed_url.hostname or ""

if not hostname:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="Invalid URL: no hostname found",
)

try:
resolved_ips = {
ip_address(info[4][0])
for info in socket.getaddrinfo(hostname, None)
}
except socket.gaierror:
logger.warning(f"SSRF protection: unable to resolve hostname '{hostname}'")
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="Unable to resolve URL host",
)

blocked_ips = [
ip for ip in resolved_ips
if ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved
]

if blocked_ips:
logger.warning(
f"SSRF protection: blocked request to '{hostname}' "
f"(resolved to private/internal IPs: {blocked_ips})"
)
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="URL host is not allowed (resolves to private/internal address)",
)

return hostname


@router.post("/fetch-and-extract", response_model=WebFetchExtractResponse)
async def fetch_and_extract(request: WebFetchExtractRequest):
"""
Fetch a URL, convert to PDF, and extract text content.

Pipeline:
1. Navigate to URL with Playwright and render as PDF
2. Extract text using PyMuPDF + Vision API (handles images, OCR for scanned content)

Args:
request: URL and optional extraction instruction

Returns:
Extracted content with metadata
"""
url_str = str(request.url)
hostname = validate_url_not_private(url_str)

try:
pdf_service = get_pdf_service()
if not pdf_service.initialized:
await pdf_service.initialize()

logger.info(f"Fetching URL as PDF: {url_str}")
pdf_bytes = await pdf_service.url_to_pdf(
url=url_str,
timeout=request.timeout,
)

logger.info(f"Extracting content from PDF ({len(pdf_bytes)} bytes)")
parser = get_file_parser_service()
result = await parser.parse_pdf_with_vision(
pdf_bytes,
filename=f"{hostname}.pdf",
user_input=request.instruction,
process_images=True,
ocr_scanned_pages=True,
)
Comment thread
coderabbitai[bot] marked this conversation as resolved.

if not result.get("success"):
return WebFetchExtractResponse(
success=False,
url=url_str,
content="",
word_count=0,
page_count=0,
error=result.get("error", "Failed to extract content from PDF"),
)

full_text = result.get("full_text", "")
word_count = len(full_text.split()) if full_text else 0

logger.info(
f"Content extracted: {word_count} words, {result.get('page_count', 0)} pages, "
f"vision_used={result.get('vision_used', False)}"
)

return WebFetchExtractResponse(
success=True,
url=url_str,
content=full_text,
word_count=word_count,
page_count=result.get("page_count", 0),
vision_used=result.get("vision_used", False),
)

except Exception:
logger.exception(f"Error fetching and extracting URL: {url_str}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Failed to fetch and extract content from URL: {url_str}",
) from None
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ export function ChatInterface({
streamingMessage,
pendingToolResponse,
hasActiveTools,
isProcessingToolResult,
} = useMessageProcessing(threadId);

// Merge with pending messages from context for optimistic UI
Expand Down Expand Up @@ -221,6 +222,7 @@ export function ChatInterface({
streamingMessage={streamingMessage}
pendingToolResponse={pendingToolResponse}
hasActiveTools={hasActiveTools}
isProcessingToolResult={isProcessingToolResult}
aiResponseAreaRef={aiResponseAreaRef}
onHumanInputResponseSubmitted={handleHumanInputResponseSubmitted}
/>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ interface ChatMessagesProps {
streamingMessage: UIMessage | undefined;
pendingToolResponse: UIMessage | undefined;
hasActiveTools: boolean;
isProcessingToolResult: boolean;
aiResponseAreaRef: RefObject<HTMLDivElement | null>;
onHumanInputResponseSubmitted?: () => void;
}
Expand All @@ -41,6 +42,7 @@ export function ChatMessages({
streamingMessage,
pendingToolResponse,
hasActiveTools,
isProcessingToolResult,
aiResponseAreaRef,
onHumanInputResponseSubmitted,
}: ChatMessagesProps) {
Expand Down Expand Up @@ -173,6 +175,7 @@ export function ChatMessages({
(streamingMessage?.status === 'streaming' &&
!streamingMessage.text) ||
hasActiveTools ||
isProcessingToolResult ||
!!pendingToolResponse) && (
<ThinkingAnimation streamingMessage={streamingMessage} />
)}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,21 +34,19 @@ export function ThinkingAnimation({ streamingMessage }: ThinkingAnimationProps)
toolName: string,
input?: Record<string, unknown>,
): ToolDetail => {
if (toolName === 'web_read' && input) {
if (input.operation === 'search' && input.query) {
if (toolName === 'web' && input) {
if (input.operation === 'fetch_url' && input.url) {
return {
toolName,
displayText: t('thinking.searching', {
query: truncate(String(input.query), 30),
displayText: t('thinking.reading', {
hostname: extractHostname(String(input.url)),
}),
};
}
if (input.operation === 'fetch_url' && input.url) {
if (input.operation === 'browser_operate' && input.instruction) {
return {
toolName,
displayText: t('thinking.reading', {
hostname: extractHostname(String(input.url)),
}),
displayText: t('thinking.browsing'),
};
}
}
Expand All @@ -66,7 +64,7 @@ export function ThinkingAnimation({ streamingMessage }: ThinkingAnimationProps)
customer_read: t('tools.customerRead'),
product_read: t('tools.productRead'),
rag_search: t('tools.ragSearch'),
web_read: t('tools.webRead'),
web: t('tools.web'),
pdf: t('tools.pdf'),
image: t('tools.image'),
pptx: t('tools.pptx'),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ interface UseMessageProcessingResult {
streamingMessage: UIMessage | undefined;
pendingToolResponse: UIMessage | undefined;
hasActiveTools: boolean;
isProcessingToolResult: boolean;
}

/**
Expand Down Expand Up @@ -151,6 +152,39 @@ export function useMessageProcessing(
);
}, [streamingMessage?.parts]);

// Check if agent is processing tool result (tool completed but no text after it)
// This handles the gap when sub-agent tools complete but agent hasn't resumed streaming
const isProcessingToolResult = useMemo(() => {
if (!uiMessages?.length) return false;

const lastAssistant = uiMessages
.filter((m) => m.role === 'assistant')
.at(-1);
if (!lastAssistant?.parts?.length) return false;
if (lastAssistant.status === 'success' || lastAssistant.status === 'failed')
return false;

const lastToolIndex = lastAssistant.parts.findLastIndex(
(part: { type: string }) =>
part.type.startsWith('tool-') && part.type !== 'tool-result',
);
if (lastToolIndex === -1) return false;

const lastToolPart = lastAssistant.parts[lastToolIndex] as {
type: string;
state?: string;
};
if (lastToolPart?.state !== 'output-available') return false;

const partsAfterTool = lastAssistant.parts.slice(lastToolIndex + 1);
const hasTextAfterTool = partsAfterTool.some(
(part: { type: string; text?: string }) =>
part.type === 'text' && part.text,
);

return !hasTextAfterTool;
}, [uiMessages]);

return {
messages,
uiMessages,
Expand All @@ -160,5 +194,6 @@ export function useMessageProcessing(
streamingMessage,
pendingToolResponse,
hasActiveTools,
isProcessingToolResult,
};
}
Loading